1 #include "occ_status.hpp" 2 3 #include "occ_manager.hpp" 4 #include "occ_sensor.hpp" 5 #include "powermode.hpp" 6 #include "utils.hpp" 7 8 #include <phosphor-logging/lg2.hpp> 9 10 #include <filesystem> 11 12 namespace open_power 13 { 14 namespace occ 15 { 16 17 using namespace phosphor::logging; 18 19 using ThrottleObj = 20 sdbusplus::xyz::openbmc_project::Control::Power::server::Throttle; 21 22 // Handles updates to occActive property 23 bool Status::occActive(bool value) 24 { 25 if (value != this->occActive()) 26 { 27 lg2::info("Status::occActive OCC{INST} changed to {STATE}", "INST", 28 instance, "STATE", value); 29 if (value) 30 { 31 // Clear prior throttle reason (before setting device active) 32 updateThrottle(false, THROTTLED_ALL); 33 34 // Set the device active 35 device.setActive(true); 36 37 // Reset last OCC state 38 lastState = 0; 39 40 // Start watching for errors (throttles, etc) 41 try 42 { 43 addErrorWatch(); 44 } 45 catch (const OpenFailure& e) 46 { 47 // Failed to add watch for throttle events, request reset to try 48 // to recover comm 49 lg2::error( 50 "Status::occActive: Unable to add error watch(s) for OCC{INST} watch: {ERROR}", 51 "INST", instance, "ERROR", e.what()); 52 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH)); 53 return Base::Status::occActive(false); 54 } 55 56 // Update the OCC active sensor 57 Base::Status::occActive(value); 58 59 if (device.master()) 60 { 61 // Update powercap bounds from OCC 62 manager.updatePcapBounds(); 63 } 64 65 // Call into Manager to let know that we have bound 66 if (this->managerCallBack) 67 { 68 this->managerCallBack(instance, value); 69 } 70 } 71 else 72 { 73 #ifdef POWER10 74 if (pmode && device.master()) 75 { 76 // Prevent mode changes 77 pmode->setMasterActive(false); 78 } 79 if (safeStateDelayTimer.isEnabled()) 80 { 81 // stop safe delay timer 82 safeStateDelayTimer.setEnabled(false); 83 } 84 #endif 85 // Call into Manager to let know that we will unbind. 86 if (this->managerCallBack) 87 { 88 this->managerCallBack(instance, value); 89 } 90 91 // Stop watching for errors 92 removeErrorWatch(); 93 94 // Set the device inactive 95 device.setActive(false); 96 97 // Clear throttles (OCC not active after disabling device) 98 updateThrottle(false, THROTTLED_ALL); 99 } 100 } 101 else if (value && !device.active()) 102 { 103 // Existing error watch is on a dead file descriptor. 104 removeErrorWatch(); 105 106 /* 107 * In it's constructor, Status checks Device::bound() to see if OCC is 108 * active or not. 109 * Device::bound() checks for occX-dev0 directory. 110 * We will lose occX-dev0 directories during FSI rescan. 111 * So, if we start this application (and construct Status), and then 112 * later do FSI rescan, we will end up with occActive = true and device 113 * NOT bound. Lets correct that situation here. 114 */ 115 device.setActive(true); 116 117 // Add error watch again 118 try 119 { 120 addErrorWatch(); 121 } 122 catch (const OpenFailure& e) 123 { 124 // Failed to add watch for throttle events, request reset to try to 125 // recover comm 126 lg2::error( 127 "Status::occActive: Unable to add error watch(s) again for OCC{INST} watch: {ERROR}", 128 "INST", instance, "ERROR", e.what()); 129 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH)); 130 return Base::Status::occActive(false); 131 } 132 } 133 else if (!value && device.active()) 134 { 135 removeErrorWatch(); 136 137 // In the event that the application never receives the active signal 138 // even though the OCC is active (this can occur if the BMC is rebooted 139 // with the host on, since the initial OCC driver probe will discover 140 // the OCCs), this application needs to be able to unbind the device 141 // when we get the OCC inactive signal. 142 device.setActive(false); 143 } 144 return Base::Status::occActive(value); 145 } 146 147 // Callback handler when a device error is reported. 148 void Status::deviceError(Error::Descriptor d) 149 { 150 #ifdef POWER10 151 if (pmode && device.master()) 152 { 153 // Prevent mode changes 154 pmode->setMasterActive(false); 155 } 156 #endif 157 158 if (d.log) 159 { 160 FFDC::createOCCResetPEL(instance, d.path, d.err, d.callout); 161 } 162 163 // This would deem OCC inactive 164 this->occActive(false); 165 166 // Reset the OCC 167 this->resetOCC(); 168 } 169 170 // Sends message to host control command handler to reset OCC 171 void Status::resetOCC() 172 { 173 lg2::info(">>Status::resetOCC() - requesting reset for OCC{INST}", "INST", 174 instance); 175 this->occActive(false); 176 #ifdef PLDM 177 if (resetCallBack) 178 { 179 this->resetCallBack(instance); 180 } 181 #else 182 constexpr auto CONTROL_HOST_PATH = "/org/open_power/control/host0"; 183 constexpr auto CONTROL_HOST_INTF = "org.open_power.Control.Host"; 184 185 // This will throw exception on failure 186 auto service = utils::getService(CONTROL_HOST_PATH, CONTROL_HOST_INTF); 187 188 auto& bus = utils::getBus(); 189 auto method = bus.new_method_call(service.c_str(), CONTROL_HOST_PATH, 190 CONTROL_HOST_INTF, "Execute"); 191 // OCC Reset control command 192 method.append(convertForMessage(Control::Host::Command::OCCReset).c_str()); 193 194 // OCC Sensor ID for callout reasons 195 method.append(std::variant<uint8_t>(std::get<0>(sensorMap.at(instance)))); 196 bus.call_noreply(method); 197 return; 198 #endif 199 } 200 201 // Handler called by Host control command handler to convey the 202 // status of the executed command 203 void Status::hostControlEvent(sdbusplus::message_t& msg) 204 { 205 std::string cmdCompleted{}; 206 std::string cmdStatus{}; 207 208 msg.read(cmdCompleted, cmdStatus); 209 210 lg2::debug("Host control signal values: command={CMD}, status={STATUS}", 211 "CMD", cmdCompleted, "STATUS", cmdStatus); 212 213 if (Control::Host::convertResultFromString(cmdStatus) != 214 Control::Host::Result::Success) 215 { 216 if (Control::Host::convertCommandFromString(cmdCompleted) == 217 Control::Host::Command::OCCReset) 218 { 219 // Must be a Timeout. Log an Error trace 220 lg2::error( 221 "Error resetting the OCC: path={PATH}, sensorid={SENSOR}", 222 "PATH", path, "SENSOR", std::get<0>(sensorMap.at(instance))); 223 } 224 } 225 return; 226 } 227 228 // Called from Manager::pollerTimerExpired() in preperation to POLL OCC. 229 void Status::readOccState() 230 { 231 if (stateValid) 232 { 233 // Reset retry count (since state is good) 234 currentOccReadRetriesCount = occReadRetries; 235 } 236 occReadStateNow(); 237 } 238 239 #ifdef POWER10 240 // Special processing that needs to happen once the OCCs change to ACTIVE state 241 void Status::occsWentActive() 242 { 243 CmdStatus status = CmdStatus::SUCCESS; 244 245 // IPS data will get sent automatically after a mode change if the mode 246 // supports it. 247 pmode->needToSendIPS(); 248 249 status = pmode->sendModeChange(); 250 if (status != CmdStatus::SUCCESS) 251 { 252 lg2::error( 253 "Status::occsWentActive: OCC mode change failed with status {STATUS}", 254 "STATUS", status); 255 256 // Disable and reset to try recovering 257 deviceError(); 258 } 259 } 260 261 // Send Ambient and Altitude to the OCC 262 CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude) 263 { 264 CmdStatus status = CmdStatus::FAILURE; 265 bool ambientValid = true; 266 uint8_t ambientTemp = inTemp; 267 uint16_t altitude = inAltitude; 268 269 if (ambientTemp == 0xFF) 270 { 271 // Get latest readings from manager 272 manager.getAmbientData(ambientValid, ambientTemp, altitude); 273 lg2::debug( 274 "sendAmbient: valid: {VALID}, Ambient: {TEMP}C, altitude: {ALT}m", 275 "VALID", ambientValid, "TEMP", ambientTemp, "ALT", altitude); 276 } 277 278 std::vector<std::uint8_t> cmd, rsp; 279 cmd.reserve(11); 280 cmd.push_back(uint8_t(CmdType::SEND_AMBIENT)); 281 cmd.push_back(0x00); // Data Length (2 bytes) 282 cmd.push_back(0x08); // 283 cmd.push_back(0x00); // Version 284 cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status 285 cmd.push_back(ambientTemp); // Ambient Temperature 286 cmd.push_back(altitude >> 8); // Altitude in meters (2 bytes) 287 cmd.push_back(altitude & 0xFF); // 288 cmd.push_back(0x00); // Reserved (3 bytes) 289 cmd.push_back(0x00); 290 cmd.push_back(0x00); 291 lg2::debug("sendAmbient: SEND_AMBIENT " 292 "command to OCC{INST} ({SIZE} bytes)", 293 "INST", instance, "SIZE", cmd.size()); 294 status = occCmd.send(cmd, rsp); 295 if (status == CmdStatus::SUCCESS) 296 { 297 if (rsp.size() == 5) 298 { 299 if (RspStatus::SUCCESS != RspStatus(rsp[2])) 300 { 301 lg2::error( 302 "sendAmbient: SEND_AMBIENT failed with rspStatus {STATUS}", 303 "STATUS", lg2::hex, rsp[2]); 304 dump_hex(rsp); 305 status = CmdStatus::FAILURE; 306 } 307 } 308 else 309 { 310 lg2::error( 311 "sendAmbient: INVALID SEND_AMBIENT response length:{SIZE}", 312 "SIZE", rsp.size()); 313 dump_hex(rsp); 314 status = CmdStatus::FAILURE; 315 } 316 } 317 else 318 { 319 lg2::error("sendAmbient: SEND_AMBIENT FAILED! with status {STATUS}", 320 "STATUS", lg2::hex, uint8_t(status)); 321 322 if (status == CmdStatus::COMM_FAILURE) 323 { 324 // Disable due to OCC comm failure and reset to try recovering 325 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH)); 326 } 327 } 328 329 return status; 330 } 331 332 // Called when safe timer expires to determine if OCCs need to be reset 333 void Status::safeStateDelayExpired() 334 { 335 if (this->occActive()) 336 { 337 lg2::info( 338 "safeStateDelayExpired: OCC{INST} state missing or not valid, requesting reset", 339 "INST", instance); 340 // Disable and reset to try recovering 341 deviceError(Error::Descriptor(SAFE_ERROR_PATH)); 342 } 343 } 344 #endif // POWER10 345 346 fs::path Status::getHwmonPath() 347 { 348 using namespace std::literals::string_literals; 349 350 if (!fs::exists(hwmonPath)) 351 { 352 static bool tracedFail[8] = {0}; 353 354 if (!hwmonPath.empty()) 355 { 356 lg2::warning( 357 "Status::getHwmonPath(): path no longer exists: {PATH}", "PATH", 358 hwmonPath); 359 hwmonPath.clear(); 360 } 361 362 // Build the base HWMON path 363 fs::path prefixPath = 364 fs::path{OCC_HWMON_PATH + "occ-hwmon."s + 365 std::to_string(instance + 1) + "/hwmon/"s}; 366 367 // Get the hwmonXX directory name 368 try 369 { 370 // there should only be one directory 371 const int numDirs = std::distance( 372 fs::directory_iterator(prefixPath), fs::directory_iterator{}); 373 if (numDirs == 1) 374 { 375 hwmonPath = *fs::directory_iterator(prefixPath); 376 tracedFail[instance] = false; 377 } 378 else 379 { 380 if (!tracedFail[instance]) 381 { 382 lg2::error( 383 "Status::getHwmonPath(): Found multiple ({NUM}) hwmon paths!", 384 "NUM", numDirs); 385 tracedFail[instance] = true; 386 } 387 } 388 } 389 catch (const fs::filesystem_error& e) 390 { 391 if (!tracedFail[instance]) 392 { 393 lg2::error( 394 "Status::getHwmonPath(): error accessing {PATH}: {ERROR}", 395 "PATH", prefixPath, "ERROR", e.what()); 396 tracedFail[instance] = true; 397 } 398 } 399 } 400 401 return hwmonPath; 402 } 403 404 // Called to read state and handle any errors 405 void Status::occReadStateNow() 406 { 407 unsigned int state; 408 const fs::path filename = 409 fs::path(DEV_PATH) / 410 fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state"; 411 412 std::ifstream file; 413 bool goodFile = false; 414 415 // open file. 416 file.open(filename, std::ios::in); 417 const int openErrno = errno; 418 419 // File is open and state can be used. 420 if (file.is_open() && file.good()) 421 { 422 goodFile = true; 423 file >> state; 424 // Read the error code (if any) to check status of the read 425 std::ios_base::iostate readState = file.rdstate(); 426 if (readState) 427 { 428 // There was a failure reading the file 429 if (lastOccReadStatus != -1) 430 { 431 // Trace error bits 432 std::string errorBits = ""; 433 if (readState & std::ios_base::eofbit) 434 { 435 errorBits += " EOF"; 436 } 437 if (readState & std::ios_base::failbit) 438 { 439 errorBits += " failbit"; 440 } 441 if (readState & std::ios_base::badbit) 442 { 443 errorBits += " badbit"; 444 } 445 lg2::error( 446 "readOccState: Failed to read OCC{INST} state: Read error on I/O operation - {ERROR}", 447 "INST", instance, "ERROR", errorBits); 448 lastOccReadStatus = -1; 449 } 450 goodFile = false; 451 } 452 453 if (goodFile && (state != lastState)) 454 { 455 // Trace OCC state changes 456 lg2::info( 457 "Status::readOccState: OCC{INST} state {STATE} (lastState: {PRIOR})", 458 "INST", instance, "STATE", lg2::hex, state, "PRIOR", lg2::hex, 459 lastState); 460 lastState = state; 461 #ifdef POWER10 462 if (OccState(state) == OccState::ACTIVE) 463 { 464 if (pmode && device.master()) 465 { 466 // Set the master OCC on the PowerMode object 467 pmode->setMasterOcc(path); 468 // Enable mode changes 469 pmode->setMasterActive(); 470 471 // Special processing by master OCC when it goes active 472 occsWentActive(); 473 } 474 475 CmdStatus status = sendAmbient(); 476 if (status != CmdStatus::SUCCESS) 477 { 478 lg2::error( 479 "readOccState: Sending Ambient failed with status {STATUS}", 480 "STATUS", status); 481 } 482 } 483 484 // If OCC in known Good State. 485 if ((OccState(state) == OccState::ACTIVE) || 486 (OccState(state) == OccState::CHARACTERIZATION) || 487 (OccState(state) == OccState::OBSERVATION)) 488 { 489 // Good OCC State then sensors valid again 490 stateValid = true; 491 492 if (safeStateDelayTimer.isEnabled()) 493 { 494 // stop safe delay timer (no longer in SAFE state) 495 safeStateDelayTimer.setEnabled(false); 496 } 497 } 498 else 499 { 500 // OCC is in SAFE or some other unsupported state 501 if (!safeStateDelayTimer.isEnabled()) 502 { 503 lg2::error( 504 "readOccState: Invalid OCC{INST} state of {STATE}, starting safe state delay timer", 505 "INST", instance, "STATE", state); 506 // start safe delay timer (before requesting reset) 507 using namespace std::literals::chrono_literals; 508 safeStateDelayTimer.restartOnce(60s); 509 } 510 // Not a supported state (update sensors to NaN and not 511 // functional) 512 stateValid = false; 513 } 514 #else 515 // Before P10 state not checked, only used good file open. 516 stateValid = true; 517 #endif 518 } 519 } 520 #ifdef POWER10 521 else 522 { 523 // Unable to read state 524 stateValid = false; 525 } 526 #endif 527 file.close(); 528 529 // if failed to Read a state or not a valid state -> Attempt retry 530 // after 1 Second delay if allowed. 531 if ((!goodFile) || (!stateValid)) 532 { 533 if (!goodFile) 534 { 535 // If not able to read, OCC may be offline 536 if (openErrno != lastOccReadStatus) 537 { 538 lg2::error( 539 "Status::readOccState: open/read failed trying to read OCC{INST} state (open errno={ERROR})", 540 "INST", instance, "ERROR", openErrno); 541 lastOccReadStatus = openErrno; 542 } 543 } 544 else 545 { 546 // else this failed due to state not valid. 547 if (state != lastState) 548 { 549 lg2::error( 550 "Status::readOccState: OCC{INST} Invalid state {STATE} (last state: {PRIOR})", 551 "INST", instance, "STATE", lg2::hex, state, "PRIOR", 552 lg2::hex, lastState); 553 } 554 } 555 556 #ifdef READ_OCC_SENSORS 557 manager.setSensorValueToNaN(instance); 558 #endif 559 560 // See occReadRetries for number of retry attempts. 561 if (currentOccReadRetriesCount > 0) 562 { 563 --currentOccReadRetriesCount; 564 } 565 else 566 { 567 lg2::error("readOccState: failed to read OCC{INST} state!", "INST", 568 instance); 569 570 // State could not be determined, set it to NO State. 571 lastState = 0; 572 573 // Disable the ability to send Failed actions until OCC is 574 // Active again. 575 stateValid = false; 576 577 // Disable due to OCC comm failure and reset to try recovering 578 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH)); 579 580 // Reset retry count (for next attempt after recovery) 581 currentOccReadRetriesCount = occReadRetries; 582 } 583 } 584 else 585 { 586 if (lastOccReadStatus != 0) 587 { 588 lg2::info( 589 "Status::readOccState: successfully read OCC{INST} state: {STATE}", 590 "INST", instance, "STATE", state); 591 lastOccReadStatus = 0; // no error 592 } 593 } 594 } 595 596 // Update processor throttle status on dbus 597 void Status::updateThrottle(const bool isThrottled, const uint8_t newReason) 598 { 599 if (!throttleHandle) 600 { 601 return; 602 } 603 604 uint8_t newThrottleCause = throttleCause; 605 606 if (isThrottled) // throttled due to newReason 607 { 608 if ((newReason & throttleCause) == 0) 609 { 610 // set the bit(s) for passed in reason 611 newThrottleCause |= newReason; 612 } 613 // else no change 614 } 615 else // no longer throttled due to newReason 616 { 617 if ((newReason & throttleCause) != 0) 618 { 619 // clear the bit(s) for passed in reason 620 newThrottleCause &= ~newReason; 621 } 622 // else no change 623 } 624 625 if (newThrottleCause != throttleCause) 626 { 627 if (newThrottleCause == THROTTLED_NONE) 628 { 629 lg2::debug( 630 "updateThrottle: OCC{INST} no longer throttled (prior reason: {REASON})", 631 "INST", instance, "REASON", throttleCause); 632 throttleCause = THROTTLED_NONE; 633 throttleHandle->throttled(false); 634 throttleHandle->throttleCauses({}); 635 } 636 else 637 { 638 lg2::debug( 639 "updateThrottle: OCC{INST} is throttled with reason {REASON} (prior reason: {PRIOR})", 640 "INST", instance, "REASON", newThrottleCause, "PRIOR", 641 throttleCause); 642 throttleCause = newThrottleCause; 643 644 std::vector<ThrottleObj::ThrottleReasons> updatedCauses; 645 if (throttleCause & THROTTLED_POWER) 646 { 647 updatedCauses.push_back( 648 throttleHandle->ThrottleReasons::PowerLimit); 649 } 650 if (throttleCause & THROTTLED_THERMAL) 651 { 652 updatedCauses.push_back( 653 throttleHandle->ThrottleReasons::ThermalLimit); 654 } 655 if (throttleCause & THROTTLED_SAFE) 656 { 657 updatedCauses.push_back( 658 throttleHandle->ThrottleReasons::ManagementDetectedFault); 659 } 660 throttleHandle->throttleCauses(updatedCauses); 661 throttleHandle->throttled(true); 662 } 663 } 664 // else no change to throttle status 665 } 666 667 // Get processor path associated with this OCC 668 void Status::readProcAssociation() 669 { 670 std::string managingPath = path + "/power_managing"; 671 lg2::debug("readProcAssociation: getting endpoints for {MANAGE} ({PATH})", 672 "MANAGE", managingPath, "PATH", path); 673 try 674 { 675 utils::PropertyValue procPathProperty{}; 676 procPathProperty = utils::getProperty( 677 managingPath, "xyz.openbmc_project.Association", "endpoints"); 678 auto result = std::get<std::vector<std::string>>(procPathProperty); 679 if (result.size() > 0) 680 { 681 procPath = result[0]; 682 lg2::info("readProcAssociation: OCC{INST} has proc={PATH}", "INST", 683 instance, "PATH", procPath); 684 } 685 else 686 { 687 lg2::error( 688 "readProcAssociation: No processor associated with OCC{INST} / {PATH}", 689 "INST", instance, "PATH", path); 690 } 691 } 692 catch (const sdbusplus::exception_t& e) 693 { 694 lg2::error( 695 "readProcAssociation: Unable to get proc assocated with {PATH} - {ERROR}", 696 "PATH", path, "ERROR", e.what()); 697 procPath = {}; 698 } 699 } 700 701 } // namespace occ 702 } // namespace open_power 703