1 #include "occ_status.hpp" 2 3 #include "occ_manager.hpp" 4 #include "occ_sensor.hpp" 5 #include "powermode.hpp" 6 #include "utils.hpp" 7 8 #include <phosphor-logging/lg2.hpp> 9 10 #include <filesystem> 11 12 namespace open_power 13 { 14 namespace occ 15 { 16 17 using namespace phosphor::logging; 18 19 using ThrottleObj = 20 sdbusplus::xyz::openbmc_project::Control::Power::server::Throttle; 21 22 // Handles updates to occActive property occActive(bool value)23 bool Status::occActive(bool value) 24 { 25 if (value != this->occActive()) 26 { 27 lg2::info("Status::occActive OCC{INST} changed to {STATE}", "INST", 28 instance, "STATE", value); 29 if (value) 30 { 31 // Clear prior throttle reason (before setting device active) 32 updateThrottle(false, THROTTLED_ALL); 33 34 // Set the device active 35 device.setActive(true); 36 37 // Reset last OCC state 38 lastState = 0; 39 40 // Start watching for errors (throttles, etc) 41 try 42 { 43 addErrorWatch(); 44 } 45 catch (const OpenFailure& e) 46 { 47 // Failed to add watch for throttle events, request reset to try 48 // to recover comm 49 lg2::error( 50 "Status::occActive: Unable to add error watch(s) for OCC{INST} watch: {ERROR}", 51 "INST", instance, "ERROR", e.what()); 52 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH)); 53 return Base::Status::occActive(false); 54 } 55 56 // Update the OCC active sensor 57 Base::Status::occActive(value); 58 59 if (device.master()) 60 { 61 // Update powercap bounds from OCC 62 manager.updatePcapBounds(); 63 } 64 65 // Call into Manager to let know that we have bound 66 if (this->managerCallBack) 67 { 68 this->managerCallBack(instance, value); 69 } 70 } 71 else 72 { 73 #ifdef POWER10 74 if (pmode && device.master()) 75 { 76 // Prevent mode changes 77 pmode->setMasterActive(false); 78 } 79 if (safeStateDelayTimer.isEnabled()) 80 { 81 // stop safe delay timer 82 safeStateDelayTimer.setEnabled(false); 83 } 84 #endif 85 // Call into Manager to let know that we will unbind. 86 if (this->managerCallBack) 87 { 88 this->managerCallBack(instance, value); 89 } 90 91 // Stop watching for errors 92 removeErrorWatch(); 93 94 // Set the device inactive 95 device.setActive(false); 96 97 // Clear throttles (OCC not active after disabling device) 98 updateThrottle(false, THROTTLED_ALL); 99 } 100 } 101 else if (value && !device.active()) 102 { 103 // Existing error watch is on a dead file descriptor. 104 removeErrorWatch(); 105 106 /* 107 * In it's constructor, Status checks Device::bound() to see if OCC is 108 * active or not. 109 * Device::bound() checks for occX-dev0 directory. 110 * We will lose occX-dev0 directories during FSI rescan. 111 * So, if we start this application (and construct Status), and then 112 * later do FSI rescan, we will end up with occActive = true and device 113 * NOT bound. Lets correct that situation here. 114 */ 115 device.setActive(true); 116 117 // Add error watch again 118 try 119 { 120 addErrorWatch(); 121 } 122 catch (const OpenFailure& e) 123 { 124 // Failed to add watch for throttle events, request reset to try to 125 // recover comm 126 lg2::error( 127 "Status::occActive: Unable to add error watch(s) again for OCC{INST} watch: {ERROR}", 128 "INST", instance, "ERROR", e.what()); 129 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH)); 130 return Base::Status::occActive(false); 131 } 132 } 133 else if (!value && device.active()) 134 { 135 removeErrorWatch(); 136 137 // In the event that the application never receives the active signal 138 // even though the OCC is active (this can occur if the BMC is rebooted 139 // with the host on, since the initial OCC driver probe will discover 140 // the OCCs), this application needs to be able to unbind the device 141 // when we get the OCC inactive signal. 142 device.setActive(false); 143 } 144 return Base::Status::occActive(value); 145 } 146 147 // Callback handler when a device error is reported. deviceError(Error::Descriptor d)148 void Status::deviceError(Error::Descriptor d) 149 { 150 #ifdef POWER10 151 if (pmode && device.master()) 152 { 153 // Prevent mode changes 154 pmode->setMasterActive(false); 155 } 156 #endif 157 158 if (d.log) 159 { 160 FFDC::createOCCResetPEL(instance, d.path, d.err, d.callout, 161 d.isInventoryCallout); 162 } 163 164 // This would deem OCC inactive 165 this->occActive(false); 166 167 // Reset the OCC 168 this->resetOCC(); 169 } 170 171 // Sends message to host control command handler to reset OCC resetOCC()172 void Status::resetOCC() 173 { 174 lg2::info(">>Status::resetOCC() - requesting reset for OCC{INST}", "INST", 175 instance); 176 this->occActive(false); 177 #ifdef PLDM 178 if (resetCallBack) 179 { 180 this->resetCallBack(instance); 181 } 182 #else 183 constexpr auto CONTROL_HOST_PATH = "/org/open_power/control/host0"; 184 constexpr auto CONTROL_HOST_INTF = "org.open_power.Control.Host"; 185 186 // This will throw exception on failure 187 auto service = utils::getService(CONTROL_HOST_PATH, CONTROL_HOST_INTF); 188 189 auto& bus = utils::getBus(); 190 auto method = bus.new_method_call(service.c_str(), CONTROL_HOST_PATH, 191 CONTROL_HOST_INTF, "Execute"); 192 // OCC Reset control command 193 method.append(convertForMessage(Control::Host::Command::OCCReset).c_str()); 194 195 // OCC Sensor ID for callout reasons 196 method.append(std::variant<uint8_t>(std::get<0>(sensorMap.at(instance)))); 197 bus.call_noreply(method); 198 return; 199 #endif 200 } 201 202 // Handler called by Host control command handler to convey the 203 // status of the executed command hostControlEvent(sdbusplus::message_t & msg)204 void Status::hostControlEvent(sdbusplus::message_t& msg) 205 { 206 std::string cmdCompleted{}; 207 std::string cmdStatus{}; 208 209 msg.read(cmdCompleted, cmdStatus); 210 211 lg2::debug("Host control signal values: command={CMD}, status={STATUS}", 212 "CMD", cmdCompleted, "STATUS", cmdStatus); 213 214 if (Control::Host::convertResultFromString(cmdStatus) != 215 Control::Host::Result::Success) 216 { 217 if (Control::Host::convertCommandFromString(cmdCompleted) == 218 Control::Host::Command::OCCReset) 219 { 220 // Must be a Timeout. Log an Error trace 221 lg2::error( 222 "Error resetting the OCC: path={PATH}, sensorid={SENSOR}", 223 "PATH", path, "SENSOR", std::get<0>(sensorMap.at(instance))); 224 } 225 } 226 return; 227 } 228 229 // Called from Manager::pollerTimerExpired() in preperation to POLL OCC. readOccState()230 void Status::readOccState() 231 { 232 if (stateValid) 233 { 234 // Reset retry count (since state is good) 235 currentOccReadRetriesCount = occReadRetries; 236 } 237 occReadStateNow(); 238 } 239 240 #ifdef POWER10 241 // Special processing that needs to happen once the OCCs change to ACTIVE state occsWentActive()242 void Status::occsWentActive() 243 { 244 CmdStatus status = CmdStatus::SUCCESS; 245 246 // IPS data will get sent automatically after a mode change if the mode 247 // supports it. 248 pmode->needToSendIPS(); 249 250 status = pmode->sendModeChange(); 251 if (status != CmdStatus::SUCCESS) 252 { 253 lg2::error( 254 "Status::occsWentActive: OCC mode change failed with status {STATUS}", 255 "STATUS", status); 256 257 // Disable and reset to try recovering 258 deviceError(); 259 } 260 } 261 262 // Send Ambient and Altitude to the OCC sendAmbient(const uint8_t inTemp,const uint16_t inAltitude)263 CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude) 264 { 265 CmdStatus status = CmdStatus::FAILURE; 266 bool ambientValid = true; 267 uint8_t ambientTemp = inTemp; 268 uint16_t altitude = inAltitude; 269 270 if (ambientTemp == 0xFF) 271 { 272 // Get latest readings from manager 273 manager.getAmbientData(ambientValid, ambientTemp, altitude); 274 lg2::debug( 275 "sendAmbient: valid: {VALID}, Ambient: {TEMP}C, altitude: {ALT}m", 276 "VALID", ambientValid, "TEMP", ambientTemp, "ALT", altitude); 277 } 278 279 std::vector<std::uint8_t> cmd, rsp; 280 cmd.reserve(11); 281 cmd.push_back(uint8_t(CmdType::SEND_AMBIENT)); 282 cmd.push_back(0x00); // Data Length (2 bytes) 283 cmd.push_back(0x08); // 284 cmd.push_back(0x00); // Version 285 cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status 286 cmd.push_back(ambientTemp); // Ambient Temperature 287 cmd.push_back(altitude >> 8); // Altitude in meters (2 bytes) 288 cmd.push_back(altitude & 0xFF); // 289 cmd.push_back(0x00); // Reserved (3 bytes) 290 cmd.push_back(0x00); 291 cmd.push_back(0x00); 292 lg2::debug("sendAmbient: SEND_AMBIENT " 293 "command to OCC{INST} ({SIZE} bytes)", 294 "INST", instance, "SIZE", cmd.size()); 295 status = occCmd.send(cmd, rsp); 296 if (status == CmdStatus::SUCCESS) 297 { 298 if (rsp.size() == 5) 299 { 300 if (RspStatus::SUCCESS != RspStatus(rsp[2])) 301 { 302 lg2::error( 303 "sendAmbient: SEND_AMBIENT failed with rspStatus {STATUS}", 304 "STATUS", lg2::hex, rsp[2]); 305 dump_hex(rsp); 306 status = CmdStatus::FAILURE; 307 } 308 } 309 else 310 { 311 lg2::error( 312 "sendAmbient: INVALID SEND_AMBIENT response length:{SIZE}", 313 "SIZE", rsp.size()); 314 dump_hex(rsp); 315 status = CmdStatus::FAILURE; 316 } 317 } 318 else 319 { 320 lg2::error("sendAmbient: SEND_AMBIENT FAILED! with status {STATUS}", 321 "STATUS", lg2::hex, uint8_t(status)); 322 323 if (status == CmdStatus::COMM_FAILURE) 324 { 325 // Disable due to OCC comm failure and reset to try recovering 326 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH)); 327 } 328 } 329 330 return status; 331 } 332 333 // Called when safe timer expires to determine if OCCs need to be reset safeStateDelayExpired()334 void Status::safeStateDelayExpired() 335 { 336 if (this->occActive()) 337 { 338 lg2::info( 339 "safeStateDelayExpired: OCC{INST} state missing or not valid, requesting reset", 340 "INST", instance); 341 // Disable and reset to try recovering 342 deviceError(Error::Descriptor(SAFE_ERROR_PATH)); 343 } 344 } 345 #endif // POWER10 346 getHwmonPath()347 fs::path Status::getHwmonPath() 348 { 349 using namespace std::literals::string_literals; 350 351 if (!fs::exists(hwmonPath)) 352 { 353 static bool tracedFail[8] = {0}; 354 355 if (!hwmonPath.empty()) 356 { 357 lg2::warning( 358 "Status::getHwmonPath(): path no longer exists: {PATH}", "PATH", 359 hwmonPath); 360 hwmonPath.clear(); 361 } 362 363 // Build the base HWMON path 364 fs::path prefixPath = 365 fs::path{OCC_HWMON_PATH + "occ-hwmon."s + 366 std::to_string(instance + 1) + "/hwmon/"s}; 367 368 // Get the hwmonXX directory name 369 try 370 { 371 // there should only be one directory 372 const int numDirs = std::distance( 373 fs::directory_iterator(prefixPath), fs::directory_iterator{}); 374 if (numDirs == 1) 375 { 376 hwmonPath = *fs::directory_iterator(prefixPath); 377 tracedFail[instance] = false; 378 } 379 else 380 { 381 if (!tracedFail[instance]) 382 { 383 lg2::error( 384 "Status::getHwmonPath(): Found multiple ({NUM}) hwmon paths!", 385 "NUM", numDirs); 386 tracedFail[instance] = true; 387 } 388 } 389 } 390 catch (const fs::filesystem_error& e) 391 { 392 if (!tracedFail[instance]) 393 { 394 lg2::error( 395 "Status::getHwmonPath(): error accessing {PATH}: {ERROR}", 396 "PATH", prefixPath, "ERROR", e.what()); 397 tracedFail[instance] = true; 398 } 399 } 400 } 401 402 return hwmonPath; 403 } 404 405 // Called to read state and handle any errors occReadStateNow()406 void Status::occReadStateNow() 407 { 408 unsigned int state; 409 const fs::path filename = 410 fs::path(DEV_PATH) / 411 fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state"; 412 413 std::ifstream file; 414 bool goodFile = false; 415 416 // open file. 417 file.open(filename, std::ios::in); 418 const int openErrno = errno; 419 420 // File is open and state can be used. 421 if (file.is_open() && file.good()) 422 { 423 goodFile = true; 424 file >> state; 425 // Read the error code (if any) to check status of the read 426 std::ios_base::iostate readState = file.rdstate(); 427 if (readState) 428 { 429 // There was a failure reading the file 430 if (lastOccReadStatus != -1) 431 { 432 // Trace error bits 433 std::string errorBits = ""; 434 if (readState & std::ios_base::eofbit) 435 { 436 errorBits += " EOF"; 437 } 438 if (readState & std::ios_base::failbit) 439 { 440 errorBits += " failbit"; 441 } 442 if (readState & std::ios_base::badbit) 443 { 444 errorBits += " badbit"; 445 } 446 lg2::error( 447 "readOccState: Failed to read OCC{INST} state: Read error on I/O operation - {ERROR}", 448 "INST", instance, "ERROR", errorBits); 449 lastOccReadStatus = -1; 450 } 451 goodFile = false; 452 } 453 454 if (goodFile && (state != lastState)) 455 { 456 // Trace OCC state changes 457 lg2::info( 458 "Status::readOccState: OCC{INST} state {STATE} (lastState: {PRIOR})", 459 "INST", instance, "STATE", lg2::hex, state, "PRIOR", lg2::hex, 460 lastState); 461 lastState = state; 462 #ifdef POWER10 463 if (OccState(state) == OccState::ACTIVE) 464 { 465 if (pmode && device.master()) 466 { 467 // Set the master OCC on the PowerMode object 468 pmode->setMasterOcc(path); 469 // Enable mode changes 470 pmode->setMasterActive(); 471 472 // Special processing by master OCC when it goes active 473 occsWentActive(); 474 } 475 476 CmdStatus status = sendAmbient(); 477 if (status != CmdStatus::SUCCESS) 478 { 479 lg2::error( 480 "readOccState: Sending Ambient failed with status {STATUS}", 481 "STATUS", status); 482 } 483 } 484 485 // If OCC in known Good State. 486 if ((OccState(state) == OccState::ACTIVE) || 487 (OccState(state) == OccState::CHARACTERIZATION) || 488 (OccState(state) == OccState::OBSERVATION)) 489 { 490 // Good OCC State then sensors valid again 491 stateValid = true; 492 493 if (safeStateDelayTimer.isEnabled()) 494 { 495 // stop safe delay timer (no longer in SAFE state) 496 safeStateDelayTimer.setEnabled(false); 497 } 498 } 499 else 500 { 501 // OCC is in SAFE or some other unsupported state 502 if (!safeStateDelayTimer.isEnabled()) 503 { 504 lg2::error( 505 "readOccState: Invalid OCC{INST} state of {STATE}, starting safe state delay timer", 506 "INST", instance, "STATE", state); 507 // start safe delay timer (before requesting reset) 508 using namespace std::literals::chrono_literals; 509 safeStateDelayTimer.restartOnce(60s); 510 } 511 // Not a supported state (update sensors to NaN and not 512 // functional) 513 stateValid = false; 514 } 515 #else 516 // Before P10 state not checked, only used good file open. 517 stateValid = true; 518 #endif 519 } 520 } 521 #ifdef POWER10 522 else 523 { 524 // Unable to read state 525 stateValid = false; 526 } 527 #endif 528 file.close(); 529 530 // if failed to Read a state or not a valid state -> Attempt retry 531 // after 1 Second delay if allowed. 532 if ((!goodFile) || (!stateValid)) 533 { 534 if (!goodFile) 535 { 536 // If not able to read, OCC may be offline 537 if (openErrno != lastOccReadStatus) 538 { 539 lg2::error( 540 "Status::readOccState: open/read failed trying to read OCC{INST} state (open errno={ERROR})", 541 "INST", instance, "ERROR", openErrno); 542 lastOccReadStatus = openErrno; 543 } 544 } 545 else 546 { 547 // else this failed due to state not valid. 548 if (state != lastState) 549 { 550 lg2::error( 551 "Status::readOccState: OCC{INST} Invalid state {STATE} (last state: {PRIOR})", 552 "INST", instance, "STATE", lg2::hex, state, "PRIOR", 553 lg2::hex, lastState); 554 } 555 } 556 557 #ifdef READ_OCC_SENSORS 558 manager.setSensorValueToNaN(instance); 559 #endif 560 561 // See occReadRetries for number of retry attempts. 562 if (currentOccReadRetriesCount > 0) 563 { 564 --currentOccReadRetriesCount; 565 } 566 else 567 { 568 lg2::error("readOccState: failed to read OCC{INST} state!", "INST", 569 instance); 570 571 // State could not be determined, set it to NO State. 572 lastState = 0; 573 574 // Disable the ability to send Failed actions until OCC is 575 // Active again. 576 stateValid = false; 577 578 // Disable due to OCC comm failure and reset to try recovering 579 // (processor callout will be added) 580 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH, ECOMM, 581 procPath.c_str(), true)); 582 583 // Reset retry count (for next attempt after recovery) 584 currentOccReadRetriesCount = occReadRetries; 585 } 586 } 587 else 588 { 589 if (lastOccReadStatus != 0) 590 { 591 lg2::info( 592 "Status::readOccState: successfully read OCC{INST} state: {STATE}", 593 "INST", instance, "STATE", state); 594 lastOccReadStatus = 0; // no error 595 } 596 } 597 } 598 599 // Update processor throttle status on dbus updateThrottle(const bool isThrottled,const uint8_t newReason)600 void Status::updateThrottle(const bool isThrottled, const uint8_t newReason) 601 { 602 if (!throttleHandle) 603 { 604 return; 605 } 606 607 uint8_t newThrottleCause = throttleCause; 608 609 if (isThrottled) // throttled due to newReason 610 { 611 if ((newReason & throttleCause) == 0) 612 { 613 // set the bit(s) for passed in reason 614 newThrottleCause |= newReason; 615 } 616 // else no change 617 } 618 else // no longer throttled due to newReason 619 { 620 if ((newReason & throttleCause) != 0) 621 { 622 // clear the bit(s) for passed in reason 623 newThrottleCause &= ~newReason; 624 } 625 // else no change 626 } 627 628 if (newThrottleCause != throttleCause) 629 { 630 if (newThrottleCause == THROTTLED_NONE) 631 { 632 lg2::debug( 633 "updateThrottle: OCC{INST} no longer throttled (prior reason: {REASON})", 634 "INST", instance, "REASON", throttleCause); 635 throttleCause = THROTTLED_NONE; 636 throttleHandle->throttled(false); 637 throttleHandle->throttleCauses({}); 638 } 639 else 640 { 641 lg2::debug( 642 "updateThrottle: OCC{INST} is throttled with reason {REASON} (prior reason: {PRIOR})", 643 "INST", instance, "REASON", newThrottleCause, "PRIOR", 644 throttleCause); 645 throttleCause = newThrottleCause; 646 647 std::vector<ThrottleObj::ThrottleReasons> updatedCauses; 648 if (throttleCause & THROTTLED_POWER) 649 { 650 updatedCauses.push_back( 651 throttleHandle->ThrottleReasons::PowerLimit); 652 } 653 if (throttleCause & THROTTLED_THERMAL) 654 { 655 updatedCauses.push_back( 656 throttleHandle->ThrottleReasons::ThermalLimit); 657 } 658 if (throttleCause & THROTTLED_SAFE) 659 { 660 updatedCauses.push_back( 661 throttleHandle->ThrottleReasons::ManagementDetectedFault); 662 } 663 throttleHandle->throttleCauses(updatedCauses); 664 throttleHandle->throttled(true); 665 } 666 } 667 // else no change to throttle status 668 } 669 670 // Get processor path associated with this OCC readProcAssociation()671 void Status::readProcAssociation() 672 { 673 std::string managingPath = path + "/power_managing"; 674 lg2::debug("readProcAssociation: getting endpoints for {MANAGE} ({PATH})", 675 "MANAGE", managingPath, "PATH", path); 676 try 677 { 678 utils::PropertyValue procPathProperty{}; 679 procPathProperty = utils::getProperty( 680 managingPath, "xyz.openbmc_project.Association", "endpoints"); 681 auto result = std::get<std::vector<std::string>>(procPathProperty); 682 if (result.size() > 0) 683 { 684 procPath = result[0]; 685 lg2::info("readProcAssociation: OCC{INST} has proc={PATH}", "INST", 686 instance, "PATH", procPath); 687 } 688 else 689 { 690 lg2::error( 691 "readProcAssociation: No processor associated with OCC{INST} / {PATH}", 692 "INST", instance, "PATH", path); 693 } 694 } 695 catch (const sdbusplus::exception_t& e) 696 { 697 lg2::error( 698 "readProcAssociation: Unable to get proc assocated with {PATH} - {ERROR}", 699 "PATH", path, "ERROR", e.what()); 700 procPath = {}; 701 } 702 } 703 704 } // namespace occ 705 } // namespace open_power 706