1 /** 2 * Copyright © 2017 IBM Corporation 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #include "config.h" 17 18 #include "power_supply.hpp" 19 20 #include "elog-errors.hpp" 21 #include "gpio.hpp" 22 #include "names_values.hpp" 23 #include "pmbus.hpp" 24 #include "types.hpp" 25 #include "utility.hpp" 26 27 #include <org/open_power/Witherspoon/Fault/error.hpp> 28 #include <phosphor-logging/lg2.hpp> 29 #include <xyz/openbmc_project/Common/Device/error.hpp> 30 31 #include <functional> 32 33 namespace phosphor 34 { 35 namespace power 36 { 37 namespace psu 38 { 39 40 using namespace phosphor::logging; 41 using namespace sdbusplus::org::open_power::Witherspoon::Fault::Error; 42 using namespace sdbusplus::xyz::openbmc_project::Common::Device::Error; 43 44 #ifdef __clang__ 45 #pragma clang diagnostic push 46 #pragma clang diagnostic ignored "-Wpessimizing-move" 47 #endif 48 PowerSupply::PowerSupply(const std::string& name, size_t inst, 49 const std::string& objpath, const std::string& invpath, 50 sdbusplus::bus_t& bus, const sdeventplus::Event& e, 51 std::chrono::seconds& t, std::chrono::seconds& p) : 52 Device(name, inst), monitorPath(objpath), pmbusIntf(objpath), 53 inventoryPath(INVENTORY_OBJ_PATH + invpath), bus(bus), presentInterval(p), 54 presentTimer(e, std::bind([this]() { 55 // The hwmon path may have changed. 56 pmbusIntf.findHwmonDir(); 57 this->present = true; 58 59 // Sync the INPUT_HISTORY data for all PSs 60 syncHistory(); 61 62 // Update the inventory for the new device 63 updateInventory(); 64 })), 65 powerOnInterval(t), 66 powerOnTimer(e, std::bind([this]() { this->powerOn = true; })) 67 { 68 getAccessType(); 69 70 using namespace sdbusplus::bus; 71 using namespace phosphor::pmbus; 72 std::uint16_t statusWord = 0; 73 try 74 { 75 // Read the 2 byte STATUS_WORD value to check for faults. 76 statusWord = pmbusIntf.read(STATUS_WORD, Type::Debug); 77 if (!((statusWord & status_word::INPUT_FAULT_WARN) || 78 (statusWord & status_word::VIN_UV_FAULT))) 79 { 80 resolveError(inventoryPath, 81 std::string(PowerSupplyInputFault::errName)); 82 } 83 } 84 catch (const ReadFailure& e) 85 { 86 lg2::info("Unable to read the 2 byte STATUS_WORD value to check " 87 "for power-supply input faults."); 88 } 89 presentMatch = std::make_unique<match_t>( 90 bus, match::rules::propertiesChanged(inventoryPath, INVENTORY_IFACE), 91 [this](auto& msg) { this->inventoryChanged(msg); }); 92 // Get initial presence state. 93 updatePresence(); 94 95 // Write the SN, PN, etc to the inventory 96 updateInventory(); 97 98 // Subscribe to power state changes 99 powerOnMatch = std::make_unique<match_t>( 100 bus, match::rules::propertiesChanged(POWER_OBJ_PATH, POWER_IFACE), 101 [this](auto& msg) { this->powerStateChanged(msg); }); 102 // Get initial power state. 103 updatePowerState(); 104 } 105 #ifdef __clang__ 106 #pragma clang diagnostic pop 107 #endif 108 109 void PowerSupply::getAccessType() 110 { 111 using namespace phosphor::power::util; 112 fruJson = loadJSONFromFile(PSU_JSON_PATH); 113 if (fruJson == nullptr) 114 { 115 lg2::error("InternalFailure when parsing the JSON file"); 116 return; 117 } 118 inventoryPMBusAccessType = getPMBusAccessType(fruJson); 119 } 120 121 void PowerSupply::captureCmd(util::NamesValues& nv, const std::string& cmd, 122 phosphor::pmbus::Type type) 123 { 124 if (pmbusIntf.exists(cmd, type)) 125 { 126 try 127 { 128 auto val = pmbusIntf.read(cmd, type); 129 nv.add(cmd, val); 130 } 131 catch (const std::exception& e) 132 { 133 lg2::info("Unable to capture metadata, CMD={CMD}", "CMD", cmd); 134 } 135 } 136 } 137 138 void PowerSupply::analyze() 139 { 140 using namespace phosphor::pmbus; 141 142 try 143 { 144 if (present) 145 { 146 std::uint16_t statusWord = 0; 147 148 // Read the 2 byte STATUS_WORD value to check for faults. 149 statusWord = pmbusIntf.read(STATUS_WORD, Type::Debug); 150 readFail = 0; 151 152 checkInputFault(statusWord); 153 154 if (powerOn && (inputFault == 0) && !faultFound) 155 { 156 checkFanFault(statusWord); 157 checkTemperatureFault(statusWord); 158 checkOutputOvervoltageFault(statusWord); 159 checkCurrentOutOverCurrentFault(statusWord); 160 checkPGOrUnitOffFault(statusWord); 161 } 162 163 updateHistory(); 164 } 165 } 166 catch (const ReadFailure& e) 167 { 168 if (readFail < FAULT_COUNT) 169 { 170 readFail++; 171 } 172 173 if (!readFailLogged && readFail >= FAULT_COUNT) 174 { 175 commit<ReadFailure>(); 176 readFailLogged = true; 177 } 178 } 179 180 return; 181 } 182 183 void PowerSupply::inventoryChanged(sdbusplus::message_t& msg) 184 { 185 std::string msgSensor; 186 std::map<std::string, std::variant<uint32_t, bool>> msgData; 187 msg.read(msgSensor, msgData); 188 189 // Check if it was the Present property that changed. 190 auto valPropMap = msgData.find(PRESENT_PROP); 191 if (valPropMap != msgData.end()) 192 { 193 if (std::get<bool>(valPropMap->second)) 194 { 195 clearFaults(); 196 presentTimer.restartOnce(presentInterval); 197 } 198 else 199 { 200 present = false; 201 presentTimer.setEnabled(false); 202 203 // Clear out the now outdated inventory properties 204 updateInventory(); 205 } 206 } 207 208 return; 209 } 210 211 void PowerSupply::updatePresence() 212 { 213 // Use getProperty utility function to get presence status. 214 std::string service = "xyz.openbmc_project.Inventory.Manager"; 215 util::getProperty(INVENTORY_IFACE, PRESENT_PROP, inventoryPath, service, 216 bus, this->present); 217 } 218 219 void PowerSupply::powerStateChanged(sdbusplus::message_t& msg) 220 { 221 int32_t state = 0; 222 std::string msgSensor; 223 std::map<std::string, std::variant<int32_t>> msgData; 224 msg.read(msgSensor, msgData); 225 226 // Check if it was the Present property that changed. 227 auto valPropMap = msgData.find("state"); 228 if (valPropMap != msgData.end()) 229 { 230 state = std::get<int32_t>(valPropMap->second); 231 232 // Power is on when state=1. Set the fault logged variables to false 233 // and start the power on timer when the state changes to 1. 234 if (state) 235 { 236 clearFaults(); 237 powerOnTimer.restartOnce(powerOnInterval); 238 } 239 else 240 { 241 powerOnTimer.setEnabled(false); 242 powerOn = false; 243 } 244 } 245 } 246 247 void PowerSupply::updatePowerState() 248 { 249 powerOn = util::isPoweredOn(bus); 250 } 251 252 void PowerSupply::checkInputFault(const uint16_t statusWord) 253 { 254 using namespace phosphor::pmbus; 255 256 if ((inputFault < FAULT_COUNT) && 257 ((statusWord & status_word::INPUT_FAULT_WARN) || 258 (statusWord & status_word::VIN_UV_FAULT))) 259 { 260 if (inputFault == 0) 261 { 262 lg2::info("INPUT or VIN_UV fault, STATUS_WORD={STATUS_WORD}", 263 "STATUS_WORD", lg2::hex | lg2::field16, statusWord); 264 } 265 266 inputFault++; 267 } 268 else 269 { 270 if ((inputFault > 0) && !(statusWord & status_word::INPUT_FAULT_WARN) && 271 !(statusWord & status_word::VIN_UV_FAULT)) 272 { 273 inputFault = 0; 274 faultFound = false; 275 // When an input fault occurs, the power supply cannot be on. 276 // However, the check for the case where the power supply should be 277 // on will stop when there is a fault found. 278 // Clear the powerOnFault when the inputFault is cleared to reset 279 // the powerOnFault de-glitching. 280 powerOnFault = 0; 281 282 lg2::info("INPUT_FAULT_WARN cleared, POWERSUPPLY={POWERSUPPLY}", 283 "POWERSUPPLY", inventoryPath); 284 285 resolveError(inventoryPath, 286 std::string(PowerSupplyInputFault::errName)); 287 288 if (powerOn) 289 { 290 // The power supply will not be immediately powered on after 291 // the input power is restored. 292 powerOn = false; 293 // Start up the timer that will set the state to indicate we 294 // are ready for the powered on fault checks. 295 powerOnTimer.restartOnce(powerOnInterval); 296 } 297 } 298 } 299 300 if (!faultFound && (inputFault >= FAULT_COUNT)) 301 { 302 // If the power is on, report the fault in an error log entry. 303 if (powerOn) 304 { 305 util::NamesValues nv; 306 nv.add("STATUS_WORD", statusWord); 307 captureCmd(nv, STATUS_INPUT, Type::Debug); 308 309 using metadata = 310 org::open_power::Witherspoon::Fault::PowerSupplyInputFault; 311 312 report<PowerSupplyInputFault>( 313 metadata::RAW_STATUS(nv.get().c_str()), 314 metadata::CALLOUT_INVENTORY_PATH(inventoryPath.c_str())); 315 316 faultFound = true; 317 } 318 } 319 } 320 321 void PowerSupply::checkPGOrUnitOffFault(const uint16_t statusWord) 322 { 323 using namespace phosphor::pmbus; 324 325 if (powerOnFault < FAULT_COUNT) 326 { 327 // Check PG# and UNIT_IS_OFF 328 if ((statusWord & status_word::POWER_GOOD_NEGATED) || 329 (statusWord & status_word::UNIT_IS_OFF)) 330 { 331 lg2::info("PGOOD or UNIT_IS_OFF bit bad, STATUS_WORD={STATUS_WORD}", 332 "STATUS_WORD", lg2::hex | lg2::field16, statusWord); 333 powerOnFault++; 334 } 335 else 336 { 337 if (powerOnFault > 0) 338 { 339 lg2::info("PGOOD and UNIT_IS_OFF bits good"); 340 powerOnFault = 0; 341 } 342 } 343 344 if (!faultFound && (powerOnFault >= FAULT_COUNT)) 345 { 346 faultFound = true; 347 348 util::NamesValues nv; 349 nv.add("STATUS_WORD", statusWord); 350 captureCmd(nv, STATUS_INPUT, Type::Debug); 351 auto status0Vout = pmbusIntf.insertPageNum(STATUS_VOUT, 0); 352 captureCmd(nv, status0Vout, Type::Debug); 353 captureCmd(nv, STATUS_IOUT, Type::Debug); 354 captureCmd(nv, STATUS_MFR, Type::Debug); 355 356 using metadata = 357 org::open_power::Witherspoon::Fault::PowerSupplyShouldBeOn; 358 359 // A power supply is OFF (or pgood low) but should be on. 360 report<PowerSupplyShouldBeOn>( 361 metadata::RAW_STATUS(nv.get().c_str()), 362 metadata::CALLOUT_INVENTORY_PATH(inventoryPath.c_str())); 363 } 364 } 365 } 366 367 void PowerSupply::checkCurrentOutOverCurrentFault(const uint16_t statusWord) 368 { 369 using namespace phosphor::pmbus; 370 371 if (outputOCFault < FAULT_COUNT) 372 { 373 // Check for an output overcurrent fault. 374 if ((statusWord & status_word::IOUT_OC_FAULT)) 375 { 376 outputOCFault++; 377 } 378 else 379 { 380 if (outputOCFault > 0) 381 { 382 outputOCFault = 0; 383 } 384 } 385 386 if (!faultFound && (outputOCFault >= FAULT_COUNT)) 387 { 388 util::NamesValues nv; 389 nv.add("STATUS_WORD", statusWord); 390 captureCmd(nv, STATUS_INPUT, Type::Debug); 391 auto status0Vout = pmbusIntf.insertPageNum(STATUS_VOUT, 0); 392 captureCmd(nv, status0Vout, Type::Debug); 393 captureCmd(nv, STATUS_IOUT, Type::Debug); 394 captureCmd(nv, STATUS_MFR, Type::Debug); 395 396 using metadata = org::open_power::Witherspoon::Fault:: 397 PowerSupplyOutputOvercurrent; 398 399 report<PowerSupplyOutputOvercurrent>( 400 metadata::RAW_STATUS(nv.get().c_str()), 401 metadata::CALLOUT_INVENTORY_PATH(inventoryPath.c_str())); 402 403 faultFound = true; 404 } 405 } 406 } 407 408 void PowerSupply::checkOutputOvervoltageFault(const uint16_t statusWord) 409 { 410 using namespace phosphor::pmbus; 411 412 if (outputOVFault < FAULT_COUNT) 413 { 414 // Check for an output overvoltage fault. 415 if (statusWord & status_word::VOUT_OV_FAULT) 416 { 417 outputOVFault++; 418 } 419 else 420 { 421 if (outputOVFault > 0) 422 { 423 outputOVFault = 0; 424 } 425 } 426 427 if (!faultFound && (outputOVFault >= FAULT_COUNT)) 428 { 429 util::NamesValues nv; 430 nv.add("STATUS_WORD", statusWord); 431 captureCmd(nv, STATUS_INPUT, Type::Debug); 432 auto status0Vout = pmbusIntf.insertPageNum(STATUS_VOUT, 0); 433 captureCmd(nv, status0Vout, Type::Debug); 434 captureCmd(nv, STATUS_IOUT, Type::Debug); 435 captureCmd(nv, STATUS_MFR, Type::Debug); 436 437 using metadata = org::open_power::Witherspoon::Fault:: 438 PowerSupplyOutputOvervoltage; 439 440 report<PowerSupplyOutputOvervoltage>( 441 metadata::RAW_STATUS(nv.get().c_str()), 442 metadata::CALLOUT_INVENTORY_PATH(inventoryPath.c_str())); 443 444 faultFound = true; 445 } 446 } 447 } 448 449 void PowerSupply::checkFanFault(const uint16_t statusWord) 450 { 451 using namespace phosphor::pmbus; 452 453 if (fanFault < FAULT_COUNT) 454 { 455 // Check for a fan fault or warning condition 456 if (statusWord & status_word::FAN_FAULT) 457 { 458 fanFault++; 459 } 460 else 461 { 462 if (fanFault > 0) 463 { 464 fanFault = 0; 465 } 466 } 467 468 if (!faultFound && (fanFault >= FAULT_COUNT)) 469 { 470 util::NamesValues nv; 471 nv.add("STATUS_WORD", statusWord); 472 captureCmd(nv, STATUS_MFR, Type::Debug); 473 captureCmd(nv, STATUS_TEMPERATURE, Type::Debug); 474 captureCmd(nv, STATUS_FANS_1_2, Type::Debug); 475 476 using metadata = 477 org::open_power::Witherspoon::Fault::PowerSupplyFanFault; 478 479 report<PowerSupplyFanFault>( 480 metadata::RAW_STATUS(nv.get().c_str()), 481 metadata::CALLOUT_INVENTORY_PATH(inventoryPath.c_str())); 482 483 faultFound = true; 484 } 485 } 486 } 487 488 void PowerSupply::checkTemperatureFault(const uint16_t statusWord) 489 { 490 using namespace phosphor::pmbus; 491 492 // Due to how the PMBus core device driver sends a clear faults command 493 // the bit in STATUS_WORD will likely be cleared when we attempt to examine 494 // it for a Thermal Fault or Warning. So, check the STATUS_WORD and the 495 // STATUS_TEMPERATURE bits. If either indicates a fault, proceed with 496 // logging the over-temperature condition. 497 std::uint8_t statusTemperature = 0; 498 statusTemperature = pmbusIntf.read(STATUS_TEMPERATURE, Type::Debug); 499 if (temperatureFault < FAULT_COUNT) 500 { 501 if ((statusWord & status_word::TEMPERATURE_FAULT_WARN) || 502 (statusTemperature & status_temperature::OT_FAULT)) 503 { 504 temperatureFault++; 505 } 506 else 507 { 508 if (temperatureFault > 0) 509 { 510 temperatureFault = 0; 511 } 512 } 513 514 if (!faultFound && (temperatureFault >= FAULT_COUNT)) 515 { 516 // The power supply has had an over-temperature condition. 517 // This may not result in a shutdown if experienced for a short 518 // duration. 519 // This should not occur under normal conditions. 520 // The power supply may be faulty, or the paired supply may be 521 // putting out less current. 522 // Capture command responses with potentially relevant information, 523 // and call out the power supply reporting the condition. 524 util::NamesValues nv; 525 nv.add("STATUS_WORD", statusWord); 526 captureCmd(nv, STATUS_MFR, Type::Debug); 527 captureCmd(nv, STATUS_IOUT, Type::Debug); 528 nv.add("STATUS_TEMPERATURE", statusTemperature); 529 captureCmd(nv, STATUS_FANS_1_2, Type::Debug); 530 531 using metadata = org::open_power::Witherspoon::Fault:: 532 PowerSupplyTemperatureFault; 533 534 report<PowerSupplyTemperatureFault>( 535 metadata::RAW_STATUS(nv.get().c_str()), 536 metadata::CALLOUT_INVENTORY_PATH(inventoryPath.c_str())); 537 538 faultFound = true; 539 } 540 } 541 } 542 543 void PowerSupply::clearFaults() 544 { 545 readFail = 0; 546 readFailLogged = false; 547 inputFault = 0; 548 powerOnFault = 0; 549 outputOCFault = 0; 550 outputOVFault = 0; 551 fanFault = 0; 552 temperatureFault = 0; 553 faultFound = false; 554 555 return; 556 } 557 558 void PowerSupply::resolveError(const std::string& callout, 559 const std::string& message) 560 { 561 using EndpointList = std::vector<std::string>; 562 563 try 564 { 565 auto path = callout + "/fault"; 566 // Get the service name from the mapper for the fault callout 567 auto service = util::getService(path, ASSOCIATION_IFACE, bus); 568 569 // Use getProperty utility function to get log entries (endpoints) 570 EndpointList logEntries; 571 util::getProperty(ASSOCIATION_IFACE, ENDPOINTS_PROP, path, service, bus, 572 logEntries); 573 574 // It is possible that all such entries for this callout have since 575 // been deleted. 576 if (logEntries.empty()) 577 { 578 return; 579 } 580 581 auto logEntryService = 582 util::getService(logEntries[0], LOGGING_IFACE, bus); 583 if (logEntryService.empty()) 584 { 585 return; 586 } 587 588 // go through each log entry that matches this callout path 589 std::string logMessage; 590 for (const auto& logEntry : logEntries) 591 { 592 // Check to see if this logEntry has a message that matches. 593 util::getProperty(LOGGING_IFACE, MESSAGE_PROP, logEntry, 594 logEntryService, bus, logMessage); 595 596 if (message == logMessage) 597 { 598 // Log entry matches call out and message, set Resolved to true 599 bool resolved = true; 600 util::setProperty(LOGGING_IFACE, RESOLVED_PROP, logEntry, 601 logEntryService, bus, resolved); 602 } 603 } 604 } 605 catch (const std::exception& e) 606 { 607 lg2::info("Failed to resolve error, CALLOUT={CALLOUT}, ERROR={ERROR}", 608 "CALLOUT", callout, "ERROR", message); 609 } 610 } 611 612 void PowerSupply::updateInventory() 613 { 614 using namespace phosphor::pmbus; 615 using namespace sdbusplus::message; 616 617 // Build the object map and send it to the inventory 618 using Properties = std::map<std::string, std::variant<std::string, bool>>; 619 using Interfaces = std::map<std::string, Properties>; 620 using Object = std::map<object_path, Interfaces>; 621 Properties assetProps; 622 Properties operProps; 623 Interfaces interfaces; 624 Object object; 625 626 // If any of these accesses fail, the fields will just be 627 // blank in the inventory. Leave logging ReadFailure errors 628 // to analyze() as it runs continuously and will most 629 // likely hit and threshold them first anyway. The 630 // readString() function will do the tracing of the failing 631 // path so this code doesn't need to. 632 for (const auto& fru : fruJson.at("fruConfigs")) 633 { 634 if (fru.at("interface") == ASSET_IFACE) 635 { 636 try 637 { 638 assetProps.emplace( 639 fru.at("propertyName"), 640 present ? pmbusIntf.readString(fru.at("fileName"), 641 inventoryPMBusAccessType) 642 : ""); 643 } 644 catch (const ReadFailure& e) 645 {} 646 } 647 } 648 649 operProps.emplace(FUNCTIONAL_PROP, present); 650 interfaces.emplace(ASSET_IFACE, std::move(assetProps)); 651 interfaces.emplace(OPERATIONAL_STATE_IFACE, std::move(operProps)); 652 653 // For Notify(), just send the relative path of the inventory 654 // object so remove the INVENTORY_OBJ_PATH prefix 655 auto path = inventoryPath.substr(strlen(INVENTORY_OBJ_PATH)); 656 657 object.emplace(path, std::move(interfaces)); 658 659 try 660 { 661 auto service = 662 util::getService(INVENTORY_OBJ_PATH, INVENTORY_MGR_IFACE, bus); 663 664 if (service.empty()) 665 { 666 lg2::error("Unable to get inventory manager service"); 667 return; 668 } 669 670 auto method = bus.new_method_call(service.c_str(), INVENTORY_OBJ_PATH, 671 INVENTORY_MGR_IFACE, "Notify"); 672 673 method.append(std::move(object)); 674 675 auto reply = bus.call(method); 676 } 677 catch (const std::exception& e) 678 { 679 lg2::error("Exception in updateInventory: {ERROR}, PATH={PATH}", 680 "ERROR", e, "PATH", inventoryPath); 681 } 682 } 683 684 void PowerSupply::syncHistory() 685 { 686 using namespace phosphor::gpio; 687 688 if (syncGPIODevPath.empty()) 689 { 690 // Sync not implemented 691 return; 692 } 693 694 GPIO gpio{syncGPIODevPath, static_cast<gpioNum_t>(syncGPIONumber), 695 Direction::output}; 696 697 try 698 { 699 gpio.set(Value::low); 700 701 std::this_thread::sleep_for(std::chrono::milliseconds{5}); 702 703 gpio.set(Value::high); 704 705 recordManager->clear(); 706 } 707 catch (const std::exception& e) 708 { 709 // Do nothing. There would already be a journal entry. 710 } 711 } 712 713 void PowerSupply::enableHistory( 714 const std::string& objectPath, size_t numRecords, 715 const std::string& syncGPIOPath, size_t syncGPIONum) 716 { 717 historyObjectPath = objectPath; 718 syncGPIODevPath = syncGPIOPath; 719 syncGPIONumber = syncGPIONum; 720 721 recordManager = std::make_unique<history::RecordManager>(numRecords); 722 723 auto avgPath = historyObjectPath + '/' + history::Average::name; 724 auto maxPath = historyObjectPath + '/' + history::Maximum::name; 725 726 average = std::make_unique<history::Average>(bus, avgPath); 727 728 maximum = std::make_unique<history::Maximum>(bus, maxPath); 729 } 730 731 void PowerSupply::updateHistory() 732 { 733 if (!recordManager) 734 { 735 // Not enabled 736 return; 737 } 738 739 // Read just the most recent average/max record 740 auto data = 741 pmbusIntf.readBinary(INPUT_HISTORY, pmbus::Type::HwmonDeviceDebug, 742 history::RecordManager::RAW_RECORD_SIZE); 743 744 // Update D-Bus only if something changed (a new record ID, or cleared out) 745 auto changed = recordManager->add(data); 746 if (changed) 747 { 748 average->values(recordManager->getAverageRecords()); 749 maximum->values(recordManager->getMaximumRecords()); 750 } 751 } 752 753 } // namespace psu 754 } // namespace power 755 } // namespace phosphor 756