14ecdfaaaSHarshit Aghera /* 24ecdfaaaSHarshit Aghera * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & 34ecdfaaaSHarshit Aghera * AFFILIATES. All rights reserved. 44ecdfaaaSHarshit Aghera * SPDX-License-Identifier: Apache-2.0 54ecdfaaaSHarshit Aghera */ 64ecdfaaaSHarshit Aghera 74ecdfaaaSHarshit Aghera #include "NvidiaGpuDevice.hpp" 84ecdfaaaSHarshit Aghera 9*0a88826fSRohit PAI #include "Inventory.hpp" 104ecdfaaaSHarshit Aghera #include "NvidiaDeviceDiscovery.hpp" 114ecdfaaaSHarshit Aghera #include "NvidiaGpuSensor.hpp" 124ecdfaaaSHarshit Aghera #include "Thresholds.hpp" 134ecdfaaaSHarshit Aghera #include "Utils.hpp" 144ecdfaaaSHarshit Aghera 154ecdfaaaSHarshit Aghera #include <bits/basic_string.h> 164ecdfaaaSHarshit Aghera 174ecdfaaaSHarshit Aghera #include <MctpRequester.hpp> 18775199d2SHarshit Aghera #include <NvidiaGpuEnergySensor.hpp> 19*0a88826fSRohit PAI #include <NvidiaGpuMctpVdm.hpp> 20902c649bSHarshit Aghera #include <NvidiaGpuPowerSensor.hpp> 215e7deccdSHarshit Aghera #include <NvidiaGpuThresholds.hpp> 22bef4d418SHarshit Aghera #include <NvidiaGpuVoltageSensor.hpp> 234ecdfaaaSHarshit Aghera #include <boost/asio/io_context.hpp> 244ecdfaaaSHarshit Aghera #include <phosphor-logging/lg2.hpp> 254ecdfaaaSHarshit Aghera #include <sdbusplus/asio/connection.hpp> 264ecdfaaaSHarshit Aghera #include <sdbusplus/asio/object_server.hpp> 274ecdfaaaSHarshit Aghera 284ecdfaaaSHarshit Aghera #include <chrono> 294ecdfaaaSHarshit Aghera #include <cstdint> 305e7deccdSHarshit Aghera #include <functional> 314ecdfaaaSHarshit Aghera #include <memory> 324ecdfaaaSHarshit Aghera #include <string> 335e7deccdSHarshit Aghera #include <utility> 344ecdfaaaSHarshit Aghera #include <vector> 354ecdfaaaSHarshit Aghera 364ecdfaaaSHarshit Aghera GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name, 374ecdfaaaSHarshit Aghera const std::string& path, 384ecdfaaaSHarshit Aghera const std::shared_ptr<sdbusplus::asio::connection>& conn, 394ecdfaaaSHarshit Aghera uint8_t eid, boost::asio::io_context& io, 404ecdfaaaSHarshit Aghera mctp::MctpRequester& mctpRequester, 414ecdfaaaSHarshit Aghera sdbusplus::asio::object_server& objectServer) : 424ecdfaaaSHarshit Aghera eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}), 434ecdfaaaSHarshit Aghera waitTimer(io, std::chrono::steady_clock::duration(0)), 444ecdfaaaSHarshit Aghera mctpRequester(mctpRequester), conn(conn), objectServer(objectServer), 454ecdfaaaSHarshit Aghera configs(configs), name(escapeName(name)), path(path) 464ecdfaaaSHarshit Aghera { 47*0a88826fSRohit PAI inventory = std::make_shared<Inventory>( 48*0a88826fSRohit PAI conn, objectServer, name, gpu::DeviceIdentification::DEVICE_GPU); 494ecdfaaaSHarshit Aghera makeSensors(); 504ecdfaaaSHarshit Aghera } 514ecdfaaaSHarshit Aghera 524ecdfaaaSHarshit Aghera void GpuDevice::makeSensors() 534ecdfaaaSHarshit Aghera { 544ecdfaaaSHarshit Aghera tempSensor = std::make_shared<NvidiaGpuTempSensor>( 55ba138daeSHarshit Aghera conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId, 56ba138daeSHarshit Aghera objectServer, std::vector<thresholds::Threshold>{}); 57ba138daeSHarshit Aghera 585e7deccdSHarshit Aghera readThermalParameters( 595e7deccdSHarshit Aghera eid, 605e7deccdSHarshit Aghera std::vector<gpuThresholdId>{gpuTLimitWarnringThresholdId, 615e7deccdSHarshit Aghera gpuTLimitCriticalThresholdId, 625e7deccdSHarshit Aghera gpuTLimitHardshutDownThresholdId}, 635e7deccdSHarshit Aghera mctpRequester, 645e7deccdSHarshit Aghera std::bind_front(&GpuDevice::processTLimitThresholds, this)); 654ecdfaaaSHarshit Aghera 66b10a67b2SHarshit Aghera dramTempSensor = std::make_shared<NvidiaGpuTempSensor>( 67b10a67b2SHarshit Aghera conn, mctpRequester, name + "_DRAM_0_TEMP_0", path, eid, 68b10a67b2SHarshit Aghera gpuDramTempSensorId, objectServer, 69b10a67b2SHarshit Aghera std::vector<thresholds::Threshold>{thresholds::Threshold{ 70b10a67b2SHarshit Aghera thresholds::Level::CRITICAL, thresholds::Direction::HIGH, 95.0}}); 71b10a67b2SHarshit Aghera 72902c649bSHarshit Aghera powerSensor = std::make_shared<NvidiaGpuPowerSensor>( 73902c649bSHarshit Aghera conn, mctpRequester, name + "_Power_0", path, eid, gpuPowerSensorId, 74902c649bSHarshit Aghera objectServer, std::vector<thresholds::Threshold>{}); 75902c649bSHarshit Aghera 76775199d2SHarshit Aghera energySensor = std::make_shared<NvidiaGpuEnergySensor>( 77775199d2SHarshit Aghera conn, mctpRequester, name + "_Energy_0", path, eid, gpuEnergySensorId, 78775199d2SHarshit Aghera objectServer, std::vector<thresholds::Threshold>{}); 79775199d2SHarshit Aghera 80bef4d418SHarshit Aghera voltageSensor = std::make_shared<NvidiaGpuVoltageSensor>( 81bef4d418SHarshit Aghera conn, mctpRequester, name + "_Voltage_0", path, eid, gpuVoltageSensorId, 82bef4d418SHarshit Aghera objectServer, std::vector<thresholds::Threshold>{}); 83bef4d418SHarshit Aghera 844ecdfaaaSHarshit Aghera lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME", 854ecdfaaaSHarshit Aghera name, "PATH", path); 864ecdfaaaSHarshit Aghera 874ecdfaaaSHarshit Aghera read(); 884ecdfaaaSHarshit Aghera } 894ecdfaaaSHarshit Aghera 905e7deccdSHarshit Aghera void GpuDevice::processTLimitThresholds(uint8_t rc, 915e7deccdSHarshit Aghera const std::vector<int32_t>& thresholds) 925e7deccdSHarshit Aghera { 935e7deccdSHarshit Aghera std::vector<thresholds::Threshold> tLimitThresholds{}; 945e7deccdSHarshit Aghera if (rc == 0) 955e7deccdSHarshit Aghera { 965e7deccdSHarshit Aghera tLimitThresholds = { 975e7deccdSHarshit Aghera thresholds::Threshold{thresholds::Level::WARNING, 985e7deccdSHarshit Aghera thresholds::Direction::LOW, 995e7deccdSHarshit Aghera static_cast<double>(thresholds[0])}, 1005e7deccdSHarshit Aghera thresholds::Threshold{thresholds::Level::CRITICAL, 1015e7deccdSHarshit Aghera thresholds::Direction::LOW, 1025e7deccdSHarshit Aghera static_cast<double>(thresholds[1])}, 1035e7deccdSHarshit Aghera thresholds::Threshold{thresholds::Level::HARDSHUTDOWN, 1045e7deccdSHarshit Aghera thresholds::Direction::LOW, 1055e7deccdSHarshit Aghera static_cast<double>(thresholds[2])}}; 1065e7deccdSHarshit Aghera } 1075e7deccdSHarshit Aghera 1085e7deccdSHarshit Aghera tLimitSensor = std::make_shared<NvidiaGpuTempSensor>( 1095e7deccdSHarshit Aghera conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId, 1105e7deccdSHarshit Aghera objectServer, std::move(tLimitThresholds)); 1115e7deccdSHarshit Aghera } 1125e7deccdSHarshit Aghera 1134ecdfaaaSHarshit Aghera void GpuDevice::read() 1144ecdfaaaSHarshit Aghera { 1154ecdfaaaSHarshit Aghera tempSensor->update(); 1165e7deccdSHarshit Aghera if (tLimitSensor) 1175e7deccdSHarshit Aghera { 118ba138daeSHarshit Aghera tLimitSensor->update(); 1195e7deccdSHarshit Aghera } 120b10a67b2SHarshit Aghera dramTempSensor->update(); 121902c649bSHarshit Aghera powerSensor->update(); 122775199d2SHarshit Aghera energySensor->update(); 123bef4d418SHarshit Aghera voltageSensor->update(); 1244ecdfaaaSHarshit Aghera 1254ecdfaaaSHarshit Aghera waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs)); 1264ecdfaaaSHarshit Aghera waitTimer.async_wait([this](const boost::system::error_code& ec) { 1274ecdfaaaSHarshit Aghera if (ec) 1284ecdfaaaSHarshit Aghera { 1294ecdfaaaSHarshit Aghera return; 1304ecdfaaaSHarshit Aghera } 1314ecdfaaaSHarshit Aghera read(); 1324ecdfaaaSHarshit Aghera }); 1334ecdfaaaSHarshit Aghera } 134