/*
 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
 * AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 */
64ecdfaaaSHarshit Aghera
74ecdfaaaSHarshit Aghera #include "NvidiaGpuDevice.hpp"
84ecdfaaaSHarshit Aghera
90a88826fSRohit PAI #include "Inventory.hpp"
104ecdfaaaSHarshit Aghera #include "NvidiaDeviceDiscovery.hpp"
114ecdfaaaSHarshit Aghera #include "NvidiaGpuSensor.hpp"
124ecdfaaaSHarshit Aghera #include "Thresholds.hpp"
134ecdfaaaSHarshit Aghera #include "Utils.hpp"
144ecdfaaaSHarshit Aghera
154ecdfaaaSHarshit Aghera #include <bits/basic_string.h>
164ecdfaaaSHarshit Aghera
174ecdfaaaSHarshit Aghera #include <MctpRequester.hpp>
18775199d2SHarshit Aghera #include <NvidiaGpuEnergySensor.hpp>
190a88826fSRohit PAI #include <NvidiaGpuMctpVdm.hpp>
20902c649bSHarshit Aghera #include <NvidiaGpuPowerSensor.hpp>
215e7deccdSHarshit Aghera #include <NvidiaGpuThresholds.hpp>
22bef4d418SHarshit Aghera #include <NvidiaGpuVoltageSensor.hpp>
234ecdfaaaSHarshit Aghera #include <boost/asio/io_context.hpp>
244ecdfaaaSHarshit Aghera #include <phosphor-logging/lg2.hpp>
254ecdfaaaSHarshit Aghera #include <sdbusplus/asio/connection.hpp>
264ecdfaaaSHarshit Aghera #include <sdbusplus/asio/object_server.hpp>
274ecdfaaaSHarshit Aghera
284ecdfaaaSHarshit Aghera #include <chrono>
294ecdfaaaSHarshit Aghera #include <cstdint>
305e7deccdSHarshit Aghera #include <functional>
314ecdfaaaSHarshit Aghera #include <memory>
324ecdfaaaSHarshit Aghera #include <string>
335e7deccdSHarshit Aghera #include <utility>
344ecdfaaaSHarshit Aghera #include <vector>
354ecdfaaaSHarshit Aghera
GpuDevice(const SensorConfigs & configs,const std::string & name,const std::string & path,const std::shared_ptr<sdbusplus::asio::connection> & conn,uint8_t eid,boost::asio::io_context & io,mctp::MctpRequester & mctpRequester,sdbusplus::asio::object_server & objectServer)364ecdfaaaSHarshit Aghera GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
374ecdfaaaSHarshit Aghera const std::string& path,
384ecdfaaaSHarshit Aghera const std::shared_ptr<sdbusplus::asio::connection>& conn,
394ecdfaaaSHarshit Aghera uint8_t eid, boost::asio::io_context& io,
404ecdfaaaSHarshit Aghera mctp::MctpRequester& mctpRequester,
414ecdfaaaSHarshit Aghera sdbusplus::asio::object_server& objectServer) :
424ecdfaaaSHarshit Aghera eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
434ecdfaaaSHarshit Aghera waitTimer(io, std::chrono::steady_clock::duration(0)),
444ecdfaaaSHarshit Aghera mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
454ecdfaaaSHarshit Aghera configs(configs), name(escapeName(name)), path(path)
464ecdfaaaSHarshit Aghera {
470a88826fSRohit PAI inventory = std::make_shared<Inventory>(
48*ada6baa9SRohit PAI conn, objectServer, name, mctpRequester,
49*ada6baa9SRohit PAI gpu::DeviceIdentification::DEVICE_GPU, eid, io);
504ecdfaaaSHarshit Aghera makeSensors();
514ecdfaaaSHarshit Aghera }
524ecdfaaaSHarshit Aghera
makeSensors()534ecdfaaaSHarshit Aghera void GpuDevice::makeSensors()
544ecdfaaaSHarshit Aghera {
554ecdfaaaSHarshit Aghera tempSensor = std::make_shared<NvidiaGpuTempSensor>(
56ba138daeSHarshit Aghera conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
57ba138daeSHarshit Aghera objectServer, std::vector<thresholds::Threshold>{});
58ba138daeSHarshit Aghera
595e7deccdSHarshit Aghera readThermalParameters(
605e7deccdSHarshit Aghera eid,
615e7deccdSHarshit Aghera std::vector<gpuThresholdId>{gpuTLimitWarnringThresholdId,
625e7deccdSHarshit Aghera gpuTLimitCriticalThresholdId,
635e7deccdSHarshit Aghera gpuTLimitHardshutDownThresholdId},
645e7deccdSHarshit Aghera mctpRequester,
655e7deccdSHarshit Aghera std::bind_front(&GpuDevice::processTLimitThresholds, this));
664ecdfaaaSHarshit Aghera
67b10a67b2SHarshit Aghera dramTempSensor = std::make_shared<NvidiaGpuTempSensor>(
68b10a67b2SHarshit Aghera conn, mctpRequester, name + "_DRAM_0_TEMP_0", path, eid,
69b10a67b2SHarshit Aghera gpuDramTempSensorId, objectServer,
70b10a67b2SHarshit Aghera std::vector<thresholds::Threshold>{thresholds::Threshold{
71b10a67b2SHarshit Aghera thresholds::Level::CRITICAL, thresholds::Direction::HIGH, 95.0}});
72b10a67b2SHarshit Aghera
73902c649bSHarshit Aghera powerSensor = std::make_shared<NvidiaGpuPowerSensor>(
74902c649bSHarshit Aghera conn, mctpRequester, name + "_Power_0", path, eid, gpuPowerSensorId,
75902c649bSHarshit Aghera objectServer, std::vector<thresholds::Threshold>{});
76902c649bSHarshit Aghera
77775199d2SHarshit Aghera energySensor = std::make_shared<NvidiaGpuEnergySensor>(
78775199d2SHarshit Aghera conn, mctpRequester, name + "_Energy_0", path, eid, gpuEnergySensorId,
79775199d2SHarshit Aghera objectServer, std::vector<thresholds::Threshold>{});
80775199d2SHarshit Aghera
81bef4d418SHarshit Aghera voltageSensor = std::make_shared<NvidiaGpuVoltageSensor>(
82bef4d418SHarshit Aghera conn, mctpRequester, name + "_Voltage_0", path, eid, gpuVoltageSensorId,
83bef4d418SHarshit Aghera objectServer, std::vector<thresholds::Threshold>{});
84bef4d418SHarshit Aghera
854ecdfaaaSHarshit Aghera lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
864ecdfaaaSHarshit Aghera name, "PATH", path);
874ecdfaaaSHarshit Aghera
884ecdfaaaSHarshit Aghera read();
894ecdfaaaSHarshit Aghera }
904ecdfaaaSHarshit Aghera
processTLimitThresholds(uint8_t rc,const std::vector<int32_t> & thresholds)915e7deccdSHarshit Aghera void GpuDevice::processTLimitThresholds(uint8_t rc,
925e7deccdSHarshit Aghera const std::vector<int32_t>& thresholds)
935e7deccdSHarshit Aghera {
945e7deccdSHarshit Aghera std::vector<thresholds::Threshold> tLimitThresholds{};
955e7deccdSHarshit Aghera if (rc == 0)
965e7deccdSHarshit Aghera {
975e7deccdSHarshit Aghera tLimitThresholds = {
985e7deccdSHarshit Aghera thresholds::Threshold{thresholds::Level::WARNING,
995e7deccdSHarshit Aghera thresholds::Direction::LOW,
1005e7deccdSHarshit Aghera static_cast<double>(thresholds[0])},
1015e7deccdSHarshit Aghera thresholds::Threshold{thresholds::Level::CRITICAL,
1025e7deccdSHarshit Aghera thresholds::Direction::LOW,
1035e7deccdSHarshit Aghera static_cast<double>(thresholds[1])},
1045e7deccdSHarshit Aghera thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
1055e7deccdSHarshit Aghera thresholds::Direction::LOW,
1065e7deccdSHarshit Aghera static_cast<double>(thresholds[2])}};
1075e7deccdSHarshit Aghera }
1085e7deccdSHarshit Aghera
1095e7deccdSHarshit Aghera tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
1105e7deccdSHarshit Aghera conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId,
1115e7deccdSHarshit Aghera objectServer, std::move(tLimitThresholds));
1125e7deccdSHarshit Aghera }
1135e7deccdSHarshit Aghera
read()1144ecdfaaaSHarshit Aghera void GpuDevice::read()
1154ecdfaaaSHarshit Aghera {
1164ecdfaaaSHarshit Aghera tempSensor->update();
1175e7deccdSHarshit Aghera if (tLimitSensor)
1185e7deccdSHarshit Aghera {
119ba138daeSHarshit Aghera tLimitSensor->update();
1205e7deccdSHarshit Aghera }
121b10a67b2SHarshit Aghera dramTempSensor->update();
122902c649bSHarshit Aghera powerSensor->update();
123775199d2SHarshit Aghera energySensor->update();
124bef4d418SHarshit Aghera voltageSensor->update();
1254ecdfaaaSHarshit Aghera
1264ecdfaaaSHarshit Aghera waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
1274ecdfaaaSHarshit Aghera waitTimer.async_wait([this](const boost::system::error_code& ec) {
1284ecdfaaaSHarshit Aghera if (ec)
1294ecdfaaaSHarshit Aghera {
1304ecdfaaaSHarshit Aghera return;
1314ecdfaaaSHarshit Aghera }
1324ecdfaaaSHarshit Aghera read();
1334ecdfaaaSHarshit Aghera });
1344ecdfaaaSHarshit Aghera }
135