xref: /openbmc/dbus-sensors/src/nvidia-gpu/NvidiaGpuDevice.cpp (revision 0a88826ff30c07812a089331486e4eb68e8386cb)
14ecdfaaaSHarshit Aghera /*
24ecdfaaaSHarshit Aghera  * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
34ecdfaaaSHarshit Aghera  * AFFILIATES. All rights reserved.
44ecdfaaaSHarshit Aghera  * SPDX-License-Identifier: Apache-2.0
54ecdfaaaSHarshit Aghera  */
64ecdfaaaSHarshit Aghera 
74ecdfaaaSHarshit Aghera #include "NvidiaGpuDevice.hpp"
84ecdfaaaSHarshit Aghera 
9*0a88826fSRohit PAI #include "Inventory.hpp"
104ecdfaaaSHarshit Aghera #include "NvidiaDeviceDiscovery.hpp"
114ecdfaaaSHarshit Aghera #include "NvidiaGpuSensor.hpp"
124ecdfaaaSHarshit Aghera #include "Thresholds.hpp"
134ecdfaaaSHarshit Aghera #include "Utils.hpp"
144ecdfaaaSHarshit Aghera 
154ecdfaaaSHarshit Aghera #include <bits/basic_string.h>
164ecdfaaaSHarshit Aghera 
174ecdfaaaSHarshit Aghera #include <MctpRequester.hpp>
18775199d2SHarshit Aghera #include <NvidiaGpuEnergySensor.hpp>
19*0a88826fSRohit PAI #include <NvidiaGpuMctpVdm.hpp>
20902c649bSHarshit Aghera #include <NvidiaGpuPowerSensor.hpp>
215e7deccdSHarshit Aghera #include <NvidiaGpuThresholds.hpp>
22bef4d418SHarshit Aghera #include <NvidiaGpuVoltageSensor.hpp>
234ecdfaaaSHarshit Aghera #include <boost/asio/io_context.hpp>
244ecdfaaaSHarshit Aghera #include <phosphor-logging/lg2.hpp>
254ecdfaaaSHarshit Aghera #include <sdbusplus/asio/connection.hpp>
264ecdfaaaSHarshit Aghera #include <sdbusplus/asio/object_server.hpp>
274ecdfaaaSHarshit Aghera 
284ecdfaaaSHarshit Aghera #include <chrono>
294ecdfaaaSHarshit Aghera #include <cstdint>
305e7deccdSHarshit Aghera #include <functional>
314ecdfaaaSHarshit Aghera #include <memory>
324ecdfaaaSHarshit Aghera #include <string>
335e7deccdSHarshit Aghera #include <utility>
344ecdfaaaSHarshit Aghera #include <vector>
354ecdfaaaSHarshit Aghera 
364ecdfaaaSHarshit Aghera GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
374ecdfaaaSHarshit Aghera                      const std::string& path,
384ecdfaaaSHarshit Aghera                      const std::shared_ptr<sdbusplus::asio::connection>& conn,
394ecdfaaaSHarshit Aghera                      uint8_t eid, boost::asio::io_context& io,
404ecdfaaaSHarshit Aghera                      mctp::MctpRequester& mctpRequester,
414ecdfaaaSHarshit Aghera                      sdbusplus::asio::object_server& objectServer) :
424ecdfaaaSHarshit Aghera     eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
434ecdfaaaSHarshit Aghera     waitTimer(io, std::chrono::steady_clock::duration(0)),
444ecdfaaaSHarshit Aghera     mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
454ecdfaaaSHarshit Aghera     configs(configs), name(escapeName(name)), path(path)
464ecdfaaaSHarshit Aghera {
47*0a88826fSRohit PAI     inventory = std::make_shared<Inventory>(
48*0a88826fSRohit PAI         conn, objectServer, name, gpu::DeviceIdentification::DEVICE_GPU);
494ecdfaaaSHarshit Aghera     makeSensors();
504ecdfaaaSHarshit Aghera }
514ecdfaaaSHarshit Aghera 
524ecdfaaaSHarshit Aghera void GpuDevice::makeSensors()
534ecdfaaaSHarshit Aghera {
544ecdfaaaSHarshit Aghera     tempSensor = std::make_shared<NvidiaGpuTempSensor>(
55ba138daeSHarshit Aghera         conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
56ba138daeSHarshit Aghera         objectServer, std::vector<thresholds::Threshold>{});
57ba138daeSHarshit Aghera 
585e7deccdSHarshit Aghera     readThermalParameters(
595e7deccdSHarshit Aghera         eid,
605e7deccdSHarshit Aghera         std::vector<gpuThresholdId>{gpuTLimitWarnringThresholdId,
615e7deccdSHarshit Aghera                                     gpuTLimitCriticalThresholdId,
625e7deccdSHarshit Aghera                                     gpuTLimitHardshutDownThresholdId},
635e7deccdSHarshit Aghera         mctpRequester,
645e7deccdSHarshit Aghera         std::bind_front(&GpuDevice::processTLimitThresholds, this));
654ecdfaaaSHarshit Aghera 
66b10a67b2SHarshit Aghera     dramTempSensor = std::make_shared<NvidiaGpuTempSensor>(
67b10a67b2SHarshit Aghera         conn, mctpRequester, name + "_DRAM_0_TEMP_0", path, eid,
68b10a67b2SHarshit Aghera         gpuDramTempSensorId, objectServer,
69b10a67b2SHarshit Aghera         std::vector<thresholds::Threshold>{thresholds::Threshold{
70b10a67b2SHarshit Aghera             thresholds::Level::CRITICAL, thresholds::Direction::HIGH, 95.0}});
71b10a67b2SHarshit Aghera 
72902c649bSHarshit Aghera     powerSensor = std::make_shared<NvidiaGpuPowerSensor>(
73902c649bSHarshit Aghera         conn, mctpRequester, name + "_Power_0", path, eid, gpuPowerSensorId,
74902c649bSHarshit Aghera         objectServer, std::vector<thresholds::Threshold>{});
75902c649bSHarshit Aghera 
76775199d2SHarshit Aghera     energySensor = std::make_shared<NvidiaGpuEnergySensor>(
77775199d2SHarshit Aghera         conn, mctpRequester, name + "_Energy_0", path, eid, gpuEnergySensorId,
78775199d2SHarshit Aghera         objectServer, std::vector<thresholds::Threshold>{});
79775199d2SHarshit Aghera 
80bef4d418SHarshit Aghera     voltageSensor = std::make_shared<NvidiaGpuVoltageSensor>(
81bef4d418SHarshit Aghera         conn, mctpRequester, name + "_Voltage_0", path, eid, gpuVoltageSensorId,
82bef4d418SHarshit Aghera         objectServer, std::vector<thresholds::Threshold>{});
83bef4d418SHarshit Aghera 
844ecdfaaaSHarshit Aghera     lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
854ecdfaaaSHarshit Aghera               name, "PATH", path);
864ecdfaaaSHarshit Aghera 
874ecdfaaaSHarshit Aghera     read();
884ecdfaaaSHarshit Aghera }
894ecdfaaaSHarshit Aghera 
905e7deccdSHarshit Aghera void GpuDevice::processTLimitThresholds(uint8_t rc,
915e7deccdSHarshit Aghera                                         const std::vector<int32_t>& thresholds)
925e7deccdSHarshit Aghera {
935e7deccdSHarshit Aghera     std::vector<thresholds::Threshold> tLimitThresholds{};
945e7deccdSHarshit Aghera     if (rc == 0)
955e7deccdSHarshit Aghera     {
965e7deccdSHarshit Aghera         tLimitThresholds = {
975e7deccdSHarshit Aghera             thresholds::Threshold{thresholds::Level::WARNING,
985e7deccdSHarshit Aghera                                   thresholds::Direction::LOW,
995e7deccdSHarshit Aghera                                   static_cast<double>(thresholds[0])},
1005e7deccdSHarshit Aghera             thresholds::Threshold{thresholds::Level::CRITICAL,
1015e7deccdSHarshit Aghera                                   thresholds::Direction::LOW,
1025e7deccdSHarshit Aghera                                   static_cast<double>(thresholds[1])},
1035e7deccdSHarshit Aghera             thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
1045e7deccdSHarshit Aghera                                   thresholds::Direction::LOW,
1055e7deccdSHarshit Aghera                                   static_cast<double>(thresholds[2])}};
1065e7deccdSHarshit Aghera     }
1075e7deccdSHarshit Aghera 
1085e7deccdSHarshit Aghera     tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
1095e7deccdSHarshit Aghera         conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId,
1105e7deccdSHarshit Aghera         objectServer, std::move(tLimitThresholds));
1115e7deccdSHarshit Aghera }
1125e7deccdSHarshit Aghera 
1134ecdfaaaSHarshit Aghera void GpuDevice::read()
1144ecdfaaaSHarshit Aghera {
1154ecdfaaaSHarshit Aghera     tempSensor->update();
1165e7deccdSHarshit Aghera     if (tLimitSensor)
1175e7deccdSHarshit Aghera     {
118ba138daeSHarshit Aghera         tLimitSensor->update();
1195e7deccdSHarshit Aghera     }
120b10a67b2SHarshit Aghera     dramTempSensor->update();
121902c649bSHarshit Aghera     powerSensor->update();
122775199d2SHarshit Aghera     energySensor->update();
123bef4d418SHarshit Aghera     voltageSensor->update();
1244ecdfaaaSHarshit Aghera 
1254ecdfaaaSHarshit Aghera     waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
1264ecdfaaaSHarshit Aghera     waitTimer.async_wait([this](const boost::system::error_code& ec) {
1274ecdfaaaSHarshit Aghera         if (ec)
1284ecdfaaaSHarshit Aghera         {
1294ecdfaaaSHarshit Aghera             return;
1304ecdfaaaSHarshit Aghera         }
1314ecdfaaaSHarshit Aghera         read();
1324ecdfaaaSHarshit Aghera     });
1334ecdfaaaSHarshit Aghera }
134