xref: /openbmc/dbus-sensors/src/nvidia-gpu/NvidiaGpuDevice.cpp (revision 5e7deccd14dcac790028a6641291cc019c1c4e52)
/*
 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
 * AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 */
64ecdfaaaSHarshit Aghera 
74ecdfaaaSHarshit Aghera #include "NvidiaGpuDevice.hpp"
84ecdfaaaSHarshit Aghera 
94ecdfaaaSHarshit Aghera #include "NvidiaDeviceDiscovery.hpp"
104ecdfaaaSHarshit Aghera #include "NvidiaGpuSensor.hpp"
114ecdfaaaSHarshit Aghera #include "Thresholds.hpp"
124ecdfaaaSHarshit Aghera #include "Utils.hpp"
134ecdfaaaSHarshit Aghera 
144ecdfaaaSHarshit Aghera #include <bits/basic_string.h>
154ecdfaaaSHarshit Aghera 
164ecdfaaaSHarshit Aghera #include <MctpRequester.hpp>
17*5e7deccdSHarshit Aghera #include <NvidiaGpuThresholds.hpp>
184ecdfaaaSHarshit Aghera #include <boost/asio/io_context.hpp>
194ecdfaaaSHarshit Aghera #include <phosphor-logging/lg2.hpp>
204ecdfaaaSHarshit Aghera #include <sdbusplus/asio/connection.hpp>
214ecdfaaaSHarshit Aghera #include <sdbusplus/asio/object_server.hpp>
224ecdfaaaSHarshit Aghera 
234ecdfaaaSHarshit Aghera #include <chrono>
244ecdfaaaSHarshit Aghera #include <cstdint>
25*5e7deccdSHarshit Aghera #include <functional>
264ecdfaaaSHarshit Aghera #include <memory>
274ecdfaaaSHarshit Aghera #include <string>
28*5e7deccdSHarshit Aghera #include <utility>
294ecdfaaaSHarshit Aghera #include <vector>
304ecdfaaaSHarshit Aghera 
314ecdfaaaSHarshit Aghera GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
324ecdfaaaSHarshit Aghera                      const std::string& path,
334ecdfaaaSHarshit Aghera                      const std::shared_ptr<sdbusplus::asio::connection>& conn,
344ecdfaaaSHarshit Aghera                      uint8_t eid, boost::asio::io_context& io,
354ecdfaaaSHarshit Aghera                      mctp::MctpRequester& mctpRequester,
364ecdfaaaSHarshit Aghera                      sdbusplus::asio::object_server& objectServer) :
374ecdfaaaSHarshit Aghera     eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
384ecdfaaaSHarshit Aghera     waitTimer(io, std::chrono::steady_clock::duration(0)),
394ecdfaaaSHarshit Aghera     mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
404ecdfaaaSHarshit Aghera     configs(configs), name(escapeName(name)), path(path)
414ecdfaaaSHarshit Aghera {
424ecdfaaaSHarshit Aghera     makeSensors();
434ecdfaaaSHarshit Aghera }
444ecdfaaaSHarshit Aghera 
454ecdfaaaSHarshit Aghera void GpuDevice::makeSensors()
464ecdfaaaSHarshit Aghera {
474ecdfaaaSHarshit Aghera     tempSensor = std::make_shared<NvidiaGpuTempSensor>(
48ba138daeSHarshit Aghera         conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
49ba138daeSHarshit Aghera         objectServer, std::vector<thresholds::Threshold>{});
50ba138daeSHarshit Aghera 
51*5e7deccdSHarshit Aghera     readThermalParameters(
52*5e7deccdSHarshit Aghera         eid,
53*5e7deccdSHarshit Aghera         std::vector<gpuThresholdId>{gpuTLimitWarnringThresholdId,
54*5e7deccdSHarshit Aghera                                     gpuTLimitCriticalThresholdId,
55*5e7deccdSHarshit Aghera                                     gpuTLimitHardshutDownThresholdId},
56*5e7deccdSHarshit Aghera         mctpRequester,
57*5e7deccdSHarshit Aghera         std::bind_front(&GpuDevice::processTLimitThresholds, this));
584ecdfaaaSHarshit Aghera 
594ecdfaaaSHarshit Aghera     lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
604ecdfaaaSHarshit Aghera               name, "PATH", path);
614ecdfaaaSHarshit Aghera 
624ecdfaaaSHarshit Aghera     read();
634ecdfaaaSHarshit Aghera }
644ecdfaaaSHarshit Aghera 
65*5e7deccdSHarshit Aghera void GpuDevice::processTLimitThresholds(uint8_t rc,
66*5e7deccdSHarshit Aghera                                         const std::vector<int32_t>& thresholds)
67*5e7deccdSHarshit Aghera {
68*5e7deccdSHarshit Aghera     std::vector<thresholds::Threshold> tLimitThresholds{};
69*5e7deccdSHarshit Aghera     if (rc == 0)
70*5e7deccdSHarshit Aghera     {
71*5e7deccdSHarshit Aghera         tLimitThresholds = {
72*5e7deccdSHarshit Aghera             thresholds::Threshold{thresholds::Level::WARNING,
73*5e7deccdSHarshit Aghera                                   thresholds::Direction::LOW,
74*5e7deccdSHarshit Aghera                                   static_cast<double>(thresholds[0])},
75*5e7deccdSHarshit Aghera             thresholds::Threshold{thresholds::Level::CRITICAL,
76*5e7deccdSHarshit Aghera                                   thresholds::Direction::LOW,
77*5e7deccdSHarshit Aghera                                   static_cast<double>(thresholds[1])},
78*5e7deccdSHarshit Aghera             thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
79*5e7deccdSHarshit Aghera                                   thresholds::Direction::LOW,
80*5e7deccdSHarshit Aghera                                   static_cast<double>(thresholds[2])}};
81*5e7deccdSHarshit Aghera     }
82*5e7deccdSHarshit Aghera 
83*5e7deccdSHarshit Aghera     tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
84*5e7deccdSHarshit Aghera         conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId,
85*5e7deccdSHarshit Aghera         objectServer, std::move(tLimitThresholds));
86*5e7deccdSHarshit Aghera }
87*5e7deccdSHarshit Aghera 
884ecdfaaaSHarshit Aghera void GpuDevice::read()
894ecdfaaaSHarshit Aghera {
904ecdfaaaSHarshit Aghera     tempSensor->update();
91*5e7deccdSHarshit Aghera     if (tLimitSensor)
92*5e7deccdSHarshit Aghera     {
93ba138daeSHarshit Aghera         tLimitSensor->update();
94*5e7deccdSHarshit Aghera     }
954ecdfaaaSHarshit Aghera 
964ecdfaaaSHarshit Aghera     waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
974ecdfaaaSHarshit Aghera     waitTimer.async_wait([this](const boost::system::error_code& ec) {
984ecdfaaaSHarshit Aghera         if (ec)
994ecdfaaaSHarshit Aghera         {
1004ecdfaaaSHarshit Aghera             return;
1014ecdfaaaSHarshit Aghera         }
1024ecdfaaaSHarshit Aghera         read();
1034ecdfaaaSHarshit Aghera     });
1044ecdfaaaSHarshit Aghera }
105