xref: /openbmc/dbus-sensors/src/nvidia-gpu/NvidiaGpuDevice.cpp (revision ada6baa945fbd1bd1969facb1d7dc2fc12453f2e)
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
3  * AFFILIATES. All rights reserved.
4  * SPDX-License-Identifier: Apache-2.0
5  */
6 
7 #include "NvidiaGpuDevice.hpp"
8 
9 #include "Inventory.hpp"
10 #include "NvidiaDeviceDiscovery.hpp"
11 #include "NvidiaGpuSensor.hpp"
12 #include "Thresholds.hpp"
13 #include "Utils.hpp"
14 
15 #include <bits/basic_string.h>
16 
17 #include <MctpRequester.hpp>
18 #include <NvidiaGpuEnergySensor.hpp>
19 #include <NvidiaGpuMctpVdm.hpp>
20 #include <NvidiaGpuPowerSensor.hpp>
21 #include <NvidiaGpuThresholds.hpp>
22 #include <NvidiaGpuVoltageSensor.hpp>
23 #include <boost/asio/io_context.hpp>
24 #include <phosphor-logging/lg2.hpp>
25 #include <sdbusplus/asio/connection.hpp>
26 #include <sdbusplus/asio/object_server.hpp>
27 
28 #include <chrono>
29 #include <cstdint>
30 #include <functional>
31 #include <memory>
32 #include <string>
33 #include <utility>
34 #include <vector>
35 
GpuDevice(const SensorConfigs & configs,const std::string & name,const std::string & path,const std::shared_ptr<sdbusplus::asio::connection> & conn,uint8_t eid,boost::asio::io_context & io,mctp::MctpRequester & mctpRequester,sdbusplus::asio::object_server & objectServer)36 GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
37                      const std::string& path,
38                      const std::shared_ptr<sdbusplus::asio::connection>& conn,
39                      uint8_t eid, boost::asio::io_context& io,
40                      mctp::MctpRequester& mctpRequester,
41                      sdbusplus::asio::object_server& objectServer) :
42     eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
43     waitTimer(io, std::chrono::steady_clock::duration(0)),
44     mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
45     configs(configs), name(escapeName(name)), path(path)
46 {
47     inventory = std::make_shared<Inventory>(
48         conn, objectServer, name, mctpRequester,
49         gpu::DeviceIdentification::DEVICE_GPU, eid, io);
50     makeSensors();
51 }
52 
makeSensors()53 void GpuDevice::makeSensors()
54 {
55     tempSensor = std::make_shared<NvidiaGpuTempSensor>(
56         conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
57         objectServer, std::vector<thresholds::Threshold>{});
58 
59     readThermalParameters(
60         eid,
61         std::vector<gpuThresholdId>{gpuTLimitWarnringThresholdId,
62                                     gpuTLimitCriticalThresholdId,
63                                     gpuTLimitHardshutDownThresholdId},
64         mctpRequester,
65         std::bind_front(&GpuDevice::processTLimitThresholds, this));
66 
67     dramTempSensor = std::make_shared<NvidiaGpuTempSensor>(
68         conn, mctpRequester, name + "_DRAM_0_TEMP_0", path, eid,
69         gpuDramTempSensorId, objectServer,
70         std::vector<thresholds::Threshold>{thresholds::Threshold{
71             thresholds::Level::CRITICAL, thresholds::Direction::HIGH, 95.0}});
72 
73     powerSensor = std::make_shared<NvidiaGpuPowerSensor>(
74         conn, mctpRequester, name + "_Power_0", path, eid, gpuPowerSensorId,
75         objectServer, std::vector<thresholds::Threshold>{});
76 
77     energySensor = std::make_shared<NvidiaGpuEnergySensor>(
78         conn, mctpRequester, name + "_Energy_0", path, eid, gpuEnergySensorId,
79         objectServer, std::vector<thresholds::Threshold>{});
80 
81     voltageSensor = std::make_shared<NvidiaGpuVoltageSensor>(
82         conn, mctpRequester, name + "_Voltage_0", path, eid, gpuVoltageSensorId,
83         objectServer, std::vector<thresholds::Threshold>{});
84 
85     lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
86               name, "PATH", path);
87 
88     read();
89 }
90 
processTLimitThresholds(uint8_t rc,const std::vector<int32_t> & thresholds)91 void GpuDevice::processTLimitThresholds(uint8_t rc,
92                                         const std::vector<int32_t>& thresholds)
93 {
94     std::vector<thresholds::Threshold> tLimitThresholds{};
95     if (rc == 0)
96     {
97         tLimitThresholds = {
98             thresholds::Threshold{thresholds::Level::WARNING,
99                                   thresholds::Direction::LOW,
100                                   static_cast<double>(thresholds[0])},
101             thresholds::Threshold{thresholds::Level::CRITICAL,
102                                   thresholds::Direction::LOW,
103                                   static_cast<double>(thresholds[1])},
104             thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
105                                   thresholds::Direction::LOW,
106                                   static_cast<double>(thresholds[2])}};
107     }
108 
109     tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
110         conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId,
111         objectServer, std::move(tLimitThresholds));
112 }
113 
read()114 void GpuDevice::read()
115 {
116     tempSensor->update();
117     if (tLimitSensor)
118     {
119         tLimitSensor->update();
120     }
121     dramTempSensor->update();
122     powerSensor->update();
123     energySensor->update();
124     voltageSensor->update();
125 
126     waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
127     waitTimer.async_wait([this](const boost::system::error_code& ec) {
128         if (ec)
129         {
130             return;
131         }
132         read();
133     });
134 }
135