xref: /openbmc/dbus-sensors/src/nvidia-gpu/NvidiaGpuDevice.cpp (revision b5e823f73897a8d47087d91f3f936dc07506a6e1)
1 /*
2  * SPDX-FileCopyrightText: Copyright OpenBMC Authors
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #include "NvidiaGpuDevice.hpp"
7 
8 #include "Thresholds.hpp"
9 #include "Utils.hpp"
10 
11 #include <Inventory.hpp>
12 #include <MctpRequester.hpp>
13 #include <NvidiaDeviceDiscovery.hpp>
14 #include <NvidiaGpuEnergySensor.hpp>
15 #include <NvidiaGpuMctpVdm.hpp>
16 #include <NvidiaGpuPowerPeakReading.hpp>
17 #include <NvidiaGpuPowerSensor.hpp>
18 #include <NvidiaGpuSensor.hpp>
19 #include <NvidiaGpuVoltageSensor.hpp>
20 #include <OcpMctpVdm.hpp>
21 #include <boost/asio/io_context.hpp>
22 #include <phosphor-logging/lg2.hpp>
23 #include <sdbusplus/asio/connection.hpp>
24 #include <sdbusplus/asio/object_server.hpp>
25 
26 #include <array>
27 #include <chrono>
28 #include <cstdint>
29 #include <functional>
30 #include <memory>
31 #include <span>
32 #include <string>
33 #include <system_error>
34 #include <utility>
35 #include <vector>
36 
// Parameter ids used when requesting the individual TLimit thresholds from
// the device (see getNextThermalParameter).
static constexpr uint8_t gpuTLimitCriticalThresholdId = 1;
static constexpr uint8_t gpuTLimitWarningThresholdId = 2;
static constexpr uint8_t gpuTLimitHardshutDownThresholdId = 4;

// Query order for the thresholds. The position in this table is also the
// position in the cached `thresholds` member, and processTLimitThresholds
// reads that cache as [0] = warning, [1] = critical, [2] = hard shutdown,
// so the two must be kept in sync.
static constexpr std::array<uint8_t, 3> thresholdIds = {
    gpuTLimitWarningThresholdId,
    gpuTLimitCriticalThresholdId,
    gpuTLimitHardshutDownThresholdId,
};
45 
GpuDevice(const SensorConfigs & configs,const std::string & name,const std::string & path,const std::shared_ptr<sdbusplus::asio::connection> & conn,uint8_t eid,boost::asio::io_context & io,mctp::MctpRequester & mctpRequester,sdbusplus::asio::object_server & objectServer)46 GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
47                      const std::string& path,
48                      const std::shared_ptr<sdbusplus::asio::connection>& conn,
49                      uint8_t eid, boost::asio::io_context& io,
50                      mctp::MctpRequester& mctpRequester,
51                      sdbusplus::asio::object_server& objectServer) :
52     eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
53     waitTimer(io, std::chrono::steady_clock::duration(0)),
54     mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
55     configs(configs), name(escapeName(name)), path(path)
56 {
57     inventory = std::make_shared<Inventory>(
58         conn, objectServer, name, mctpRequester,
59         gpu::DeviceIdentification::DEVICE_GPU, eid, io);
60 }
61 
init()62 void GpuDevice::init()
63 {
64     makeSensors();
65     inventory->init();
66 }
67 
makeSensors()68 void GpuDevice::makeSensors()
69 {
70     tempSensor = std::make_shared<NvidiaGpuTempSensor>(
71         conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
72         objectServer, std::vector<thresholds::Threshold>{});
73 
74     dramTempSensor = std::make_shared<NvidiaGpuTempSensor>(
75         conn, mctpRequester, name + "_DRAM_0_TEMP_0", path, eid,
76         gpuDramTempSensorId, objectServer,
77         std::vector<thresholds::Threshold>{thresholds::Threshold{
78             thresholds::Level::CRITICAL, thresholds::Direction::HIGH, 95.0}});
79 
80     powerSensor = std::make_shared<NvidiaGpuPowerSensor>(
81         conn, mctpRequester, name + "_Power_0", path, eid, gpuPowerSensorId,
82         objectServer, std::vector<thresholds::Threshold>{});
83 
84     peakPower = std::make_shared<NvidiaGpuPowerPeakReading>(
85         mctpRequester, name + "_Power_0", eid, gpuPeakPowerSensorId,
86         objectServer);
87 
88     energySensor = std::make_shared<NvidiaGpuEnergySensor>(
89         conn, mctpRequester, name + "_Energy_0", path, eid, gpuEnergySensorId,
90         objectServer, std::vector<thresholds::Threshold>{});
91 
92     voltageSensor = std::make_shared<NvidiaGpuVoltageSensor>(
93         conn, mctpRequester, name + "_Voltage_0", path, eid, gpuVoltageSensorId,
94         objectServer, std::vector<thresholds::Threshold>{});
95 
96     getTLimitThresholds();
97 
98     lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
99               name, "PATH", path);
100     read();
101 }
102 
getTLimitThresholds()103 void GpuDevice::getTLimitThresholds()
104 {
105     thresholds = {};
106     current_threshold_index = 0;
107     getNextThermalParameter();
108 }
109 
readThermalParameterCallback(const std::error_code & ec,std::span<const uint8_t> buffer)110 void GpuDevice::readThermalParameterCallback(const std::error_code& ec,
111                                              std::span<const uint8_t> buffer)
112 {
113     if (ec)
114     {
115         lg2::error(
116             "Error reading thermal parameter: sending message over MCTP failed, rc={RC}",
117             "RC", ec.message());
118         processTLimitThresholds(ec);
119         return;
120     }
121 
122     ocp::accelerator_management::CompletionCode cc{};
123     uint16_t reasonCode = 0;
124     int32_t threshold = 0;
125 
126     int rc = gpu::decodeReadThermalParametersResponse(buffer, cc, reasonCode,
127                                                       threshold);
128 
129     if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
130     {
131         lg2::error(
132             "Error reading thermal parameter: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
133             "RC", rc, "CC", cc, "RESC", reasonCode);
134         processTLimitThresholds(ec);
135         return;
136     }
137 
138     thresholds[current_threshold_index] = threshold;
139 
140     current_threshold_index++;
141 
142     if (current_threshold_index < thresholdIds.size())
143     {
144         getNextThermalParameter();
145         return;
146     }
147     processTLimitThresholds(std::error_code{});
148 }
149 
getNextThermalParameter()150 void GpuDevice::getNextThermalParameter()
151 {
152     uint8_t id = thresholdIds[current_threshold_index];
153     auto rc =
154         gpu::encodeReadThermalParametersRequest(0, id, thermalParamReqMsg);
155     if (rc != 0)
156     {
157         lg2::error(
158             "Error reading thermal parameter for eid {EID} and parameter id {PID} : encode failed. rc={RC}",
159             "EID", eid, "PID", id, "RC", rc);
160         processTLimitThresholds(
161             std::make_error_code(static_cast<std::errc>(rc)));
162         return;
163     }
164 
165     mctpRequester.sendRecvMsg(
166         eid, thermalParamReqMsg,
167         [weak{weak_from_this()}](const std::error_code& ec,
168                                  std::span<const uint8_t> buffer) {
169             std::shared_ptr<GpuDevice> self = weak.lock();
170             if (!self)
171             {
172                 lg2::error("Failed to get lock on GpuDevice");
173                 return;
174             }
175             self->readThermalParameterCallback(ec, buffer);
176         });
177 }
178 
processTLimitThresholds(const std::error_code & ec)179 void GpuDevice::processTLimitThresholds(const std::error_code& ec)
180 {
181     std::vector<thresholds::Threshold> tLimitThresholds{};
182     if (!ec)
183     {
184         tLimitThresholds = {
185             thresholds::Threshold{thresholds::Level::WARNING,
186                                   thresholds::Direction::LOW,
187                                   static_cast<double>(thresholds[0])},
188             thresholds::Threshold{thresholds::Level::CRITICAL,
189                                   thresholds::Direction::LOW,
190                                   static_cast<double>(thresholds[1])},
191             thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
192                                   thresholds::Direction::LOW,
193                                   static_cast<double>(thresholds[2])}};
194     }
195 
196     tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
197         conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId,
198         objectServer, std::move(tLimitThresholds));
199 }
200 
read()201 void GpuDevice::read()
202 {
203     tempSensor->update();
204     if (tLimitSensor)
205     {
206         tLimitSensor->update();
207     }
208     dramTempSensor->update();
209     powerSensor->update();
210     peakPower->update();
211     energySensor->update();
212     voltageSensor->update();
213 
214     waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
215     waitTimer.async_wait(
216         [weak{weak_from_this()}](const boost::system::error_code& ec) {
217             std::shared_ptr<GpuDevice> self = weak.lock();
218             if (!self)
219             {
220                 lg2::error("Invalid reference to GpuDevice");
221                 return;
222             }
223             if (ec)
224             {
225                 return;
226             }
227             self->read();
228         });
229 }
230