/*
 * SPDX-FileCopyrightText: Copyright OpenBMC Authors
 * SPDX-License-Identifier: Apache-2.0
 */

#include "NvidiaGpuDevice.hpp"

#include "Thresholds.hpp"
#include "Utils.hpp"

#include <Inventory.hpp>
#include <MctpRequester.hpp>
#include <NvidiaDeviceDiscovery.hpp>
#include <NvidiaGpuEnergySensor.hpp>
#include <NvidiaGpuMctpVdm.hpp>
#include <NvidiaGpuPowerPeakReading.hpp>
#include <NvidiaGpuPowerSensor.hpp>
#include <NvidiaGpuSensor.hpp>
#include <NvidiaGpuVoltageSensor.hpp>
#include <OcpMctpVdm.hpp>
#include <boost/asio/io_context.hpp>
#include <phosphor-logging/lg2.hpp>
#include <sdbusplus/asio/connection.hpp>
#include <sdbusplus/asio/object_server.hpp>

#include <array>
#include <chrono>
#include <cstdint>
#include <functional>
#include <memory>
#include <span>
#include <string>
#include <system_error>
#include <utility>
#include <vector>

static constexpr uint8_t gpuTLimitCriticalThresholdId{1};
static constexpr uint8_t gpuTLimitWarningThresholdId{2};
static constexpr uint8_t gpuTLimitHardshutDownThresholdId{4};

// nota bene: the order has to match the order in processTLimitThresholds
static constexpr std::array<uint8_t, 3> thresholdIds{
    gpuTLimitWarningThresholdId, gpuTLimitCriticalThresholdId,
    gpuTLimitHardshutDownThresholdId};

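// Only wires up dependencies and creates the Inventory object here; sensor
// creation is deferred to init(), since the MCTP and timer callbacks capture
// weak_from_this(), which is only usable once the device is owned by a
// shared_ptr.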
GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
                     const std::string& path,
                     const std::shared_ptr<sdbusplus::asio::connection>& conn,
                     uint8_t eid, boost::asio::io_context& io,
                     mctp::MctpRequester& mctpRequester,
                     sdbusplus::asio::object_server& objectServer) :
    eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
    waitTimer(io, std::chrono::steady_clock::duration(0)),
    mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
    configs(configs), name(escapeName(name)), path(path)
{
    inventory = std::make_shared<Inventory>(
        conn, objectServer, name, mctpRequester,
        gpu::DeviceIdentification::DEVICE_GPU, eid, io);
}

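// Creates the D-Bus sensors and publishes the inventory once the device
// object is fully constructed.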
void GpuDevice::init()
{
    makeSensors();
    inventory->init();
}

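// Instantiates the temperature, power, energy and voltage sensors for this
// GPU, kicks off the TLimit threshold reads, and starts the polling loop.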
void GpuDevice::makeSensors()
{
    tempSensor = std::make_shared<NvidiaGpuTempSensor>(
        conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
        objectServer, std::vector<thresholds::Threshold>{});

    dramTempSensor = std::make_shared<NvidiaGpuTempSensor>(
        conn, mctpRequester, name + "_DRAM_0_TEMP_0", path, eid,
        gpuDramTempSensorId, objectServer,
        std::vector<thresholds::Threshold>{thresholds::Threshold{
            thresholds::Level::CRITICAL, thresholds::Direction::HIGH, 95.0}});

    powerSensor = std::make_shared<NvidiaGpuPowerSensor>(
        conn, mctpRequester, name + "_Power_0", path, eid, gpuPowerSensorId,
        objectServer, std::vector<thresholds::Threshold>{});

    peakPower = std::make_shared<NvidiaGpuPowerPeakReading>(
        mctpRequester, name + "_Power_0", eid, gpuPeakPowerSensorId,
        objectServer);

    energySensor = std::make_shared<NvidiaGpuEnergySensor>(
        conn, mctpRequester, name + "_Energy_0", path, eid, gpuEnergySensorId,
        objectServer, std::vector<thresholds::Threshold>{});

    voltageSensor = std::make_shared<NvidiaGpuVoltageSensor>(
        conn, mctpRequester, name + "_Voltage_0", path, eid, gpuVoltageSensorId,
        objectServer, std::vector<thresholds::Threshold>{});

    getTLimitThresholds();

    lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
              name, "PATH", path);
    read();
}

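// Starts the chain of reads for the thermal parameters listed in
// thresholdIds; the TLimit sensor itself is created once all reads complete
// (or fail) in processTLimitThresholds().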
void GpuDevice::getTLimitThresholds()
{
    thresholds = {};
    current_threshold_index = 0;
    getNextThermalParameter();
}

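// Handles the response for a single thermal parameter read. On success the
// value is stored and the next parameter is requested; on any failure
// (transport or decode) the TLimit sensor is created without thresholds.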
void GpuDevice::readThermalParameterCallback(const std::error_code& ec,
                                             std::span<const uint8_t> buffer)
{
    if (ec)
    {
        lg2::error(
            "Error reading thermal parameter: sending message over MCTP failed, rc={RC}",
            "RC", ec.message());
        processTLimitThresholds(ec);
        return;
    }

    ocp::accelerator_management::CompletionCode cc{};
    uint16_t reasonCode = 0;
    int32_t threshold = 0;

    int rc = gpu::decodeReadThermalParametersResponse(buffer, cc, reasonCode,
                                                      threshold);

    if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
    {
        lg2::error(
            "Error reading thermal parameter: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
            "RC", rc, "CC", cc, "RESC", reasonCode);
        processTLimitThresholds(std::make_error_code(std::errc::bad_message));
        return;
    }

    thresholds[current_threshold_index] = threshold;

    current_threshold_index++;

    if (current_threshold_index < thresholdIds.size())
    {
        getNextThermalParameter();
        return;
    }
    processTLimitThresholds(std::error_code{});
}

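// Encodes and sends the read request for the next threshold ID in
// thresholdIds, then resumes in readThermalParameterCallback().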
void GpuDevice::getNextThermalParameter()
{
    uint8_t id = thresholdIds[current_threshold_index];
    auto rc =
        gpu::encodeReadThermalParametersRequest(0, id, thermalParamReqMsg);
    if (rc != 0)
    {
        lg2::error(
            "Error reading thermal parameter for eid {EID} and parameter id {PID}: encode failed. rc={RC}",
            "EID", eid, "PID", id, "RC", rc);
        processTLimitThresholds(
            std::make_error_code(static_cast<std::errc>(rc)));
        return;
    }

    mctpRequester.sendRecvMsg(
        eid, thermalParamReqMsg,
        [weak{weak_from_this()}](const std::error_code& ec,
                                 std::span<const uint8_t> buffer) {
            std::shared_ptr<GpuDevice> self = weak.lock();
            if (!self)
            {
                lg2::error("Failed to get lock on GpuDevice");
                return;
            }
            self->readThermalParameterCallback(ec, buffer);
        });
}

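// Creates the TLimit temperature sensor. If all threshold reads succeeded,
// warning, critical and hard-shutdown thresholds are attached (in that
// order, matching thresholdIds); otherwise the sensor is created without
// thresholds.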
void GpuDevice::processTLimitThresholds(const std::error_code& ec)
{
    std::vector<thresholds::Threshold> tLimitThresholds{};
    if (!ec)
    {
        tLimitThresholds = {
            thresholds::Threshold{thresholds::Level::WARNING,
                                  thresholds::Direction::LOW,
                                  static_cast<double>(thresholds[0])},
            thresholds::Threshold{thresholds::Level::CRITICAL,
                                  thresholds::Direction::LOW,
                                  static_cast<double>(thresholds[1])},
            thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
                                  thresholds::Direction::LOW,
                                  static_cast<double>(thresholds[2])}};
    }

    tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
        conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId,
        objectServer, std::move(tLimitThresholds));
}

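// Polls every sensor once, then re-arms the timer so the device keeps
// polling every sensorPollMs. The weak_from_this() capture lets the loop end
// cleanly if the device is destroyed before the timer fires.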
void GpuDevice::read()
{
    tempSensor->update();
    if (tLimitSensor)
    {
        tLimitSensor->update();
    }
    dramTempSensor->update();
    powerSensor->update();
    peakPower->update();
    energySensor->update();
    voltageSensor->update();

    waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
    waitTimer.async_wait(
        [weak{weak_from_this()}](const boost::system::error_code& ec) {
            std::shared_ptr<GpuDevice> self = weak.lock();
            if (!self)
            {
                lg2::error("Invalid reference to GpuDevice");
                return;
            }
            if (ec)
            {
                return;
            }
            self->read();
        });
}