1 /*
2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
3 * AFFILIATES. All rights reserved.
4 * SPDX-License-Identifier: Apache-2.0
5 */
6
7 #include "NvidiaGpuSensor.hpp"
8
9 #include "SensorPaths.hpp"
10 #include "Thresholds.hpp"
11 #include "Utils.hpp"
12 #include "sensor.hpp"
13
14 #include <bits/basic_string.h>
15
16 #include <MctpRequester.hpp>
17 #include <NvidiaGpuMctpVdm.hpp>
18 #include <OcpMctpVdm.hpp>
19 #include <boost/asio/io_context.hpp>
20 #include <boost/container/flat_map.hpp>
21 #include <phosphor-logging/lg2.hpp>
22 #include <sdbusplus/asio/connection.hpp>
23 #include <sdbusplus/asio/object_server.hpp>
24 #include <sdbusplus/message.hpp>
25 #include <sdbusplus/message/native_types.hpp>
26
27 #include <algorithm>
28 #include <chrono>
29 #include <cstddef>
30 #include <cstdint>
31 #include <functional>
32 #include <memory>
33 #include <string>
34 #include <utility>
35 #include <variant>
36 #include <vector>
37
38 using namespace std::literals;
39
// Sensor identifier stored in GpuTempSensor::sensorId and placed in every
// temperature-reading request built by GpuTempSensor::update().
constexpr uint8_t gpuTempSensorId = 0;

// Upper and lower reading limits handed to the Sensor base-class constructor.
static constexpr double gpuTempSensorMaxReading{127};
static constexpr double gpuTempSensorMinReading{-128};
43
GpuTempSensor(std::shared_ptr<sdbusplus::asio::connection> & conn,boost::asio::io_context & io,mctp::MctpRequester & mctpRequester,const std::string & name,const std::string & sensorConfiguration,sdbusplus::asio::object_server & objectServer,std::vector<thresholds::Threshold> && thresholdData,std::chrono::milliseconds pollRate)44 GpuTempSensor::GpuTempSensor(
45 std::shared_ptr<sdbusplus::asio::connection>& conn,
46 boost::asio::io_context& io, mctp::MctpRequester& mctpRequester,
47 const std::string& name, const std::string& sensorConfiguration,
48 sdbusplus::asio::object_server& objectServer,
49 std::vector<thresholds::Threshold>&& thresholdData,
50 std::chrono::milliseconds pollRate) :
51 Sensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
52 "temperature", false, true, gpuTempSensorMaxReading,
53 gpuTempSensorMinReading, conn),
54 sensorId{gpuTempSensorId}, sensorPollMs(pollRate),
55 waitTimer(io, std::chrono::steady_clock::duration(0)),
56 mctpRequester(mctpRequester), conn(conn), objectServer(objectServer)
57 {
58 std::string dbusPath =
59 sensorPathPrefix + "temperature/"s + escapeName(name);
60
61 sensorInterface = objectServer.add_interface(
62 dbusPath, "xyz.openbmc_project.Sensor.Value");
63
64 for (const auto& threshold : thresholds)
65 {
66 std::string interface = thresholds::getInterface(threshold.level);
67 thresholdInterfaces[static_cast<size_t>(threshold.level)] =
68 objectServer.add_interface(dbusPath, interface);
69 }
70
71 association = objectServer.add_interface(dbusPath, association::interface);
72
73 discoverGpus();
74 }
75
~GpuTempSensor()76 GpuTempSensor::~GpuTempSensor()
77 {
78 waitTimer.cancel();
79 for (const auto& iface : thresholdInterfaces)
80 {
81 objectServer.remove_interface(iface);
82 }
83 objectServer.remove_interface(association);
84 objectServer.remove_interface(sensorInterface);
85 }
86
checkThresholds()87 void GpuTempSensor::checkThresholds()
88 {
89 thresholds::checkThresholds(this);
90 }
91
queryEndpoints(const boost::system::error_code & ec,const GetSubTreeType & ret)92 void GpuTempSensor::queryEndpoints(const boost::system::error_code& ec,
93 const GetSubTreeType& ret)
94 {
95 if (ec)
96 {
97 lg2::error("Error querying endoints :{ERROR}", "ERROR", ec.message());
98 return;
99 }
100
101 if (ret.empty())
102 {
103 return;
104 }
105
106 for (const auto& [objPath, services] : ret)
107 {
108 for (const auto& [service, ifaces] : services)
109 {
110 for (const auto& iface : ifaces)
111 {
112 if (iface == "xyz.openbmc_project.MCTP.Endpoint")
113 {
114 conn->async_method_call(
115 [this](const boost::system::error_code& ec,
116 const SensorBaseConfigMap& configs) {
117 this->processEndpoint(ec, configs);
118 },
119 service, objPath, "org.freedesktop.DBus.Properties",
120 "GetAll", iface);
121 }
122 }
123 }
124 }
125 }
126
read()127 void GpuTempSensor::read()
128 {
129 update();
130
131 waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
132 waitTimer.async_wait(
133 [weakPtrToThis = std::weak_ptr<GpuTempSensor>{shared_from_this()}](
134 const boost::system::error_code& ec) {
135 if (ec)
136 {
137 return;
138 }
139 if (auto ptr = weakPtrToThis.lock())
140 {
141 ptr->read();
142 }
143 });
144 }
145
processResponse(int sendRecvMsgResult)146 void GpuTempSensor::processResponse(int sendRecvMsgResult)
147 {
148 if (sendRecvMsgResult != 0)
149 {
150 lg2::error(
151 "Error updating Temperature Sensor: sending message over MCTP failed, rc={RC}",
152 "RC", sendRecvMsgResult);
153 return;
154 }
155
156 ocp::accelerator_management::CompletionCode cc{};
157 uint16_t reasonCode = 0;
158 double tempValue = 0;
159
160 auto rc = gpu::decodeGetTemperatureReadingResponse(
161 getTemperatureReadingResponse, cc, reasonCode, tempValue);
162
163 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
164 {
165 lg2::error(
166 "Error updating Temperature Sensor: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
167 "RC", rc, "CC", cc, "RESC", reasonCode);
168 return;
169 }
170
171 updateValue(tempValue);
172 }
173
update()174 void GpuTempSensor::update()
175 {
176 auto rc = gpu::encodeGetTemperatureReadingRequest(
177 0, sensorId, getTemperatureReadingRequest);
178 if (rc != 0)
179 {
180 lg2::error("Error updating Temperature Sensor: encode failed, rc={RC}",
181 "RC", rc);
182 return;
183 }
184
185 mctpRequester.sendRecvMsg(
186 eid, getTemperatureReadingRequest, getTemperatureReadingResponse,
187 [this](int sendRecvMsgResult) { processResponse(sendRecvMsgResult); });
188 }
189
processQueryDeviceIdResponse(uint8_t eid,int sendRecvMsgResult)190 void GpuTempSensor::processQueryDeviceIdResponse(uint8_t eid,
191 int sendRecvMsgResult)
192 {
193 if (sendRecvMsgResult != 0)
194 {
195 lg2::error(
196 "Error processing GPU endpoint: sending message over MCTP failed, rc={RC}",
197 "RC", sendRecvMsgResult);
198 return;
199 }
200
201 ocp::accelerator_management::CompletionCode cc{};
202 uint16_t reasonCode = 0;
203 uint8_t responseDeviceType = 0;
204 uint8_t responseInstanceId = 0;
205
206 auto rc = gpu::decodeQueryDeviceIdentificationResponse(
207 queryDeviceIdentificationResponse, cc, reasonCode, responseDeviceType,
208 responseInstanceId);
209
210 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
211 {
212 lg2::error(
213 "Error processing GPU endpoint: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
214 "RC", rc, "CC", cc, "RESC", reasonCode);
215 return;
216 }
217
218 if (responseDeviceType ==
219 static_cast<uint8_t>(gpu::DeviceIdentification::DEVICE_GPU))
220 {
221 lg2::info(
222 "Found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
223 "EID", eid, "DEVTYPE", responseDeviceType, "IID",
224 responseInstanceId);
225
226 this->eid = eid;
227 setInitialProperties(sensor_paths::unitDegreesC);
228 read();
229 }
230 }
231
processGpuEndpoint(uint8_t eid)232 void GpuTempSensor::processGpuEndpoint(uint8_t eid)
233 {
234 auto rc = gpu::encodeQueryDeviceIdentificationRequest(
235 0, queryDeviceIdentificationRequest);
236 if (rc != 0)
237 {
238 lg2::error("Error processing GPU endpoint: encode failed, rc={RC}",
239 "RC", rc);
240 return;
241 }
242
243 mctpRequester.sendRecvMsg(
244 eid, queryDeviceIdentificationRequest,
245 queryDeviceIdentificationResponse, [this, eid](int sendRecvMsgResult) {
246 processQueryDeviceIdResponse(eid, sendRecvMsgResult);
247 });
248 }
249
processEndpoint(const boost::system::error_code & ec,const SensorBaseConfigMap & endpoint)250 void GpuTempSensor::processEndpoint(const boost::system::error_code& ec,
251 const SensorBaseConfigMap& endpoint)
252 {
253 if (ec)
254 {
255 lg2::error("Error processing MCTP endpoint: {ERROR}", "ERROR",
256 ec.message());
257 return;
258 }
259
260 uint8_t eid{};
261 std::vector<uint8_t> mctpTypes{};
262
263 auto hasEid = endpoint.find("EID");
264 if (hasEid != endpoint.end())
265 {
266 const auto* eidPtr = std::get_if<uint8_t>(&hasEid->second);
267 if (eidPtr != nullptr)
268 {
269 eid = *eidPtr;
270 }
271 else
272 {
273 lg2::error(
274 "Error processing MCTP endpoint: Property EID does not have valid type.");
275 return;
276 }
277 }
278 else
279 {
280 lg2::error(
281 "Error processing MCTP endpoint: Property EID not found in the configuration.");
282 return;
283 }
284
285 auto hasMctpTypes = endpoint.find("SupportedMessageTypes");
286 if (hasMctpTypes != endpoint.end())
287 {
288 const auto* mctpTypePtr =
289 std::get_if<std::vector<uint8_t>>(&hasMctpTypes->second);
290 if (mctpTypePtr != nullptr)
291 {
292 mctpTypes = *mctpTypePtr;
293 }
294 else
295 {
296 lg2::error(
297 "Error processing MCTP endpoint: Property SupportedMessageTypes does not have valid type.");
298 return;
299 }
300 }
301 else
302 {
303 lg2::error(
304 "Error processing MCTP endpoint: Property SupportedMessageTypes not found in the configuration.");
305 return;
306 }
307
308 if (std::find(mctpTypes.begin(), mctpTypes.end(),
309 ocp::accelerator_management::messageType) != mctpTypes.end())
310 {
311 lg2::info(
312 "GpuTempSensor::discoverGpus(): Found OCP MCTP VDM Endpoint with ID {EID}",
313 "EID", eid);
314 this->processGpuEndpoint(eid);
315 }
316 }
317
discoverGpus()318 void GpuTempSensor::discoverGpus()
319 {
320 std::string searchPath{"/au/com/codeconstruct/"};
321 std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}};
322
323 conn->async_method_call(
324 [this](const boost::system::error_code& ec, const GetSubTreeType& ret) {
325 queryEndpoints(ec, ret);
326 },
327 "xyz.openbmc_project.ObjectMapper",
328 "/xyz/openbmc_project/object_mapper",
329 "xyz.openbmc_project.ObjectMapper", "GetSubTree", searchPath, 0,
330 ifaceList);
331 }
332
processSensorConfigs(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuTempSensor>> & sensors,std::shared_ptr<sdbusplus::asio::connection> & dbusConnection,mctp::MctpRequester & mctpRequester,const ManagedObjectType & resp)333 void processSensorConfigs(
334 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
335 boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
336 sensors,
337 std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
338 mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
339 {
340 for (const auto& [path, interfaces] : resp)
341 {
342 for (const auto& [intf, cfg] : interfaces)
343 {
344 if (intf != configInterfaceName(sensorType))
345 {
346 continue;
347 }
348
349 std::string name = loadVariant<std::string>(cfg, "Name");
350
351 uint64_t pollRate = loadVariant<uint64_t>(cfg, "PollRate");
352
353 sensors[name] = std::make_shared<GpuTempSensor>(
354 dbusConnection, io, mctpRequester, name, path, objectServer,
355 std::vector<thresholds::Threshold>{},
356 std::chrono::milliseconds{pollRate});
357
358 lg2::info(
359 "Added GPU Temperature Sensor {NAME} with chassis path: {PATH}.",
360 "NAME", name, "PATH", path);
361 }
362 }
363 }
364
createSensors(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuTempSensor>> & sensors,std::shared_ptr<sdbusplus::asio::connection> & dbusConnection,mctp::MctpRequester & mctpRequester)365 void createSensors(
366 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
367 boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
368 sensors,
369 std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
370 mctp::MctpRequester& mctpRequester)
371 {
372 if (!dbusConnection)
373 {
374 lg2::error("Connection not created");
375 return;
376 }
377 dbusConnection->async_method_call(
378 [&sensors, &mctpRequester, &dbusConnection, &io,
379 &objectServer](const boost::system::error_code& ec,
380 const ManagedObjectType& resp) {
381 if (ec)
382 {
383 lg2::error("Error contacting entity manager");
384 return;
385 }
386
387 processSensorConfigs(io, objectServer, sensors, dbusConnection,
388 mctpRequester, resp);
389 },
390 entityManagerName, "/xyz/openbmc_project/inventory",
391 "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
392 }
393
interfaceRemoved(sdbusplus::message_t & message,boost::container::flat_map<std::string,std::shared_ptr<GpuTempSensor>> & sensors)394 void interfaceRemoved(
395 sdbusplus::message_t& message,
396 boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
397 sensors)
398 {
399 if (message.is_method_error())
400 {
401 lg2::error("interfacesRemoved callback method error");
402 return;
403 }
404
405 sdbusplus::message::object_path removedPath;
406 std::vector<std::string> interfaces;
407
408 message.read(removedPath, interfaces);
409
410 // If the xyz.openbmc_project.Confguration.X interface was removed
411 // for one or more sensors, delete those sensor objects.
412 auto sensorIt = sensors.begin();
413 while (sensorIt != sensors.end())
414 {
415 if ((sensorIt->second->configurationPath == removedPath) &&
416 (std::find(interfaces.begin(), interfaces.end(),
417 configInterfaceName(sensorType)) != interfaces.end()))
418 {
419 sensorIt = sensors.erase(sensorIt);
420 }
421 else
422 {
423 sensorIt++;
424 }
425 }
426 }
427