xref: /openbmc/dbus-sensors/src/nvidia-gpu/NvidiaGpuSensor.cpp (revision 560e6af7b1f74e9c020a0f82817f9d926e0c4f72)
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
3  * AFFILIATES. All rights reserved.
4  * SPDX-License-Identifier: Apache-2.0
5  */
6 
7 #include "NvidiaGpuSensor.hpp"
8 
9 #include "SensorPaths.hpp"
10 #include "Thresholds.hpp"
11 #include "Utils.hpp"
12 #include "sensor.hpp"
13 
14 #include <bits/basic_string.h>
15 
16 #include <MctpRequester.hpp>
17 #include <NvidiaGpuMctpVdm.hpp>
18 #include <OcpMctpVdm.hpp>
19 #include <boost/asio/io_context.hpp>
20 #include <boost/container/flat_map.hpp>
21 #include <phosphor-logging/lg2.hpp>
22 #include <sdbusplus/asio/connection.hpp>
23 #include <sdbusplus/asio/object_server.hpp>
24 #include <sdbusplus/message.hpp>
25 #include <sdbusplus/message/native_types.hpp>
26 
27 #include <algorithm>
28 #include <chrono>
29 #include <cstddef>
30 #include <cstdint>
31 #include <functional>
32 #include <memory>
33 #include <string>
34 #include <utility>
35 #include <variant>
36 #include <vector>
37 
38 using namespace std::literals;
39 
// Sensor ID placed in every temperature-reading request sent by this sensor.
constexpr uint8_t gpuTempSensorId{0};
// Upper and lower reading bounds handed to the Sensor base class.
// NOTE(review): the range looks like a signed 8-bit span - confirm against
// the device specification.
static constexpr double gpuTempSensorMaxReading{127.0};
static constexpr double gpuTempSensorMinReading{-128.0};
43 
GpuTempSensor(std::shared_ptr<sdbusplus::asio::connection> & conn,boost::asio::io_context & io,mctp::MctpRequester & mctpRequester,const std::string & name,const std::string & sensorConfiguration,sdbusplus::asio::object_server & objectServer,std::vector<thresholds::Threshold> && thresholdData,std::chrono::milliseconds pollRate)44 GpuTempSensor::GpuTempSensor(
45     std::shared_ptr<sdbusplus::asio::connection>& conn,
46     boost::asio::io_context& io, mctp::MctpRequester& mctpRequester,
47     const std::string& name, const std::string& sensorConfiguration,
48     sdbusplus::asio::object_server& objectServer,
49     std::vector<thresholds::Threshold>&& thresholdData,
50     std::chrono::milliseconds pollRate) :
51     Sensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
52            "temperature", false, true, gpuTempSensorMaxReading,
53            gpuTempSensorMinReading, conn),
54     sensorId{gpuTempSensorId}, sensorPollMs(pollRate),
55     waitTimer(io, std::chrono::steady_clock::duration(0)),
56     mctpRequester(mctpRequester), conn(conn), objectServer(objectServer)
57 {
58     std::string dbusPath =
59         sensorPathPrefix + "temperature/"s + escapeName(name);
60 
61     sensorInterface = objectServer.add_interface(
62         dbusPath, "xyz.openbmc_project.Sensor.Value");
63 
64     for (const auto& threshold : thresholds)
65     {
66         std::string interface = thresholds::getInterface(threshold.level);
67         thresholdInterfaces[static_cast<size_t>(threshold.level)] =
68             objectServer.add_interface(dbusPath, interface);
69     }
70 
71     association = objectServer.add_interface(dbusPath, association::interface);
72 
73     discoverGpus();
74 }
75 
~GpuTempSensor()76 GpuTempSensor::~GpuTempSensor()
77 {
78     waitTimer.cancel();
79     for (const auto& iface : thresholdInterfaces)
80     {
81         objectServer.remove_interface(iface);
82     }
83     objectServer.remove_interface(association);
84     objectServer.remove_interface(sensorInterface);
85 }
86 
checkThresholds()87 void GpuTempSensor::checkThresholds()
88 {
89     thresholds::checkThresholds(this);
90 }
91 
queryEndpoints(const boost::system::error_code & ec,const GetSubTreeType & ret)92 void GpuTempSensor::queryEndpoints(const boost::system::error_code& ec,
93                                    const GetSubTreeType& ret)
94 {
95     if (ec)
96     {
97         lg2::error("Error querying endoints :{ERROR}", "ERROR", ec.message());
98         return;
99     }
100 
101     if (ret.empty())
102     {
103         return;
104     }
105 
106     for (const auto& [objPath, services] : ret)
107     {
108         for (const auto& [service, ifaces] : services)
109         {
110             for (const auto& iface : ifaces)
111             {
112                 if (iface == "xyz.openbmc_project.MCTP.Endpoint")
113                 {
114                     conn->async_method_call(
115                         [this](const boost::system::error_code& ec,
116                                const SensorBaseConfigMap& configs) {
117                             this->processEndpoint(ec, configs);
118                         },
119                         service, objPath, "org.freedesktop.DBus.Properties",
120                         "GetAll", iface);
121                 }
122             }
123         }
124     }
125 }
126 
read()127 void GpuTempSensor::read()
128 {
129     update();
130 
131     waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
132     waitTimer.async_wait(
133         [weakPtrToThis = std::weak_ptr<GpuTempSensor>{shared_from_this()}](
134             const boost::system::error_code& ec) {
135             if (ec)
136             {
137                 return;
138             }
139             if (auto ptr = weakPtrToThis.lock())
140             {
141                 ptr->read();
142             }
143         });
144 }
145 
processResponse(int sendRecvMsgResult)146 void GpuTempSensor::processResponse(int sendRecvMsgResult)
147 {
148     if (sendRecvMsgResult != 0)
149     {
150         lg2::error(
151             "Error updating Temperature Sensor: sending message over MCTP failed, rc={RC}",
152             "RC", sendRecvMsgResult);
153         return;
154     }
155 
156     ocp::accelerator_management::CompletionCode cc{};
157     uint16_t reasonCode = 0;
158     double tempValue = 0;
159 
160     auto rc = gpu::decodeGetTemperatureReadingResponse(
161         getTemperatureReadingResponse, cc, reasonCode, tempValue);
162 
163     if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
164     {
165         lg2::error(
166             "Error updating Temperature Sensor: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
167             "RC", rc, "CC", cc, "RESC", reasonCode);
168         return;
169     }
170 
171     updateValue(tempValue);
172 }
173 
update()174 void GpuTempSensor::update()
175 {
176     auto rc = gpu::encodeGetTemperatureReadingRequest(
177         0, sensorId, getTemperatureReadingRequest);
178     if (rc != 0)
179     {
180         lg2::error("Error updating Temperature Sensor: encode failed, rc={RC}",
181                    "RC", rc);
182         return;
183     }
184 
185     mctpRequester.sendRecvMsg(
186         eid, getTemperatureReadingRequest, getTemperatureReadingResponse,
187         [this](int sendRecvMsgResult) { processResponse(sendRecvMsgResult); });
188 }
189 
processQueryDeviceIdResponse(uint8_t eid,int sendRecvMsgResult)190 void GpuTempSensor::processQueryDeviceIdResponse(uint8_t eid,
191                                                  int sendRecvMsgResult)
192 {
193     if (sendRecvMsgResult != 0)
194     {
195         lg2::error(
196             "Error processing GPU endpoint: sending message over MCTP failed, rc={RC}",
197             "RC", sendRecvMsgResult);
198         return;
199     }
200 
201     ocp::accelerator_management::CompletionCode cc{};
202     uint16_t reasonCode = 0;
203     uint8_t responseDeviceType = 0;
204     uint8_t responseInstanceId = 0;
205 
206     auto rc = gpu::decodeQueryDeviceIdentificationResponse(
207         queryDeviceIdentificationResponse, cc, reasonCode, responseDeviceType,
208         responseInstanceId);
209 
210     if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
211     {
212         lg2::error(
213             "Error processing GPU endpoint: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
214             "RC", rc, "CC", cc, "RESC", reasonCode);
215         return;
216     }
217 
218     if (responseDeviceType ==
219         static_cast<uint8_t>(gpu::DeviceIdentification::DEVICE_GPU))
220     {
221         lg2::info(
222             "Found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
223             "EID", eid, "DEVTYPE", responseDeviceType, "IID",
224             responseInstanceId);
225 
226         this->eid = eid;
227         setInitialProperties(sensor_paths::unitDegreesC);
228         read();
229     }
230 }
231 
processGpuEndpoint(uint8_t eid)232 void GpuTempSensor::processGpuEndpoint(uint8_t eid)
233 {
234     auto rc = gpu::encodeQueryDeviceIdentificationRequest(
235         0, queryDeviceIdentificationRequest);
236     if (rc != 0)
237     {
238         lg2::error("Error processing GPU endpoint: encode failed, rc={RC}",
239                    "RC", rc);
240         return;
241     }
242 
243     mctpRequester.sendRecvMsg(
244         eid, queryDeviceIdentificationRequest,
245         queryDeviceIdentificationResponse, [this, eid](int sendRecvMsgResult) {
246             processQueryDeviceIdResponse(eid, sendRecvMsgResult);
247         });
248 }
249 
processEndpoint(const boost::system::error_code & ec,const SensorBaseConfigMap & endpoint)250 void GpuTempSensor::processEndpoint(const boost::system::error_code& ec,
251                                     const SensorBaseConfigMap& endpoint)
252 {
253     if (ec)
254     {
255         lg2::error("Error processing MCTP endpoint: {ERROR}", "ERROR",
256                    ec.message());
257         return;
258     }
259 
260     uint8_t eid{};
261     std::vector<uint8_t> mctpTypes{};
262 
263     auto hasEid = endpoint.find("EID");
264     if (hasEid != endpoint.end())
265     {
266         const auto* eidPtr = std::get_if<uint8_t>(&hasEid->second);
267         if (eidPtr != nullptr)
268         {
269             eid = *eidPtr;
270         }
271         else
272         {
273             lg2::error(
274                 "Error processing MCTP endpoint: Property EID does not have valid type.");
275             return;
276         }
277     }
278     else
279     {
280         lg2::error(
281             "Error processing MCTP endpoint: Property EID not found in the configuration.");
282         return;
283     }
284 
285     auto hasMctpTypes = endpoint.find("SupportedMessageTypes");
286     if (hasMctpTypes != endpoint.end())
287     {
288         const auto* mctpTypePtr =
289             std::get_if<std::vector<uint8_t>>(&hasMctpTypes->second);
290         if (mctpTypePtr != nullptr)
291         {
292             mctpTypes = *mctpTypePtr;
293         }
294         else
295         {
296             lg2::error(
297                 "Error processing MCTP endpoint: Property SupportedMessageTypes does not have valid type.");
298             return;
299         }
300     }
301     else
302     {
303         lg2::error(
304             "Error processing MCTP endpoint: Property SupportedMessageTypes not found in the configuration.");
305         return;
306     }
307 
308     if (std::find(mctpTypes.begin(), mctpTypes.end(),
309                   ocp::accelerator_management::messageType) != mctpTypes.end())
310     {
311         lg2::info(
312             "GpuTempSensor::discoverGpus(): Found OCP MCTP VDM Endpoint with ID {EID}",
313             "EID", eid);
314         this->processGpuEndpoint(eid);
315     }
316 }
317 
discoverGpus()318 void GpuTempSensor::discoverGpus()
319 {
320     std::string searchPath{"/au/com/codeconstruct/"};
321     std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}};
322 
323     conn->async_method_call(
324         [this](const boost::system::error_code& ec, const GetSubTreeType& ret) {
325             queryEndpoints(ec, ret);
326         },
327         "xyz.openbmc_project.ObjectMapper",
328         "/xyz/openbmc_project/object_mapper",
329         "xyz.openbmc_project.ObjectMapper", "GetSubTree", searchPath, 0,
330         ifaceList);
331 }
332 
processSensorConfigs(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuTempSensor>> & sensors,std::shared_ptr<sdbusplus::asio::connection> & dbusConnection,mctp::MctpRequester & mctpRequester,const ManagedObjectType & resp)333 void processSensorConfigs(
334     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
335     boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
336         sensors,
337     std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
338     mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
339 {
340     for (const auto& [path, interfaces] : resp)
341     {
342         for (const auto& [intf, cfg] : interfaces)
343         {
344             if (intf != configInterfaceName(sensorType))
345             {
346                 continue;
347             }
348 
349             std::string name = loadVariant<std::string>(cfg, "Name");
350 
351             uint64_t pollRate = loadVariant<uint64_t>(cfg, "PollRate");
352 
353             sensors[name] = std::make_shared<GpuTempSensor>(
354                 dbusConnection, io, mctpRequester, name, path, objectServer,
355                 std::vector<thresholds::Threshold>{},
356                 std::chrono::milliseconds{pollRate});
357 
358             lg2::info(
359                 "Added GPU Temperature Sensor {NAME} with chassis path: {PATH}.",
360                 "NAME", name, "PATH", path);
361         }
362     }
363 }
364 
createSensors(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuTempSensor>> & sensors,std::shared_ptr<sdbusplus::asio::connection> & dbusConnection,mctp::MctpRequester & mctpRequester)365 void createSensors(
366     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
367     boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
368         sensors,
369     std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
370     mctp::MctpRequester& mctpRequester)
371 {
372     if (!dbusConnection)
373     {
374         lg2::error("Connection not created");
375         return;
376     }
377     dbusConnection->async_method_call(
378         [&sensors, &mctpRequester, &dbusConnection, &io,
379          &objectServer](const boost::system::error_code& ec,
380                         const ManagedObjectType& resp) {
381             if (ec)
382             {
383                 lg2::error("Error contacting entity manager");
384                 return;
385             }
386 
387             processSensorConfigs(io, objectServer, sensors, dbusConnection,
388                                  mctpRequester, resp);
389         },
390         entityManagerName, "/xyz/openbmc_project/inventory",
391         "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
392 }
393 
interfaceRemoved(sdbusplus::message_t & message,boost::container::flat_map<std::string,std::shared_ptr<GpuTempSensor>> & sensors)394 void interfaceRemoved(
395     sdbusplus::message_t& message,
396     boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
397         sensors)
398 {
399     if (message.is_method_error())
400     {
401         lg2::error("interfacesRemoved callback method error");
402         return;
403     }
404 
405     sdbusplus::message::object_path removedPath;
406     std::vector<std::string> interfaces;
407 
408     message.read(removedPath, interfaces);
409 
410     // If the xyz.openbmc_project.Confguration.X interface was removed
411     // for one or more sensors, delete those sensor objects.
412     auto sensorIt = sensors.begin();
413     while (sensorIt != sensors.end())
414     {
415         if ((sensorIt->second->configurationPath == removedPath) &&
416             (std::find(interfaces.begin(), interfaces.end(),
417                        configInterfaceName(sensorType)) != interfaces.end()))
418         {
419             sensorIt = sensors.erase(sensorIt);
420         }
421         else
422         {
423             sensorIt++;
424         }
425     }
426 }
427