xref: /openbmc/dbus-sensors/src/nvidia-gpu/NvidiaDeviceDiscovery.cpp (revision 0ad3a7e885ded75ec5a0e0ca78792784daeefa54)
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
3  * AFFILIATES. All rights reserved.
4  * SPDX-License-Identifier: Apache-2.0
5  */
6 
7 #include "NvidiaDeviceDiscovery.hpp"
8 
9 #include "NvidiaGpuDevice.hpp"
10 #include "NvidiaSmaDevice.hpp"
11 #include "Utils.hpp"
12 
13 #include <bits/basic_string.h>
14 
15 #include <MctpRequester.hpp>
16 #include <NvidiaGpuMctpVdm.hpp>
17 #include <OcpMctpVdm.hpp>
18 #include <boost/asio/io_context.hpp>
19 #include <boost/container/flat_map.hpp>
20 #include <phosphor-logging/lg2.hpp>
21 #include <sdbusplus/asio/connection.hpp>
22 #include <sdbusplus/asio/object_server.hpp>
23 #include <sdbusplus/message.hpp>
24 #include <sdbusplus/message/native_types.hpp>
25 
26 #include <algorithm>
27 #include <array>
28 #include <cstdint>
29 #include <memory>
30 #include <span>
31 #include <stdexcept>
32 #include <string>
33 #include <utility>
34 #include <variant>
35 #include <vector>
36 
// Poll rate, in milliseconds, applied when a configuration carries no
// "PollRate" property.
static constexpr int sensorPollRateMs{1000};
38 
processQueryDeviceIdResponse(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path,uint8_t eid,int sendRecvMsgResult,std::span<uint8_t> queryDeviceIdentificationResponse)39 void processQueryDeviceIdResponse(
40     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
41     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
42         gpuDevices,
43     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
44         smaDevices,
45     const std::shared_ptr<sdbusplus::asio::connection>& conn,
46     mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
47     const std::string& path, uint8_t eid, int sendRecvMsgResult,
48     std::span<uint8_t> queryDeviceIdentificationResponse)
49 {
50     if (sendRecvMsgResult != 0)
51     {
52         lg2::error(
53             "Error processing MCTP endpoint with eid {EID} : sending message over MCTP failed, rc={RC}",
54             "EID", eid, "RC", sendRecvMsgResult);
55         return;
56     }
57 
58     ocp::accelerator_management::CompletionCode cc{};
59     uint16_t reasonCode = 0;
60     uint8_t responseDeviceType = 0;
61     uint8_t responseInstanceId = 0;
62 
63     auto rc = gpu::decodeQueryDeviceIdentificationResponse(
64         queryDeviceIdentificationResponse, cc, reasonCode, responseDeviceType,
65         responseInstanceId);
66 
67     if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
68     {
69         lg2::error(
70             "Error processing MCTP endpoint with eid {EID} : decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
71             "EID", eid, "RC", rc, "CC", cc, "RESC", reasonCode);
72         return;
73     }
74 
75     switch (static_cast<gpu::DeviceIdentification>(responseDeviceType))
76     {
77         case gpu::DeviceIdentification::DEVICE_GPU:
78         {
79             lg2::info(
80                 "Found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
81                 "EID", eid, "DEVTYPE", responseDeviceType, "IID",
82                 responseInstanceId);
83 
84             auto gpuName = configs.name + '_' +
85                            std::to_string(responseInstanceId);
86 
87             gpuDevices[gpuName] =
88                 std::make_shared<GpuDevice>(configs, gpuName, path, conn, eid,
89                                             io, mctpRequester, objectServer);
90             break;
91         }
92 
93         case gpu::DeviceIdentification::DEVICE_SMA:
94         {
95             lg2::info(
96                 "Found the SMA Device with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
97                 "EID", eid, "DEVTYPE", responseDeviceType, "IID",
98                 responseInstanceId);
99 
100             auto smaName = configs.name + "_SMA_" +
101                            std::to_string(responseInstanceId);
102 
103             smaDevices[smaName] =
104                 std::make_shared<SmaDevice>(configs, smaName, path, conn, eid,
105                                             io, mctpRequester, objectServer);
106             break;
107         }
108     }
109 }
110 
queryDeviceIdentification(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path,uint8_t eid)111 void queryDeviceIdentification(
112     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
113     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
114         gpuDevices,
115     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
116         smaDevices,
117     const std::shared_ptr<sdbusplus::asio::connection>& conn,
118     mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
119     const std::string& path, uint8_t eid)
120 {
121     auto queryDeviceIdentificationRequest = std::make_shared<
122         std::array<uint8_t, sizeof(gpu::QueryDeviceIdentificationRequest)>>();
123 
124     auto queryDeviceIdentificationResponse = std::make_shared<
125         std::array<uint8_t, sizeof(gpu::QueryDeviceIdentificationResponse)>>();
126 
127     auto rc = gpu::encodeQueryDeviceIdentificationRequest(
128         0, *queryDeviceIdentificationRequest);
129     if (rc != 0)
130     {
131         lg2::error(
132             "Error processing MCTP endpoint with eid {EID} : encode failed, rc={RC}",
133             "EID", eid, "RC", rc);
134         return;
135     }
136 
137     mctpRequester.sendRecvMsg(
138         eid, *queryDeviceIdentificationRequest,
139         *queryDeviceIdentificationResponse,
140         [&io, &objectServer, &gpuDevices, &smaDevices, conn, &mctpRequester,
141          configs, path, eid, queryDeviceIdentificationRequest,
142          queryDeviceIdentificationResponse](int sendRecvMsgResult) {
143             processQueryDeviceIdResponse(
144                 io, objectServer, gpuDevices, smaDevices, conn, mctpRequester,
145                 configs, path, eid, sendRecvMsgResult,
146                 *queryDeviceIdentificationResponse);
147         });
148 }
149 
processEndpoint(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path,const boost::system::error_code & ec,const SensorBaseConfigMap & endpoint)150 void processEndpoint(
151     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
152     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
153         gpuDevices,
154     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
155         smaDevices,
156     const std::shared_ptr<sdbusplus::asio::connection>& conn,
157     mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
158     const std::string& path, const boost::system::error_code& ec,
159     const SensorBaseConfigMap& endpoint)
160 {
161     if (ec)
162     {
163         lg2::error("Error processing MCTP endpoint: Error:{ERROR}", "ERROR",
164                    ec.message());
165         return;
166     }
167 
168     auto hasEid = endpoint.find("EID");
169     uint8_t eid{};
170 
171     if (hasEid != endpoint.end())
172     {
173         const auto* eidPtr = std::get_if<uint8_t>(&hasEid->second);
174         if (eidPtr != nullptr)
175         {
176             eid = *eidPtr;
177         }
178         else
179         {
180             lg2::error(
181                 "Error processing MCTP endpoint: Property EID does not have valid type.");
182             return;
183         }
184     }
185     else
186     {
187         lg2::error(
188             "Error processing MCTP endpoint: Property EID not found in the configuration.");
189         return;
190     }
191 
192     auto hasMctpTypes = endpoint.find("SupportedMessageTypes");
193     std::vector<uint8_t> mctpTypes{};
194 
195     if (hasMctpTypes != endpoint.end())
196     {
197         const auto* mctpTypePtr =
198             std::get_if<std::vector<uint8_t>>(&hasMctpTypes->second);
199         if (mctpTypePtr != nullptr)
200         {
201             mctpTypes = *mctpTypePtr;
202         }
203         else
204         {
205             lg2::error(
206                 "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes does not have valid type.",
207                 "EID", eid);
208             return;
209         }
210     }
211     else
212     {
213         lg2::error(
214             "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes not found in the configuration.",
215             "EID", eid);
216         return;
217     }
218 
219     if (std::find(mctpTypes.begin(), mctpTypes.end(),
220                   ocp::accelerator_management::messageType) != mctpTypes.end())
221     {
222         lg2::info("Found OCP MCTP VDM Endpoint with ID {EID}", "EID", eid);
223         queryDeviceIdentification(io, objectServer, gpuDevices, smaDevices,
224                                   conn, mctpRequester, configs, path, eid);
225     }
226 }
227 
queryEndpoints(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path,const boost::system::error_code & ec,const GetSubTreeType & ret)228 void queryEndpoints(
229     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
230     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
231         gpuDevices,
232     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
233         smaDevices,
234     const std::shared_ptr<sdbusplus::asio::connection>& conn,
235     mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
236     const std::string& path, const boost::system::error_code& ec,
237     const GetSubTreeType& ret)
238 {
239     if (ec)
240     {
241         lg2::error("Error processing MCTP endpoints: {ERROR}", "ERROR",
242                    ec.message());
243         return;
244     }
245 
246     if (ret.empty())
247     {
248         return;
249     }
250 
251     for (const auto& [objPath, services] : ret)
252     {
253         for (const auto& [service, ifaces] : services)
254         {
255             for (const auto& iface : ifaces)
256             {
257                 if (iface == "xyz.openbmc_project.MCTP.Endpoint")
258                 {
259                     conn->async_method_call(
260                         [&io, &objectServer, &gpuDevices, &smaDevices, conn,
261                          &mctpRequester, configs,
262                          path](const boost::system::error_code& ec,
263                                const SensorBaseConfigMap& endpoint) {
264                             processEndpoint(io, objectServer, gpuDevices,
265                                             smaDevices, conn, mctpRequester,
266                                             configs, path, ec, endpoint);
267                         },
268                         service, objPath, "org.freedesktop.DBus.Properties",
269                         "GetAll", iface);
270                 }
271             }
272         }
273     }
274 }
275 
discoverDevices(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path)276 void discoverDevices(
277     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
278     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
279         gpuDevices,
280     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
281         smaDevices,
282     const std::shared_ptr<sdbusplus::asio::connection>& conn,
283     mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
284     const std::string& path)
285 {
286     std::string searchPath{"/au/com/codeconstruct/"};
287     std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}};
288 
289     conn->async_method_call(
290         [&io, &objectServer, &gpuDevices, &smaDevices, conn, &mctpRequester,
291          configs,
292          path](const boost::system::error_code& ec, const GetSubTreeType& ret) {
293             queryEndpoints(io, objectServer, gpuDevices, smaDevices, conn,
294                            mctpRequester, configs, path, ec, ret);
295         },
296         "xyz.openbmc_project.ObjectMapper",
297         "/xyz/openbmc_project/object_mapper",
298         "xyz.openbmc_project.ObjectMapper", "GetSubTree", searchPath, 0,
299         ifaceList);
300 }
301 
processSensorConfigs(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,const std::shared_ptr<sdbusplus::asio::connection> & dbusConnection,mctp::MctpRequester & mctpRequester,const ManagedObjectType & resp)302 void processSensorConfigs(
303     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
304     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
305         gpuDevices,
306     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
307         smaDevices,
308     const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
309     mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
310 {
311     for (const auto& [path, interfaces] : resp)
312     {
313         for (const auto& [intf, cfg] : interfaces)
314         {
315             if (intf != configInterfaceName(deviceType))
316             {
317                 continue;
318             }
319 
320             SensorConfigs configs;
321 
322             configs.name = loadVariant<std::string>(cfg, "Name");
323 
324             try
325             {
326                 configs.pollRate = loadVariant<uint64_t>(cfg, "PollRate");
327             }
328             catch (const std::invalid_argument&)
329             {
330                 // PollRate is an optional config
331                 configs.pollRate = sensorPollRateMs;
332             }
333 
334             discoverDevices(io, objectServer, gpuDevices, smaDevices,
335                             dbusConnection, mctpRequester, configs, path);
336 
337             lg2::info(
338                 "Detected configuration {NAME} of type {TYPE} at path: {PATH}.",
339                 "NAME", configs.name, "TYPE", deviceType, "PATH", path);
340         }
341     }
342 }
343 
createSensors(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,const std::shared_ptr<sdbusplus::asio::connection> & dbusConnection,mctp::MctpRequester & mctpRequester)344 void createSensors(
345     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
346     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
347         gpuDevices,
348     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
349         smaDevices,
350     const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
351     mctp::MctpRequester& mctpRequester)
352 {
353     if (!dbusConnection)
354     {
355         lg2::error("Connection not created");
356         return;
357     }
358     dbusConnection->async_method_call(
359         [&gpuDevices, &smaDevices, &mctpRequester, dbusConnection, &io,
360          &objectServer](boost::system::error_code ec,
361                         const ManagedObjectType& resp) {
362             if (ec)
363             {
364                 lg2::error("Error contacting entity manager");
365                 return;
366             }
367 
368             processSensorConfigs(io, objectServer, gpuDevices, smaDevices,
369                                  dbusConnection, mctpRequester, resp);
370         },
371         entityManagerName, "/xyz/openbmc_project/inventory",
372         "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
373 }
374 
interfaceRemoved(sdbusplus::message_t & message,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices)375 void interfaceRemoved(
376     sdbusplus::message_t& message,
377     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
378         gpuDevices,
379     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
380         smaDevices)
381 {
382     if (message.is_method_error())
383     {
384         lg2::error("interfacesRemoved callback method error");
385         return;
386     }
387 
388     sdbusplus::message::object_path removedPath;
389     std::vector<std::string> interfaces;
390 
391     message.read(removedPath, interfaces);
392 
393     // If the xyz.openbmc_project.Confguration.X interface was removed
394     // for one or more sensors, delete those sensor objects.
395     auto sensorIt = gpuDevices.begin();
396     while (sensorIt != gpuDevices.end())
397     {
398         if ((sensorIt->second->getPath() == removedPath) &&
399             (std::find(interfaces.begin(), interfaces.end(),
400                        configInterfaceName(deviceType)) != interfaces.end()))
401         {
402             sensorIt = gpuDevices.erase(sensorIt);
403         }
404         else
405         {
406             sensorIt++;
407         }
408     }
409 
410     auto smaSensorIt = smaDevices.begin();
411     while (smaSensorIt != smaDevices.end())
412     {
413         if ((smaSensorIt->second->getPath() == removedPath) &&
414             (std::find(interfaces.begin(), interfaces.end(),
415                        configInterfaceName(deviceType)) != interfaces.end()))
416         {
417             smaSensorIt = smaDevices.erase(smaSensorIt);
418         }
419         else
420         {
421             smaSensorIt++;
422         }
423     }
424 }
425