xref: /openbmc/dbus-sensors/src/nvidia-gpu/NvidiaDeviceDiscovery.cpp (revision fd4a37798de7682f9caad3414975e17e47ef6ea3)
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
3  * AFFILIATES. All rights reserved.
4  * SPDX-License-Identifier: Apache-2.0
5  */
6 
7 #include "NvidiaDeviceDiscovery.hpp"
8 
9 #include "NvidiaGpuDevice.hpp"
10 #include "NvidiaSmaDevice.hpp"
11 #include "Utils.hpp"
12 
13 #include <bits/basic_string.h>
14 
15 #include <MctpRequester.hpp>
16 #include <NvidiaGpuMctpVdm.hpp>
17 #include <OcpMctpVdm.hpp>
18 #include <boost/asio/io_context.hpp>
19 #include <boost/container/flat_map.hpp>
20 #include <phosphor-logging/lg2.hpp>
21 #include <sdbusplus/asio/connection.hpp>
22 #include <sdbusplus/asio/object_server.hpp>
23 #include <sdbusplus/message.hpp>
24 #include <sdbusplus/message/native_types.hpp>
25 
26 #include <algorithm>
27 #include <array>
28 #include <cstdint>
29 #include <memory>
30 #include <span>
31 #include <stdexcept>
32 #include <string>
33 #include <system_error>
34 #include <utility>
35 #include <variant>
36 #include <vector>
37 
// Fallback poll rate assigned to SensorConfigs::pollRate when a configuration
// has no "PollRate" property (see processSensorConfigs). The unit is
// presumably milliseconds, going by the name.
static constexpr auto sensorPollRateMs = 1000;
39 
processQueryDeviceIdResponse(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path,uint8_t eid,const std::error_code & sendRecvMsgResult,std::span<const uint8_t> queryDeviceIdentificationResponse)40 void processQueryDeviceIdResponse(
41     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
42     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
43         gpuDevices,
44     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
45         smaDevices,
46     const std::shared_ptr<sdbusplus::asio::connection>& conn,
47     mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
48     const std::string& path, uint8_t eid,
49     const std::error_code& sendRecvMsgResult,
50     std::span<const uint8_t> queryDeviceIdentificationResponse)
51 {
52     if (sendRecvMsgResult)
53     {
54         lg2::error(
55             "Error processing MCTP endpoint with eid {EID} : sending message over MCTP failed, rc={RC}",
56             "EID", eid, "RC", sendRecvMsgResult.message());
57         return;
58     }
59 
60     ocp::accelerator_management::CompletionCode cc{};
61     uint16_t reasonCode = 0;
62     uint8_t responseDeviceType = 0;
63     uint8_t responseInstanceId = 0;
64 
65     auto rc = gpu::decodeQueryDeviceIdentificationResponse(
66         queryDeviceIdentificationResponse, cc, reasonCode, responseDeviceType,
67         responseInstanceId);
68 
69     if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
70     {
71         lg2::error(
72             "Error processing MCTP endpoint with eid {EID} : decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
73             "EID", eid, "RC", rc, "CC", cc, "RESC", reasonCode);
74         return;
75     }
76 
77     switch (static_cast<gpu::DeviceIdentification>(responseDeviceType))
78     {
79         case gpu::DeviceIdentification::DEVICE_GPU:
80         {
81             lg2::info(
82                 "Found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
83                 "EID", eid, "DEVTYPE", responseDeviceType, "IID",
84                 responseInstanceId);
85 
86             auto gpuName = configs.name + '_' +
87                            std::to_string(responseInstanceId);
88 
89             auto gpu = gpuDevices
90                            .insert(std::make_pair(
91                                gpuName, std::make_shared<GpuDevice>(
92                                             configs, gpuName, path, conn, eid,
93                                             io, mctpRequester, objectServer)))
94                            .first;
95             (*gpu).second->init();
96             break;
97         }
98 
99         case gpu::DeviceIdentification::DEVICE_SMA:
100         {
101             lg2::info(
102                 "Found the SMA Device with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
103                 "EID", eid, "DEVTYPE", responseDeviceType, "IID",
104                 responseInstanceId);
105 
106             auto smaName = configs.name + "_SMA_" +
107                            std::to_string(responseInstanceId);
108 
109             auto sma = smaDevices
110                            .insert(std::make_pair(
111                                smaName, std::make_shared<SmaDevice>(
112                                             configs, smaName, path, conn, eid,
113                                             io, mctpRequester, objectServer)))
114                            .first;
115             (*sma).second->init();
116             break;
117         }
118     }
119 }
120 
queryDeviceIdentification(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path,uint8_t eid)121 void queryDeviceIdentification(
122     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
123     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
124         gpuDevices,
125     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
126         smaDevices,
127     const std::shared_ptr<sdbusplus::asio::connection>& conn,
128     mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
129     const std::string& path, uint8_t eid)
130 {
131     auto queryDeviceIdentificationRequest = std::make_shared<
132         std::array<uint8_t, sizeof(gpu::QueryDeviceIdentificationRequest)>>();
133 
134     auto rc = gpu::encodeQueryDeviceIdentificationRequest(
135         0, *queryDeviceIdentificationRequest);
136     if (rc != 0)
137     {
138         lg2::error(
139             "Error processing MCTP endpoint with eid {EID} : encode failed, rc={RC}",
140             "EID", eid, "RC", rc);
141         return;
142     }
143 
144     mctpRequester.sendRecvMsg(
145         eid, *queryDeviceIdentificationRequest,
146         [&io, &objectServer, &gpuDevices, &smaDevices, conn, &mctpRequester,
147          configs, path, eid, queryDeviceIdentificationRequest](
148             const std::error_code& ec, std::span<const uint8_t> response) {
149             processQueryDeviceIdResponse(io, objectServer, gpuDevices,
150                                          smaDevices, conn, mctpRequester,
151                                          configs, path, eid, ec, response);
152         });
153 }
154 
processEndpoint(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path,const boost::system::error_code & ec,const SensorBaseConfigMap & endpoint)155 void processEndpoint(
156     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
157     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
158         gpuDevices,
159     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
160         smaDevices,
161     const std::shared_ptr<sdbusplus::asio::connection>& conn,
162     mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
163     const std::string& path, const boost::system::error_code& ec,
164     const SensorBaseConfigMap& endpoint)
165 {
166     if (ec)
167     {
168         lg2::error("Error processing MCTP endpoint: Error:{ERROR}", "ERROR",
169                    ec.message());
170         return;
171     }
172 
173     auto hasEid = endpoint.find("EID");
174     uint8_t eid{};
175 
176     if (hasEid != endpoint.end())
177     {
178         const auto* eidPtr = std::get_if<uint8_t>(&hasEid->second);
179         if (eidPtr != nullptr)
180         {
181             eid = *eidPtr;
182         }
183         else
184         {
185             lg2::error(
186                 "Error processing MCTP endpoint: Property EID does not have valid type.");
187             return;
188         }
189     }
190     else
191     {
192         lg2::error(
193             "Error processing MCTP endpoint: Property EID not found in the configuration.");
194         return;
195     }
196 
197     auto hasMctpTypes = endpoint.find("SupportedMessageTypes");
198     std::vector<uint8_t> mctpTypes{};
199 
200     if (hasMctpTypes != endpoint.end())
201     {
202         const auto* mctpTypePtr =
203             std::get_if<std::vector<uint8_t>>(&hasMctpTypes->second);
204         if (mctpTypePtr != nullptr)
205         {
206             mctpTypes = *mctpTypePtr;
207         }
208         else
209         {
210             lg2::error(
211                 "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes does not have valid type.",
212                 "EID", eid);
213             return;
214         }
215     }
216     else
217     {
218         lg2::error(
219             "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes not found in the configuration.",
220             "EID", eid);
221         return;
222     }
223 
224     if (std::find(mctpTypes.begin(), mctpTypes.end(),
225                   ocp::accelerator_management::messageType) != mctpTypes.end())
226     {
227         lg2::info("Found OCP MCTP VDM Endpoint with ID {EID}", "EID", eid);
228         queryDeviceIdentification(io, objectServer, gpuDevices, smaDevices,
229                                   conn, mctpRequester, configs, path, eid);
230     }
231 }
232 
queryEndpoints(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path,const boost::system::error_code & ec,const GetSubTreeType & ret)233 void queryEndpoints(
234     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
235     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
236         gpuDevices,
237     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
238         smaDevices,
239     const std::shared_ptr<sdbusplus::asio::connection>& conn,
240     mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
241     const std::string& path, const boost::system::error_code& ec,
242     const GetSubTreeType& ret)
243 {
244     if (ec)
245     {
246         lg2::error("Error processing MCTP endpoints: {ERROR}", "ERROR",
247                    ec.message());
248         return;
249     }
250 
251     if (ret.empty())
252     {
253         return;
254     }
255 
256     for (const auto& [objPath, services] : ret)
257     {
258         for (const auto& [service, ifaces] : services)
259         {
260             for (const auto& iface : ifaces)
261             {
262                 if (iface == "xyz.openbmc_project.MCTP.Endpoint")
263                 {
264                     conn->async_method_call(
265                         [&io, &objectServer, &gpuDevices, &smaDevices, conn,
266                          &mctpRequester, configs,
267                          path](const boost::system::error_code& ec,
268                                const SensorBaseConfigMap& endpoint) {
269                             processEndpoint(io, objectServer, gpuDevices,
270                                             smaDevices, conn, mctpRequester,
271                                             configs, path, ec, endpoint);
272                         },
273                         service, objPath, "org.freedesktop.DBus.Properties",
274                         "GetAll", iface);
275                 }
276             }
277         }
278     }
279 }
280 
discoverDevices(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path)281 void discoverDevices(
282     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
283     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
284         gpuDevices,
285     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
286         smaDevices,
287     const std::shared_ptr<sdbusplus::asio::connection>& conn,
288     mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
289     const std::string& path)
290 {
291     std::string searchPath{"/au/com/codeconstruct/"};
292     std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}};
293 
294     conn->async_method_call(
295         [&io, &objectServer, &gpuDevices, &smaDevices, conn, &mctpRequester,
296          configs,
297          path](const boost::system::error_code& ec, const GetSubTreeType& ret) {
298             queryEndpoints(io, objectServer, gpuDevices, smaDevices, conn,
299                            mctpRequester, configs, path, ec, ret);
300         },
301         "xyz.openbmc_project.ObjectMapper",
302         "/xyz/openbmc_project/object_mapper",
303         "xyz.openbmc_project.ObjectMapper", "GetSubTree", searchPath, 0,
304         ifaceList);
305 }
306 
processSensorConfigs(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,const std::shared_ptr<sdbusplus::asio::connection> & dbusConnection,mctp::MctpRequester & mctpRequester,const ManagedObjectType & resp)307 void processSensorConfigs(
308     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
309     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
310         gpuDevices,
311     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
312         smaDevices,
313     const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
314     mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
315 {
316     for (const auto& [path, interfaces] : resp)
317     {
318         for (const auto& [intf, cfg] : interfaces)
319         {
320             if (intf != configInterfaceName(deviceType))
321             {
322                 continue;
323             }
324 
325             SensorConfigs configs;
326 
327             configs.name = loadVariant<std::string>(cfg, "Name");
328 
329             try
330             {
331                 configs.pollRate = loadVariant<uint64_t>(cfg, "PollRate");
332             }
333             catch (const std::invalid_argument&)
334             {
335                 // PollRate is an optional config
336                 configs.pollRate = sensorPollRateMs;
337             }
338 
339             discoverDevices(io, objectServer, gpuDevices, smaDevices,
340                             dbusConnection, mctpRequester, configs, path);
341 
342             lg2::info(
343                 "Detected configuration {NAME} of type {TYPE} at path: {PATH}.",
344                 "NAME", configs.name, "TYPE", deviceType, "PATH", path);
345         }
346     }
347 }
348 
createSensors(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,const std::shared_ptr<sdbusplus::asio::connection> & dbusConnection,mctp::MctpRequester & mctpRequester)349 void createSensors(
350     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
351     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
352         gpuDevices,
353     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
354         smaDevices,
355     const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
356     mctp::MctpRequester& mctpRequester)
357 {
358     if (!dbusConnection)
359     {
360         lg2::error("Connection not created");
361         return;
362     }
363     dbusConnection->async_method_call(
364         [&gpuDevices, &smaDevices, &mctpRequester, dbusConnection, &io,
365          &objectServer](boost::system::error_code ec,
366                         const ManagedObjectType& resp) {
367             if (ec)
368             {
369                 lg2::error("Error contacting entity manager");
370                 return;
371             }
372 
373             processSensorConfigs(io, objectServer, gpuDevices, smaDevices,
374                                  dbusConnection, mctpRequester, resp);
375         },
376         entityManagerName, "/xyz/openbmc_project/inventory",
377         "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
378 }
379 
interfaceRemoved(sdbusplus::message_t & message,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices)380 void interfaceRemoved(
381     sdbusplus::message_t& message,
382     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
383         gpuDevices,
384     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
385         smaDevices)
386 {
387     if (message.is_method_error())
388     {
389         lg2::error("interfacesRemoved callback method error");
390         return;
391     }
392 
393     sdbusplus::message::object_path removedPath;
394     std::vector<std::string> interfaces;
395 
396     message.read(removedPath, interfaces);
397 
398     // If the xyz.openbmc_project.Confguration.X interface was removed
399     // for one or more sensors, delete those sensor objects.
400     auto sensorIt = gpuDevices.begin();
401     while (sensorIt != gpuDevices.end())
402     {
403         if ((sensorIt->second->getPath() == removedPath) &&
404             (std::find(interfaces.begin(), interfaces.end(),
405                        configInterfaceName(deviceType)) != interfaces.end()))
406         {
407             sensorIt = gpuDevices.erase(sensorIt);
408         }
409         else
410         {
411             sensorIt++;
412         }
413     }
414 
415     auto smaSensorIt = smaDevices.begin();
416     while (smaSensorIt != smaDevices.end())
417     {
418         if ((smaSensorIt->second->getPath() == removedPath) &&
419             (std::find(interfaces.begin(), interfaces.end(),
420                        configInterfaceName(deviceType)) != interfaces.end()))
421         {
422             smaSensorIt = smaDevices.erase(smaSensorIt);
423         }
424         else
425         {
426             smaSensorIt++;
427         }
428     }
429 }
430