xref: /openbmc/dbus-sensors/src/nvidia-gpu/NvidiaDeviceDiscovery.cpp (revision e0b80e1e58bddcf218369f2f9e3ba2002b59b6f9)
1 /*
2  * SPDX-FileCopyrightText: Copyright OpenBMC Authors
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #include "NvidiaDeviceDiscovery.hpp"
7 
8 #include "NvidiaGpuDevice.hpp"
9 #include "NvidiaPcieDevice.hpp"
10 #include "NvidiaSmaDevice.hpp"
11 #include "Utils.hpp"
12 
13 #include <bits/basic_string.h>
14 
15 #include <MctpRequester.hpp>
16 #include <NvidiaGpuMctpVdm.hpp>
17 #include <OcpMctpVdm.hpp>
18 #include <boost/asio/io_context.hpp>
19 #include <boost/container/flat_map.hpp>
20 #include <phosphor-logging/lg2.hpp>
21 #include <sdbusplus/asio/connection.hpp>
22 #include <sdbusplus/asio/object_server.hpp>
23 #include <sdbusplus/message.hpp>
24 #include <sdbusplus/message/native_types.hpp>
25 
26 #include <algorithm>
27 #include <array>
28 #include <cstdint>
29 #include <format>
30 #include <memory>
31 #include <span>
32 #include <stdexcept>
33 #include <string>
34 #include <system_error>
35 #include <utility>
36 #include <variant>
37 #include <vector>
38 
// Poll rate applied to a configuration that does not provide "PollRate"
// (milliseconds, going by the name).
static constexpr int sensorPollRateMs{1000};
40 
/**
 * @brief Build one discovered device, store it in @p devices and start it.
 *
 * A name that is already present in @p devices is left untouched: the map
 * keeps its existing entry on a duplicate insert, so constructing a second
 * object only to discard it, and calling init() on the stored device a
 * second time, is skipped.
 *
 * @tparam Device  Device class held by the map. It is constructed from
 *                 (configs, name, path, conn, eid, io, mctpRequester,
 *                 objectServer) and must provide init().
 * @param devices  Map of already created devices, keyed by device name.
 * @param name     Key and name of the new device.
 *
 * The remaining parameters are forwarded unchanged to the Device
 * constructor.
 */
template <typename Device>
static void addDiscoveredDevice(
    boost::container::flat_map<std::string, std::shared_ptr<Device>>& devices,
    const std::string& name, boost::asio::io_context& io,
    sdbusplus::asio::object_server& objectServer,
    const std::shared_ptr<sdbusplus::asio::connection>& conn,
    mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
    const std::string& path, uint8_t eid)
{
    if (devices.find(name) != devices.end())
    {
        lg2::info(
            "Device {NAME} with EID {EID} is already registered, skipping.",
            "NAME", name, "EID", eid);
        return;
    }

    auto device = std::make_shared<Device>(configs, name, path, conn, eid, io,
                                           mctpRequester, objectServer);

    // Store first, then initialise, matching the order the devices were
    // previously inserted and started in.
    devices.emplace(name, device);
    device->init();
}

/**
 * @brief Handle the reply to a Query Device Identification request.
 *
 * Decodes the response and, depending on the reported device type, creates
 * one device named after the reported instance id:
 *  - GPU:  "<configs.name>_<instance>"      in @p gpuDevices
 *  - SMA:  "<configs.name>_SMA_<instance>"  in @p smaDevices
 *  - PCIe: "Nvidia_ConnectX_<instance>"     in @p pcieDevices
 *
 * A transport error, a decode failure or a non-success completion code is
 * logged and ends processing. Any other device type is logged at debug
 * level and ignored.
 *
 * @param sendRecvMsgResult  Result of the MCTP request/response exchange.
 * @param queryDeviceIdentificationResponse  Raw response bytes to decode.
 * @param eid   MCTP endpoint id the response belongs to.
 * @param path  Configuration path handed to the created device.
 *
 * The remaining parameters are forwarded to the created device.
 */
void processQueryDeviceIdResponse(
    boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
    boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
        gpuDevices,
    boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
        smaDevices,
    boost::container::flat_map<std::string, std::shared_ptr<PcieDevice>>&
        pcieDevices,
    const std::shared_ptr<sdbusplus::asio::connection>& conn,
    mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
    const std::string& path, uint8_t eid,
    const std::error_code& sendRecvMsgResult,
    std::span<const uint8_t> queryDeviceIdentificationResponse)
{
    if (sendRecvMsgResult)
    {
        lg2::error(
            "Error processing MCTP endpoint with eid {EID} : sending message over MCTP failed, rc={RC}",
            "EID", eid, "RC", sendRecvMsgResult.message());
        return;
    }

    ocp::accelerator_management::CompletionCode cc{};
    uint16_t reasonCode = 0;
    uint8_t responseDeviceType = 0;
    uint8_t responseInstanceId = 0;

    auto rc = gpu::decodeQueryDeviceIdentificationResponse(
        queryDeviceIdentificationResponse, cc, reasonCode, responseDeviceType,
        responseInstanceId);

    if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
    {
        lg2::error(
            "Error processing MCTP endpoint with eid {EID} : decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
            "EID", eid, "RC", rc, "CC", cc, "RESC", reasonCode);
        return;
    }

    switch (static_cast<gpu::DeviceIdentification>(responseDeviceType))
    {
        case gpu::DeviceIdentification::DEVICE_GPU:
        {
            lg2::info(
                "Found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
                "EID", eid, "DEVTYPE", responseDeviceType, "IID",
                responseInstanceId);

            const std::string gpuName =
                configs.name + '_' + std::to_string(responseInstanceId);

            addDiscoveredDevice(gpuDevices, gpuName, io, objectServer, conn,
                                mctpRequester, configs, path, eid);
            break;
        }

        case gpu::DeviceIdentification::DEVICE_SMA:
        {
            lg2::info(
                "Found the SMA Device with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
                "EID", eid, "DEVTYPE", responseDeviceType, "IID",
                responseInstanceId);

            const std::string smaName =
                configs.name + "_SMA_" + std::to_string(responseInstanceId);

            addDiscoveredDevice(smaDevices, smaName, io, objectServer, conn,
                                mctpRequester, configs, path, eid);
            break;
        }

        case gpu::DeviceIdentification::DEVICE_PCIE:
        {
            lg2::info(
                "Found the PCIe Device with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
                "EID", eid, "DEVTYPE", responseDeviceType, "IID",
                responseInstanceId);

            const std::string pcieName =
                std::format("Nvidia_ConnectX_{}", responseInstanceId);

            addDiscoveredDevice(pcieDevices, pcieName, io, objectServer, conn,
                                mctpRequester, configs, path, eid);
            break;
        }

        default:
        {
            // No device class exists for this type; leave a trace instead of
            // dropping the response without any indication.
            lg2::debug(
                "Ignoring MCTP endpoint with eid {EID} : unsupported DeviceType {DEVTYPE}, InstanceId {IID}.",
                "EID", eid, "DEVTYPE", responseDeviceType, "IID",
                responseInstanceId);
            break;
        }
    }
}
144 
queryDeviceIdentification(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,boost::container::flat_map<std::string,std::shared_ptr<PcieDevice>> & pcieDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path,uint8_t eid)145 void queryDeviceIdentification(
146     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
147     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
148         gpuDevices,
149     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
150         smaDevices,
151     boost::container::flat_map<std::string, std::shared_ptr<PcieDevice>>&
152         pcieDevices,
153     const std::shared_ptr<sdbusplus::asio::connection>& conn,
154     mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
155     const std::string& path, uint8_t eid)
156 {
157     auto queryDeviceIdentificationRequest = std::make_shared<
158         std::array<uint8_t, sizeof(gpu::QueryDeviceIdentificationRequest)>>();
159 
160     auto rc = gpu::encodeQueryDeviceIdentificationRequest(
161         0, *queryDeviceIdentificationRequest);
162     if (rc != 0)
163     {
164         lg2::error(
165             "Error processing MCTP endpoint with eid {EID} : encode failed, rc={RC}",
166             "EID", eid, "RC", rc);
167         return;
168     }
169 
170     mctpRequester.sendRecvMsg(
171         eid, *queryDeviceIdentificationRequest,
172         [&io, &objectServer, &gpuDevices, &smaDevices, &pcieDevices, conn,
173          &mctpRequester, configs, path, eid, queryDeviceIdentificationRequest](
174             const std::error_code& ec, std::span<const uint8_t> response) {
175             processQueryDeviceIdResponse(
176                 io, objectServer, gpuDevices, smaDevices, pcieDevices, conn,
177                 mctpRequester, configs, path, eid, ec, response);
178         });
179 }
180 
processEndpoint(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,boost::container::flat_map<std::string,std::shared_ptr<PcieDevice>> & pcieDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path,const boost::system::error_code & ec,const SensorBaseConfigMap & endpoint)181 void processEndpoint(
182     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
183     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
184         gpuDevices,
185     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
186         smaDevices,
187     boost::container::flat_map<std::string, std::shared_ptr<PcieDevice>>&
188         pcieDevices,
189     const std::shared_ptr<sdbusplus::asio::connection>& conn,
190     mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
191     const std::string& path, const boost::system::error_code& ec,
192     const SensorBaseConfigMap& endpoint)
193 {
194     if (ec)
195     {
196         lg2::error("Error processing MCTP endpoint: Error:{ERROR}", "ERROR",
197                    ec.message());
198         return;
199     }
200 
201     auto hasEid = endpoint.find("EID");
202     uint8_t eid{};
203 
204     if (hasEid != endpoint.end())
205     {
206         const auto* eidPtr = std::get_if<uint8_t>(&hasEid->second);
207         if (eidPtr != nullptr)
208         {
209             eid = *eidPtr;
210         }
211         else
212         {
213             lg2::error(
214                 "Error processing MCTP endpoint: Property EID does not have valid type.");
215             return;
216         }
217     }
218     else
219     {
220         lg2::error(
221             "Error processing MCTP endpoint: Property EID not found in the configuration.");
222         return;
223     }
224 
225     auto hasMctpTypes = endpoint.find("SupportedMessageTypes");
226     std::vector<uint8_t> mctpTypes{};
227 
228     if (hasMctpTypes != endpoint.end())
229     {
230         const auto* mctpTypePtr =
231             std::get_if<std::vector<uint8_t>>(&hasMctpTypes->second);
232         if (mctpTypePtr != nullptr)
233         {
234             mctpTypes = *mctpTypePtr;
235         }
236         else
237         {
238             lg2::error(
239                 "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes does not have valid type.",
240                 "EID", eid);
241             return;
242         }
243     }
244     else
245     {
246         lg2::error(
247             "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes not found in the configuration.",
248             "EID", eid);
249         return;
250     }
251 
252     if (std::find(mctpTypes.begin(), mctpTypes.end(),
253                   ocp::accelerator_management::messageType) != mctpTypes.end())
254     {
255         lg2::info("Found OCP MCTP VDM Endpoint with ID {EID}", "EID", eid);
256         queryDeviceIdentification(io, objectServer, gpuDevices, smaDevices,
257                                   pcieDevices, conn, mctpRequester, configs,
258                                   path, eid);
259     }
260 }
261 
queryEndpoints(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,boost::container::flat_map<std::string,std::shared_ptr<PcieDevice>> & pcieDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path,const boost::system::error_code & ec,const GetSubTreeType & ret)262 void queryEndpoints(
263     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
264     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
265         gpuDevices,
266     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
267         smaDevices,
268     boost::container::flat_map<std::string, std::shared_ptr<PcieDevice>>&
269         pcieDevices,
270     const std::shared_ptr<sdbusplus::asio::connection>& conn,
271     mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
272     const std::string& path, const boost::system::error_code& ec,
273     const GetSubTreeType& ret)
274 {
275     if (ec)
276     {
277         lg2::error("Error processing MCTP endpoints: {ERROR}", "ERROR",
278                    ec.message());
279         return;
280     }
281 
282     if (ret.empty())
283     {
284         return;
285     }
286 
287     for (const auto& [objPath, services] : ret)
288     {
289         for (const auto& [service, ifaces] : services)
290         {
291             for (const auto& iface : ifaces)
292             {
293                 if (iface == "xyz.openbmc_project.MCTP.Endpoint")
294                 {
295                     conn->async_method_call(
296                         [&io, &objectServer, &gpuDevices, &smaDevices,
297                          &pcieDevices, conn, &mctpRequester, configs,
298                          path](const boost::system::error_code& ec,
299                                const SensorBaseConfigMap& endpoint) {
300                             processEndpoint(io, objectServer, gpuDevices,
301                                             smaDevices, pcieDevices, conn,
302                                             mctpRequester, configs, path, ec,
303                                             endpoint);
304                         },
305                         service, objPath, "org.freedesktop.DBus.Properties",
306                         "GetAll", iface);
307                 }
308             }
309         }
310     }
311 }
312 
discoverDevices(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,boost::container::flat_map<std::string,std::shared_ptr<PcieDevice>> & pcieDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path)313 void discoverDevices(
314     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
315     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
316         gpuDevices,
317     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
318         smaDevices,
319     boost::container::flat_map<std::string, std::shared_ptr<PcieDevice>>&
320         pcieDevices,
321     const std::shared_ptr<sdbusplus::asio::connection>& conn,
322     mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
323     const std::string& path)
324 {
325     std::string searchPath{"/au/com/codeconstruct/"};
326     std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}};
327 
328     conn->async_method_call(
329         [&io, &objectServer, &gpuDevices, &smaDevices, &pcieDevices, conn,
330          &mctpRequester, configs,
331          path](const boost::system::error_code& ec, const GetSubTreeType& ret) {
332             queryEndpoints(io, objectServer, gpuDevices, smaDevices,
333                            pcieDevices, conn, mctpRequester, configs, path, ec,
334                            ret);
335         },
336         "xyz.openbmc_project.ObjectMapper",
337         "/xyz/openbmc_project/object_mapper",
338         "xyz.openbmc_project.ObjectMapper", "GetSubTree", searchPath, 0,
339         ifaceList);
340 }
341 
processSensorConfigs(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,boost::container::flat_map<std::string,std::shared_ptr<PcieDevice>> & pcieDevices,const std::shared_ptr<sdbusplus::asio::connection> & dbusConnection,mctp::MctpRequester & mctpRequester,const ManagedObjectType & resp)342 void processSensorConfigs(
343     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
344     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
345         gpuDevices,
346     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
347         smaDevices,
348     boost::container::flat_map<std::string, std::shared_ptr<PcieDevice>>&
349         pcieDevices,
350     const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
351     mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
352 {
353     for (const auto& [path, interfaces] : resp)
354     {
355         for (const auto& [intf, cfg] : interfaces)
356         {
357             if (intf != configInterfaceName(deviceType))
358             {
359                 continue;
360             }
361 
362             SensorConfigs configs;
363 
364             configs.name = loadVariant<std::string>(cfg, "Name");
365 
366             try
367             {
368                 configs.pollRate = loadVariant<uint64_t>(cfg, "PollRate");
369             }
370             catch (const std::invalid_argument&)
371             {
372                 // PollRate is an optional config
373                 configs.pollRate = sensorPollRateMs;
374             }
375 
376             discoverDevices(io, objectServer, gpuDevices, smaDevices,
377                             pcieDevices, dbusConnection, mctpRequester, configs,
378                             path);
379 
380             lg2::info(
381                 "Detected configuration {NAME} of type {TYPE} at path: {PATH}.",
382                 "NAME", configs.name, "TYPE", deviceType, "PATH", path);
383         }
384     }
385 }
386 
createSensors(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,boost::container::flat_map<std::string,std::shared_ptr<PcieDevice>> & pcieDevices,const std::shared_ptr<sdbusplus::asio::connection> & dbusConnection,mctp::MctpRequester & mctpRequester)387 void createSensors(
388     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
389     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
390         gpuDevices,
391     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
392         smaDevices,
393     boost::container::flat_map<std::string, std::shared_ptr<PcieDevice>>&
394         pcieDevices,
395     const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
396     mctp::MctpRequester& mctpRequester)
397 {
398     if (!dbusConnection)
399     {
400         lg2::error("Connection not created");
401         return;
402     }
403     dbusConnection->async_method_call(
404         [&gpuDevices, &smaDevices, &pcieDevices, &mctpRequester, dbusConnection,
405          &io, &objectServer](boost::system::error_code ec,
406                              const ManagedObjectType& resp) {
407             if (ec)
408             {
409                 lg2::error("Error contacting entity manager");
410                 return;
411             }
412 
413             processSensorConfigs(io, objectServer, gpuDevices, smaDevices,
414                                  pcieDevices, dbusConnection, mctpRequester,
415                                  resp);
416         },
417         entityManagerName, "/xyz/openbmc_project/inventory",
418         "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
419 }
420 
interfaceRemoved(sdbusplus::message_t & message,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,boost::container::flat_map<std::string,std::shared_ptr<PcieDevice>> & pcieDevices)421 void interfaceRemoved(
422     sdbusplus::message_t& message,
423     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
424         gpuDevices,
425     boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
426         smaDevices,
427     boost::container::flat_map<std::string, std::shared_ptr<PcieDevice>>&
428         pcieDevices)
429 {
430     if (message.is_method_error())
431     {
432         lg2::error("interfacesRemoved callback method error");
433         return;
434     }
435 
436     sdbusplus::message::object_path removedPath;
437     std::vector<std::string> interfaces;
438 
439     message.read(removedPath, interfaces);
440 
441     // If the xyz.openbmc_project.Confguration.X interface was removed
442     // for one or more sensors, delete those sensor objects.
443     auto sensorIt = gpuDevices.begin();
444     while (sensorIt != gpuDevices.end())
445     {
446         if ((sensorIt->second->getPath() == removedPath) &&
447             (std::find(interfaces.begin(), interfaces.end(),
448                        configInterfaceName(deviceType)) != interfaces.end()))
449         {
450             sensorIt = gpuDevices.erase(sensorIt);
451         }
452         else
453         {
454             sensorIt++;
455         }
456     }
457 
458     auto smaSensorIt = smaDevices.begin();
459     while (smaSensorIt != smaDevices.end())
460     {
461         if ((smaSensorIt->second->getPath() == removedPath) &&
462             (std::find(interfaces.begin(), interfaces.end(),
463                        configInterfaceName(deviceType)) != interfaces.end()))
464         {
465             smaSensorIt = smaDevices.erase(smaSensorIt);
466         }
467         else
468         {
469             smaSensorIt++;
470         }
471     }
472 
473     auto pcieSensorIt = pcieDevices.begin();
474     while (pcieSensorIt != pcieDevices.end())
475     {
476         if ((pcieSensorIt->second->getPath() == removedPath) &&
477             (std::find(interfaces.begin(), interfaces.end(),
478                        configInterfaceName(deviceType)) != interfaces.end()))
479         {
480             pcieSensorIt = pcieDevices.erase(pcieSensorIt);
481         }
482         else
483         {
484             pcieSensorIt++;
485         }
486     }
487 }
488