1 /*
2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
3 * AFFILIATES. All rights reserved.
4 * SPDX-License-Identifier: Apache-2.0
5 */
6
7 #include "NvidiaDeviceDiscovery.hpp"
8
9 #include "NvidiaGpuDevice.hpp"
10 #include "NvidiaSmaDevice.hpp"
11 #include "Utils.hpp"
12
13 #include <bits/basic_string.h>
14
15 #include <MctpRequester.hpp>
16 #include <NvidiaGpuMctpVdm.hpp>
17 #include <OcpMctpVdm.hpp>
18 #include <boost/asio/io_context.hpp>
19 #include <boost/container/flat_map.hpp>
20 #include <phosphor-logging/lg2.hpp>
21 #include <sdbusplus/asio/connection.hpp>
22 #include <sdbusplus/asio/object_server.hpp>
23 #include <sdbusplus/message.hpp>
24 #include <sdbusplus/message/native_types.hpp>
25
26 #include <algorithm>
27 #include <array>
28 #include <cstdint>
29 #include <memory>
30 #include <span>
31 #include <stdexcept>
32 #include <string>
33 #include <system_error>
34 #include <utility>
35 #include <variant>
36 #include <vector>
37
// Polling period, in milliseconds, used as the fallback when a
// configuration's optional "PollRate" property cannot be loaded.
static constexpr int sensorPollRateMs{1000};
39
processQueryDeviceIdResponse(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path,uint8_t eid,const std::error_code & sendRecvMsgResult,std::span<const uint8_t> queryDeviceIdentificationResponse)40 void processQueryDeviceIdResponse(
41 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
42 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
43 gpuDevices,
44 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
45 smaDevices,
46 const std::shared_ptr<sdbusplus::asio::connection>& conn,
47 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
48 const std::string& path, uint8_t eid,
49 const std::error_code& sendRecvMsgResult,
50 std::span<const uint8_t> queryDeviceIdentificationResponse)
51 {
52 if (sendRecvMsgResult)
53 {
54 lg2::error(
55 "Error processing MCTP endpoint with eid {EID} : sending message over MCTP failed, rc={RC}",
56 "EID", eid, "RC", sendRecvMsgResult.message());
57 return;
58 }
59
60 ocp::accelerator_management::CompletionCode cc{};
61 uint16_t reasonCode = 0;
62 uint8_t responseDeviceType = 0;
63 uint8_t responseInstanceId = 0;
64
65 auto rc = gpu::decodeQueryDeviceIdentificationResponse(
66 queryDeviceIdentificationResponse, cc, reasonCode, responseDeviceType,
67 responseInstanceId);
68
69 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
70 {
71 lg2::error(
72 "Error processing MCTP endpoint with eid {EID} : decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
73 "EID", eid, "RC", rc, "CC", cc, "RESC", reasonCode);
74 return;
75 }
76
77 switch (static_cast<gpu::DeviceIdentification>(responseDeviceType))
78 {
79 case gpu::DeviceIdentification::DEVICE_GPU:
80 {
81 lg2::info(
82 "Found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
83 "EID", eid, "DEVTYPE", responseDeviceType, "IID",
84 responseInstanceId);
85
86 auto gpuName = configs.name + '_' +
87 std::to_string(responseInstanceId);
88
89 auto gpu = gpuDevices
90 .insert(std::make_pair(
91 gpuName, std::make_shared<GpuDevice>(
92 configs, gpuName, path, conn, eid,
93 io, mctpRequester, objectServer)))
94 .first;
95 (*gpu).second->init();
96 break;
97 }
98
99 case gpu::DeviceIdentification::DEVICE_SMA:
100 {
101 lg2::info(
102 "Found the SMA Device with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
103 "EID", eid, "DEVTYPE", responseDeviceType, "IID",
104 responseInstanceId);
105
106 auto smaName = configs.name + "_SMA_" +
107 std::to_string(responseInstanceId);
108
109 auto sma = smaDevices
110 .insert(std::make_pair(
111 smaName, std::make_shared<SmaDevice>(
112 configs, smaName, path, conn, eid,
113 io, mctpRequester, objectServer)))
114 .first;
115 (*sma).second->init();
116 break;
117 }
118 }
119 }
120
queryDeviceIdentification(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path,uint8_t eid)121 void queryDeviceIdentification(
122 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
123 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
124 gpuDevices,
125 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
126 smaDevices,
127 const std::shared_ptr<sdbusplus::asio::connection>& conn,
128 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
129 const std::string& path, uint8_t eid)
130 {
131 auto queryDeviceIdentificationRequest = std::make_shared<
132 std::array<uint8_t, sizeof(gpu::QueryDeviceIdentificationRequest)>>();
133
134 auto rc = gpu::encodeQueryDeviceIdentificationRequest(
135 0, *queryDeviceIdentificationRequest);
136 if (rc != 0)
137 {
138 lg2::error(
139 "Error processing MCTP endpoint with eid {EID} : encode failed, rc={RC}",
140 "EID", eid, "RC", rc);
141 return;
142 }
143
144 mctpRequester.sendRecvMsg(
145 eid, *queryDeviceIdentificationRequest,
146 [&io, &objectServer, &gpuDevices, &smaDevices, conn, &mctpRequester,
147 configs, path, eid, queryDeviceIdentificationRequest](
148 const std::error_code& ec, std::span<const uint8_t> response) {
149 processQueryDeviceIdResponse(io, objectServer, gpuDevices,
150 smaDevices, conn, mctpRequester,
151 configs, path, eid, ec, response);
152 });
153 }
154
processEndpoint(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path,const boost::system::error_code & ec,const SensorBaseConfigMap & endpoint)155 void processEndpoint(
156 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
157 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
158 gpuDevices,
159 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
160 smaDevices,
161 const std::shared_ptr<sdbusplus::asio::connection>& conn,
162 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
163 const std::string& path, const boost::system::error_code& ec,
164 const SensorBaseConfigMap& endpoint)
165 {
166 if (ec)
167 {
168 lg2::error("Error processing MCTP endpoint: Error:{ERROR}", "ERROR",
169 ec.message());
170 return;
171 }
172
173 auto hasEid = endpoint.find("EID");
174 uint8_t eid{};
175
176 if (hasEid != endpoint.end())
177 {
178 const auto* eidPtr = std::get_if<uint8_t>(&hasEid->second);
179 if (eidPtr != nullptr)
180 {
181 eid = *eidPtr;
182 }
183 else
184 {
185 lg2::error(
186 "Error processing MCTP endpoint: Property EID does not have valid type.");
187 return;
188 }
189 }
190 else
191 {
192 lg2::error(
193 "Error processing MCTP endpoint: Property EID not found in the configuration.");
194 return;
195 }
196
197 auto hasMctpTypes = endpoint.find("SupportedMessageTypes");
198 std::vector<uint8_t> mctpTypes{};
199
200 if (hasMctpTypes != endpoint.end())
201 {
202 const auto* mctpTypePtr =
203 std::get_if<std::vector<uint8_t>>(&hasMctpTypes->second);
204 if (mctpTypePtr != nullptr)
205 {
206 mctpTypes = *mctpTypePtr;
207 }
208 else
209 {
210 lg2::error(
211 "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes does not have valid type.",
212 "EID", eid);
213 return;
214 }
215 }
216 else
217 {
218 lg2::error(
219 "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes not found in the configuration.",
220 "EID", eid);
221 return;
222 }
223
224 if (std::find(mctpTypes.begin(), mctpTypes.end(),
225 ocp::accelerator_management::messageType) != mctpTypes.end())
226 {
227 lg2::info("Found OCP MCTP VDM Endpoint with ID {EID}", "EID", eid);
228 queryDeviceIdentification(io, objectServer, gpuDevices, smaDevices,
229 conn, mctpRequester, configs, path, eid);
230 }
231 }
232
queryEndpoints(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path,const boost::system::error_code & ec,const GetSubTreeType & ret)233 void queryEndpoints(
234 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
235 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
236 gpuDevices,
237 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
238 smaDevices,
239 const std::shared_ptr<sdbusplus::asio::connection>& conn,
240 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
241 const std::string& path, const boost::system::error_code& ec,
242 const GetSubTreeType& ret)
243 {
244 if (ec)
245 {
246 lg2::error("Error processing MCTP endpoints: {ERROR}", "ERROR",
247 ec.message());
248 return;
249 }
250
251 if (ret.empty())
252 {
253 return;
254 }
255
256 for (const auto& [objPath, services] : ret)
257 {
258 for (const auto& [service, ifaces] : services)
259 {
260 for (const auto& iface : ifaces)
261 {
262 if (iface == "xyz.openbmc_project.MCTP.Endpoint")
263 {
264 conn->async_method_call(
265 [&io, &objectServer, &gpuDevices, &smaDevices, conn,
266 &mctpRequester, configs,
267 path](const boost::system::error_code& ec,
268 const SensorBaseConfigMap& endpoint) {
269 processEndpoint(io, objectServer, gpuDevices,
270 smaDevices, conn, mctpRequester,
271 configs, path, ec, endpoint);
272 },
273 service, objPath, "org.freedesktop.DBus.Properties",
274 "GetAll", iface);
275 }
276 }
277 }
278 }
279 }
280
discoverDevices(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path)281 void discoverDevices(
282 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
283 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
284 gpuDevices,
285 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
286 smaDevices,
287 const std::shared_ptr<sdbusplus::asio::connection>& conn,
288 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
289 const std::string& path)
290 {
291 std::string searchPath{"/au/com/codeconstruct/"};
292 std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}};
293
294 conn->async_method_call(
295 [&io, &objectServer, &gpuDevices, &smaDevices, conn, &mctpRequester,
296 configs,
297 path](const boost::system::error_code& ec, const GetSubTreeType& ret) {
298 queryEndpoints(io, objectServer, gpuDevices, smaDevices, conn,
299 mctpRequester, configs, path, ec, ret);
300 },
301 "xyz.openbmc_project.ObjectMapper",
302 "/xyz/openbmc_project/object_mapper",
303 "xyz.openbmc_project.ObjectMapper", "GetSubTree", searchPath, 0,
304 ifaceList);
305 }
306
processSensorConfigs(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,const std::shared_ptr<sdbusplus::asio::connection> & dbusConnection,mctp::MctpRequester & mctpRequester,const ManagedObjectType & resp)307 void processSensorConfigs(
308 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
309 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
310 gpuDevices,
311 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
312 smaDevices,
313 const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
314 mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
315 {
316 for (const auto& [path, interfaces] : resp)
317 {
318 for (const auto& [intf, cfg] : interfaces)
319 {
320 if (intf != configInterfaceName(deviceType))
321 {
322 continue;
323 }
324
325 SensorConfigs configs;
326
327 configs.name = loadVariant<std::string>(cfg, "Name");
328
329 try
330 {
331 configs.pollRate = loadVariant<uint64_t>(cfg, "PollRate");
332 }
333 catch (const std::invalid_argument&)
334 {
335 // PollRate is an optional config
336 configs.pollRate = sensorPollRateMs;
337 }
338
339 discoverDevices(io, objectServer, gpuDevices, smaDevices,
340 dbusConnection, mctpRequester, configs, path);
341
342 lg2::info(
343 "Detected configuration {NAME} of type {TYPE} at path: {PATH}.",
344 "NAME", configs.name, "TYPE", deviceType, "PATH", path);
345 }
346 }
347 }
348
createSensors(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,const std::shared_ptr<sdbusplus::asio::connection> & dbusConnection,mctp::MctpRequester & mctpRequester)349 void createSensors(
350 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
351 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
352 gpuDevices,
353 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
354 smaDevices,
355 const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
356 mctp::MctpRequester& mctpRequester)
357 {
358 if (!dbusConnection)
359 {
360 lg2::error("Connection not created");
361 return;
362 }
363 dbusConnection->async_method_call(
364 [&gpuDevices, &smaDevices, &mctpRequester, dbusConnection, &io,
365 &objectServer](boost::system::error_code ec,
366 const ManagedObjectType& resp) {
367 if (ec)
368 {
369 lg2::error("Error contacting entity manager");
370 return;
371 }
372
373 processSensorConfigs(io, objectServer, gpuDevices, smaDevices,
374 dbusConnection, mctpRequester, resp);
375 },
376 entityManagerName, "/xyz/openbmc_project/inventory",
377 "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
378 }
379
interfaceRemoved(sdbusplus::message_t & message,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices)380 void interfaceRemoved(
381 sdbusplus::message_t& message,
382 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
383 gpuDevices,
384 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
385 smaDevices)
386 {
387 if (message.is_method_error())
388 {
389 lg2::error("interfacesRemoved callback method error");
390 return;
391 }
392
393 sdbusplus::message::object_path removedPath;
394 std::vector<std::string> interfaces;
395
396 message.read(removedPath, interfaces);
397
398 // If the xyz.openbmc_project.Confguration.X interface was removed
399 // for one or more sensors, delete those sensor objects.
400 auto sensorIt = gpuDevices.begin();
401 while (sensorIt != gpuDevices.end())
402 {
403 if ((sensorIt->second->getPath() == removedPath) &&
404 (std::find(interfaces.begin(), interfaces.end(),
405 configInterfaceName(deviceType)) != interfaces.end()))
406 {
407 sensorIt = gpuDevices.erase(sensorIt);
408 }
409 else
410 {
411 sensorIt++;
412 }
413 }
414
415 auto smaSensorIt = smaDevices.begin();
416 while (smaSensorIt != smaDevices.end())
417 {
418 if ((smaSensorIt->second->getPath() == removedPath) &&
419 (std::find(interfaces.begin(), interfaces.end(),
420 configInterfaceName(deviceType)) != interfaces.end()))
421 {
422 smaSensorIt = smaDevices.erase(smaSensorIt);
423 }
424 else
425 {
426 smaSensorIt++;
427 }
428 }
429 }
430