1 /*
2 * SPDX-FileCopyrightText: Copyright OpenBMC Authors
3 * SPDX-License-Identifier: Apache-2.0
4 */
5
6 #include "NvidiaDeviceDiscovery.hpp"
7
8 #include "NvidiaGpuDevice.hpp"
9 #include "NvidiaPcieDevice.hpp"
10 #include "NvidiaSmaDevice.hpp"
11 #include "Utils.hpp"
12
13 #include <bits/basic_string.h>
14
15 #include <MctpRequester.hpp>
16 #include <NvidiaGpuMctpVdm.hpp>
17 #include <OcpMctpVdm.hpp>
18 #include <boost/asio/io_context.hpp>
19 #include <boost/container/flat_map.hpp>
20 #include <phosphor-logging/lg2.hpp>
21 #include <sdbusplus/asio/connection.hpp>
22 #include <sdbusplus/asio/object_server.hpp>
23 #include <sdbusplus/message.hpp>
24 #include <sdbusplus/message/native_types.hpp>
25
26 #include <algorithm>
27 #include <array>
28 #include <cstdint>
29 #include <format>
30 #include <memory>
31 #include <span>
32 #include <stdexcept>
33 #include <string>
34 #include <system_error>
35 #include <utility>
36 #include <variant>
37 #include <vector>
38
// Fallback sensor poll rate (milliseconds, per the name) that is applied when
// a configuration entry does not provide the optional "PollRate" property.
static constexpr auto sensorPollRateMs = 1000;
40
/**
 * @brief Build a device object, store it in @p devices under @p name and call
 *        its init().
 *
 * Shared by every device type handled in processQueryDeviceIdResponse(); the
 * constructor arguments are forwarded in the order
 * (configs, name, path, conn, eid, io, mctpRequester, objectServer).
 *
 * @tparam Device        Device class stored in the map (GpuDevice, SmaDevice,
 *                       PcieDevice).
 * @param devices        Map of device name to device object to insert into.
 * @param name           Key for the map; also handed to the device.
 * @param io             I/O context handed to the device.
 * @param objectServer   D-Bus object server handed to the device.
 * @param conn           D-Bus connection handed to the device.
 * @param mctpRequester  MCTP requester handed to the device.
 * @param configs        Sensor configuration handed to the device.
 * @param path           Configuration object path handed to the device.
 * @param eid            MCTP endpoint ID of the device.
 */
template <typename Device>
static void registerAndInitDevice(
    boost::container::flat_map<std::string, std::shared_ptr<Device>>& devices,
    const std::string& name, boost::asio::io_context& io,
    sdbusplus::asio::object_server& objectServer,
    const std::shared_ptr<sdbusplus::asio::connection>& conn,
    mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
    const std::string& path, uint8_t eid)
{
    // NOTE(review): insert() leaves an already existing entry with the same
    // name untouched. In that case the freshly built device is discarded and
    // init() is invoked again on the entry that was already in the map.
    // Confirm that re-initialising an existing device is the intended
    // behaviour on rediscovery.
    auto entry =
        devices
            .insert(std::make_pair(
                name, std::make_shared<Device>(configs, name, path, conn, eid,
                                               io, mctpRequester,
                                               objectServer)))
            .first;
    entry->second->init();
}

/**
 * @brief Handle the reply to a Query Device Identification request.
 *
 * Decodes the response and, depending on the reported device type, creates
 * and initialises a GpuDevice, SmaDevice or PcieDevice that is stored in the
 * matching map. Transport errors, decode errors and non-SUCCESS completion
 * codes are logged and end processing. Device types without a dedicated
 * handler are logged and ignored.
 *
 * @param io             I/O context handed to created devices.
 * @param objectServer   D-Bus object server handed to created devices.
 * @param gpuDevices     Map receiving discovered GPU devices.
 * @param smaDevices     Map receiving discovered SMA devices.
 * @param pcieDevices    Map receiving discovered PCIe devices.
 * @param conn           D-Bus connection handed to created devices.
 * @param mctpRequester  MCTP requester handed to created devices.
 * @param configs        Sensor configuration; configs.name prefixes GPU and
 *                       SMA device names.
 * @param path           Configuration object path handed to created devices.
 * @param eid            MCTP endpoint ID that answered the request.
 * @param sendRecvMsgResult  Result of the MCTP send/receive operation.
 * @param queryDeviceIdentificationResponse  Raw response bytes to decode.
 */
void processQueryDeviceIdResponse(
    boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
    boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
        gpuDevices,
    boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
        smaDevices,
    boost::container::flat_map<std::string, std::shared_ptr<PcieDevice>>&
        pcieDevices,
    const std::shared_ptr<sdbusplus::asio::connection>& conn,
    mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
    const std::string& path, uint8_t eid,
    const std::error_code& sendRecvMsgResult,
    std::span<const uint8_t> queryDeviceIdentificationResponse)
{
    if (sendRecvMsgResult)
    {
        lg2::error(
            "Error processing MCTP endpoint with eid {EID} : sending message over MCTP failed, rc={RC}",
            "EID", eid, "RC", sendRecvMsgResult.message());
        return;
    }

    ocp::accelerator_management::CompletionCode cc{};
    uint16_t reasonCode = 0;
    uint8_t responseDeviceType = 0;
    uint8_t responseInstanceId = 0;

    auto rc = gpu::decodeQueryDeviceIdentificationResponse(
        queryDeviceIdentificationResponse, cc, reasonCode, responseDeviceType,
        responseInstanceId);

    if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
    {
        lg2::error(
            "Error processing MCTP endpoint with eid {EID} : decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
            "EID", eid, "RC", rc, "CC", cc, "RESC", reasonCode);
        return;
    }

    switch (static_cast<gpu::DeviceIdentification>(responseDeviceType))
    {
        case gpu::DeviceIdentification::DEVICE_GPU:
        {
            lg2::info(
                "Found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
                "EID", eid, "DEVTYPE", responseDeviceType, "IID",
                responseInstanceId);

            registerAndInitDevice(
                gpuDevices,
                configs.name + '_' + std::to_string(responseInstanceId), io,
                objectServer, conn, mctpRequester, configs, path, eid);
            break;
        }

        case gpu::DeviceIdentification::DEVICE_SMA:
        {
            lg2::info(
                "Found the SMA Device with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
                "EID", eid, "DEVTYPE", responseDeviceType, "IID",
                responseInstanceId);

            registerAndInitDevice(
                smaDevices,
                configs.name + "_SMA_" + std::to_string(responseInstanceId),
                io, objectServer, conn, mctpRequester, configs, path, eid);
            break;
        }

        case gpu::DeviceIdentification::DEVICE_PCIE:
        {
            lg2::info(
                "Found the PCIe Device with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
                "EID", eid, "DEVTYPE", responseDeviceType, "IID",
                responseInstanceId);

            // PCIe devices use a fixed name prefix instead of configs.name.
            registerAndInitDevice(
                pcieDevices,
                std::format("Nvidia_ConnectX_{}", responseInstanceId), io,
                objectServer, conn, mctpRequester, configs, path, eid);
            break;
        }

        default:
        {
            // Without this branch an endpoint reporting any other device type
            // was dropped without a trace, which made discovery gaps hard to
            // diagnose.
            lg2::info(
                "Ignoring MCTP endpoint with EID {EID}: unsupported DeviceType {DEVTYPE}, InstanceId {IID}.",
                "EID", eid, "DEVTYPE", responseDeviceType, "IID",
                responseInstanceId);
            break;
        }
    }
}
144
queryDeviceIdentification(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,boost::container::flat_map<std::string,std::shared_ptr<PcieDevice>> & pcieDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path,uint8_t eid)145 void queryDeviceIdentification(
146 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
147 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
148 gpuDevices,
149 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
150 smaDevices,
151 boost::container::flat_map<std::string, std::shared_ptr<PcieDevice>>&
152 pcieDevices,
153 const std::shared_ptr<sdbusplus::asio::connection>& conn,
154 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
155 const std::string& path, uint8_t eid)
156 {
157 auto queryDeviceIdentificationRequest = std::make_shared<
158 std::array<uint8_t, sizeof(gpu::QueryDeviceIdentificationRequest)>>();
159
160 auto rc = gpu::encodeQueryDeviceIdentificationRequest(
161 0, *queryDeviceIdentificationRequest);
162 if (rc != 0)
163 {
164 lg2::error(
165 "Error processing MCTP endpoint with eid {EID} : encode failed, rc={RC}",
166 "EID", eid, "RC", rc);
167 return;
168 }
169
170 mctpRequester.sendRecvMsg(
171 eid, *queryDeviceIdentificationRequest,
172 [&io, &objectServer, &gpuDevices, &smaDevices, &pcieDevices, conn,
173 &mctpRequester, configs, path, eid, queryDeviceIdentificationRequest](
174 const std::error_code& ec, std::span<const uint8_t> response) {
175 processQueryDeviceIdResponse(
176 io, objectServer, gpuDevices, smaDevices, pcieDevices, conn,
177 mctpRequester, configs, path, eid, ec, response);
178 });
179 }
180
processEndpoint(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,boost::container::flat_map<std::string,std::shared_ptr<PcieDevice>> & pcieDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path,const boost::system::error_code & ec,const SensorBaseConfigMap & endpoint)181 void processEndpoint(
182 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
183 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
184 gpuDevices,
185 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
186 smaDevices,
187 boost::container::flat_map<std::string, std::shared_ptr<PcieDevice>>&
188 pcieDevices,
189 const std::shared_ptr<sdbusplus::asio::connection>& conn,
190 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
191 const std::string& path, const boost::system::error_code& ec,
192 const SensorBaseConfigMap& endpoint)
193 {
194 if (ec)
195 {
196 lg2::error("Error processing MCTP endpoint: Error:{ERROR}", "ERROR",
197 ec.message());
198 return;
199 }
200
201 auto hasEid = endpoint.find("EID");
202 uint8_t eid{};
203
204 if (hasEid != endpoint.end())
205 {
206 const auto* eidPtr = std::get_if<uint8_t>(&hasEid->second);
207 if (eidPtr != nullptr)
208 {
209 eid = *eidPtr;
210 }
211 else
212 {
213 lg2::error(
214 "Error processing MCTP endpoint: Property EID does not have valid type.");
215 return;
216 }
217 }
218 else
219 {
220 lg2::error(
221 "Error processing MCTP endpoint: Property EID not found in the configuration.");
222 return;
223 }
224
225 auto hasMctpTypes = endpoint.find("SupportedMessageTypes");
226 std::vector<uint8_t> mctpTypes{};
227
228 if (hasMctpTypes != endpoint.end())
229 {
230 const auto* mctpTypePtr =
231 std::get_if<std::vector<uint8_t>>(&hasMctpTypes->second);
232 if (mctpTypePtr != nullptr)
233 {
234 mctpTypes = *mctpTypePtr;
235 }
236 else
237 {
238 lg2::error(
239 "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes does not have valid type.",
240 "EID", eid);
241 return;
242 }
243 }
244 else
245 {
246 lg2::error(
247 "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes not found in the configuration.",
248 "EID", eid);
249 return;
250 }
251
252 if (std::find(mctpTypes.begin(), mctpTypes.end(),
253 ocp::accelerator_management::messageType) != mctpTypes.end())
254 {
255 lg2::info("Found OCP MCTP VDM Endpoint with ID {EID}", "EID", eid);
256 queryDeviceIdentification(io, objectServer, gpuDevices, smaDevices,
257 pcieDevices, conn, mctpRequester, configs,
258 path, eid);
259 }
260 }
261
queryEndpoints(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,boost::container::flat_map<std::string,std::shared_ptr<PcieDevice>> & pcieDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path,const boost::system::error_code & ec,const GetSubTreeType & ret)262 void queryEndpoints(
263 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
264 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
265 gpuDevices,
266 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
267 smaDevices,
268 boost::container::flat_map<std::string, std::shared_ptr<PcieDevice>>&
269 pcieDevices,
270 const std::shared_ptr<sdbusplus::asio::connection>& conn,
271 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
272 const std::string& path, const boost::system::error_code& ec,
273 const GetSubTreeType& ret)
274 {
275 if (ec)
276 {
277 lg2::error("Error processing MCTP endpoints: {ERROR}", "ERROR",
278 ec.message());
279 return;
280 }
281
282 if (ret.empty())
283 {
284 return;
285 }
286
287 for (const auto& [objPath, services] : ret)
288 {
289 for (const auto& [service, ifaces] : services)
290 {
291 for (const auto& iface : ifaces)
292 {
293 if (iface == "xyz.openbmc_project.MCTP.Endpoint")
294 {
295 conn->async_method_call(
296 [&io, &objectServer, &gpuDevices, &smaDevices,
297 &pcieDevices, conn, &mctpRequester, configs,
298 path](const boost::system::error_code& ec,
299 const SensorBaseConfigMap& endpoint) {
300 processEndpoint(io, objectServer, gpuDevices,
301 smaDevices, pcieDevices, conn,
302 mctpRequester, configs, path, ec,
303 endpoint);
304 },
305 service, objPath, "org.freedesktop.DBus.Properties",
306 "GetAll", iface);
307 }
308 }
309 }
310 }
311 }
312
discoverDevices(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,boost::container::flat_map<std::string,std::shared_ptr<PcieDevice>> & pcieDevices,const std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const SensorConfigs & configs,const std::string & path)313 void discoverDevices(
314 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
315 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
316 gpuDevices,
317 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
318 smaDevices,
319 boost::container::flat_map<std::string, std::shared_ptr<PcieDevice>>&
320 pcieDevices,
321 const std::shared_ptr<sdbusplus::asio::connection>& conn,
322 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
323 const std::string& path)
324 {
325 std::string searchPath{"/au/com/codeconstruct/"};
326 std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}};
327
328 conn->async_method_call(
329 [&io, &objectServer, &gpuDevices, &smaDevices, &pcieDevices, conn,
330 &mctpRequester, configs,
331 path](const boost::system::error_code& ec, const GetSubTreeType& ret) {
332 queryEndpoints(io, objectServer, gpuDevices, smaDevices,
333 pcieDevices, conn, mctpRequester, configs, path, ec,
334 ret);
335 },
336 "xyz.openbmc_project.ObjectMapper",
337 "/xyz/openbmc_project/object_mapper",
338 "xyz.openbmc_project.ObjectMapper", "GetSubTree", searchPath, 0,
339 ifaceList);
340 }
341
processSensorConfigs(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,boost::container::flat_map<std::string,std::shared_ptr<PcieDevice>> & pcieDevices,const std::shared_ptr<sdbusplus::asio::connection> & dbusConnection,mctp::MctpRequester & mctpRequester,const ManagedObjectType & resp)342 void processSensorConfigs(
343 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
344 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
345 gpuDevices,
346 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
347 smaDevices,
348 boost::container::flat_map<std::string, std::shared_ptr<PcieDevice>>&
349 pcieDevices,
350 const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
351 mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
352 {
353 for (const auto& [path, interfaces] : resp)
354 {
355 for (const auto& [intf, cfg] : interfaces)
356 {
357 if (intf != configInterfaceName(deviceType))
358 {
359 continue;
360 }
361
362 SensorConfigs configs;
363
364 configs.name = loadVariant<std::string>(cfg, "Name");
365
366 try
367 {
368 configs.pollRate = loadVariant<uint64_t>(cfg, "PollRate");
369 }
370 catch (const std::invalid_argument&)
371 {
372 // PollRate is an optional config
373 configs.pollRate = sensorPollRateMs;
374 }
375
376 discoverDevices(io, objectServer, gpuDevices, smaDevices,
377 pcieDevices, dbusConnection, mctpRequester, configs,
378 path);
379
380 lg2::info(
381 "Detected configuration {NAME} of type {TYPE} at path: {PATH}.",
382 "NAME", configs.name, "TYPE", deviceType, "PATH", path);
383 }
384 }
385 }
386
createSensors(boost::asio::io_context & io,sdbusplus::asio::object_server & objectServer,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,boost::container::flat_map<std::string,std::shared_ptr<PcieDevice>> & pcieDevices,const std::shared_ptr<sdbusplus::asio::connection> & dbusConnection,mctp::MctpRequester & mctpRequester)387 void createSensors(
388 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
389 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
390 gpuDevices,
391 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
392 smaDevices,
393 boost::container::flat_map<std::string, std::shared_ptr<PcieDevice>>&
394 pcieDevices,
395 const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
396 mctp::MctpRequester& mctpRequester)
397 {
398 if (!dbusConnection)
399 {
400 lg2::error("Connection not created");
401 return;
402 }
403 dbusConnection->async_method_call(
404 [&gpuDevices, &smaDevices, &pcieDevices, &mctpRequester, dbusConnection,
405 &io, &objectServer](boost::system::error_code ec,
406 const ManagedObjectType& resp) {
407 if (ec)
408 {
409 lg2::error("Error contacting entity manager");
410 return;
411 }
412
413 processSensorConfigs(io, objectServer, gpuDevices, smaDevices,
414 pcieDevices, dbusConnection, mctpRequester,
415 resp);
416 },
417 entityManagerName, "/xyz/openbmc_project/inventory",
418 "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
419 }
420
interfaceRemoved(sdbusplus::message_t & message,boost::container::flat_map<std::string,std::shared_ptr<GpuDevice>> & gpuDevices,boost::container::flat_map<std::string,std::shared_ptr<SmaDevice>> & smaDevices,boost::container::flat_map<std::string,std::shared_ptr<PcieDevice>> & pcieDevices)421 void interfaceRemoved(
422 sdbusplus::message_t& message,
423 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
424 gpuDevices,
425 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
426 smaDevices,
427 boost::container::flat_map<std::string, std::shared_ptr<PcieDevice>>&
428 pcieDevices)
429 {
430 if (message.is_method_error())
431 {
432 lg2::error("interfacesRemoved callback method error");
433 return;
434 }
435
436 sdbusplus::message::object_path removedPath;
437 std::vector<std::string> interfaces;
438
439 message.read(removedPath, interfaces);
440
441 // If the xyz.openbmc_project.Confguration.X interface was removed
442 // for one or more sensors, delete those sensor objects.
443 auto sensorIt = gpuDevices.begin();
444 while (sensorIt != gpuDevices.end())
445 {
446 if ((sensorIt->second->getPath() == removedPath) &&
447 (std::find(interfaces.begin(), interfaces.end(),
448 configInterfaceName(deviceType)) != interfaces.end()))
449 {
450 sensorIt = gpuDevices.erase(sensorIt);
451 }
452 else
453 {
454 sensorIt++;
455 }
456 }
457
458 auto smaSensorIt = smaDevices.begin();
459 while (smaSensorIt != smaDevices.end())
460 {
461 if ((smaSensorIt->second->getPath() == removedPath) &&
462 (std::find(interfaces.begin(), interfaces.end(),
463 configInterfaceName(deviceType)) != interfaces.end()))
464 {
465 smaSensorIt = smaDevices.erase(smaSensorIt);
466 }
467 else
468 {
469 smaSensorIt++;
470 }
471 }
472
473 auto pcieSensorIt = pcieDevices.begin();
474 while (pcieSensorIt != pcieDevices.end())
475 {
476 if ((pcieSensorIt->second->getPath() == removedPath) &&
477 (std::find(interfaces.begin(), interfaces.end(),
478 configInterfaceName(deviceType)) != interfaces.end()))
479 {
480 pcieSensorIt = pcieDevices.erase(pcieSensorIt);
481 }
482 else
483 {
484 pcieSensorIt++;
485 }
486 }
487 }
488