1 /* 2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & 3 * AFFILIATES. All rights reserved. 4 * SPDX-License-Identifier: Apache-2.0 5 */ 6 7 #include "NvidiaDeviceDiscovery.hpp" 8 9 #include "NvidiaGpuDevice.hpp" 10 #include "NvidiaSmaDevice.hpp" 11 #include "Utils.hpp" 12 13 #include <bits/basic_string.h> 14 15 #include <MctpRequester.hpp> 16 #include <NvidiaGpuMctpVdm.hpp> 17 #include <OcpMctpVdm.hpp> 18 #include <boost/asio/io_context.hpp> 19 #include <boost/container/flat_map.hpp> 20 #include <phosphor-logging/lg2.hpp> 21 #include <sdbusplus/asio/connection.hpp> 22 #include <sdbusplus/asio/object_server.hpp> 23 #include <sdbusplus/message.hpp> 24 #include <sdbusplus/message/native_types.hpp> 25 26 #include <algorithm> 27 #include <array> 28 #include <cstdint> 29 #include <memory> 30 #include <span> 31 #include <stdexcept> 32 #include <string> 33 #include <utility> 34 #include <variant> 35 #include <vector> 36 37 static constexpr auto sensorPollRateMs = 1000; 38 39 void processQueryDeviceIdResponse( 40 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer, 41 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>& 42 gpuDevices, 43 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>& 44 smaDevices, 45 const std::shared_ptr<sdbusplus::asio::connection>& conn, 46 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs, 47 const std::string& path, uint8_t eid, int sendRecvMsgResult, 48 std::span<uint8_t> queryDeviceIdentificationResponse) 49 { 50 if (sendRecvMsgResult != 0) 51 { 52 lg2::error( 53 "Error processing MCTP endpoint with eid {EID} : sending message over MCTP failed, rc={RC}", 54 "EID", eid, "RC", sendRecvMsgResult); 55 return; 56 } 57 58 ocp::accelerator_management::CompletionCode cc{}; 59 uint16_t reasonCode = 0; 60 uint8_t responseDeviceType = 0; 61 uint8_t responseInstanceId = 0; 62 63 auto rc = gpu::decodeQueryDeviceIdentificationResponse( 64 queryDeviceIdentificationResponse, cc, reasonCode, responseDeviceType, 65 responseInstanceId); 66 67 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS) 68 { 69 lg2::error( 70 "Error processing MCTP endpoint with eid {EID} : decode failed, rc={RC}, cc={CC}, reasonCode={RESC}", 71 "EID", eid, "RC", rc, "CC", cc, "RESC", reasonCode); 72 return; 73 } 74 75 switch (static_cast<gpu::DeviceIdentification>(responseDeviceType)) 76 { 77 case gpu::DeviceIdentification::DEVICE_GPU: 78 { 79 lg2::info( 80 "Found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.", 81 "EID", eid, "DEVTYPE", responseDeviceType, "IID", 82 responseInstanceId); 83 84 auto gpuName = configs.name + '_' + 85 std::to_string(responseInstanceId); 86 87 gpuDevices[gpuName] = 88 std::make_shared<GpuDevice>(configs, gpuName, path, conn, eid, 89 io, mctpRequester, objectServer); 90 break; 91 } 92 93 case gpu::DeviceIdentification::DEVICE_SMA: 94 { 95 lg2::info( 96 "Found the SMA Device with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.", 97 "EID", eid, "DEVTYPE", responseDeviceType, "IID", 98 responseInstanceId); 99 100 auto smaName = configs.name + "_SMA_" + 101 std::to_string(responseInstanceId); 102 103 smaDevices[smaName] = 104 std::make_shared<SmaDevice>(configs, smaName, path, conn, eid, 105 io, mctpRequester, objectServer); 106 break; 107 } 108 } 109 } 110 111 void queryDeviceIdentification( 112 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer, 113 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>& 114 gpuDevices, 115 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>& 116 smaDevices, 117 const std::shared_ptr<sdbusplus::asio::connection>& conn, 118 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs, 119 const std::string& path, uint8_t eid) 120 { 121 auto queryDeviceIdentificationRequest = std::make_shared< 122 std::array<uint8_t, sizeof(gpu::QueryDeviceIdentificationRequest)>>(); 123 124 auto queryDeviceIdentificationResponse = std::make_shared< 125 std::array<uint8_t, sizeof(gpu::QueryDeviceIdentificationResponse)>>(); 126 127 auto rc = gpu::encodeQueryDeviceIdentificationRequest( 128 0, *queryDeviceIdentificationRequest); 129 if (rc != 0) 130 { 131 lg2::error( 132 "Error processing MCTP endpoint with eid {EID} : encode failed, rc={RC}", 133 "EID", eid, "RC", rc); 134 return; 135 } 136 137 mctpRequester.sendRecvMsg( 138 eid, *queryDeviceIdentificationRequest, 139 *queryDeviceIdentificationResponse, 140 [&io, &objectServer, &gpuDevices, &smaDevices, conn, &mctpRequester, 141 configs, path, eid, queryDeviceIdentificationRequest, 142 queryDeviceIdentificationResponse](int sendRecvMsgResult) { 143 processQueryDeviceIdResponse( 144 io, objectServer, gpuDevices, smaDevices, conn, mctpRequester, 145 configs, path, eid, sendRecvMsgResult, 146 *queryDeviceIdentificationResponse); 147 }); 148 } 149 150 void processEndpoint( 151 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer, 152 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>& 153 gpuDevices, 154 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>& 155 smaDevices, 156 const std::shared_ptr<sdbusplus::asio::connection>& conn, 157 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs, 158 const std::string& path, const boost::system::error_code& ec, 159 const SensorBaseConfigMap& endpoint) 160 { 161 if (ec) 162 { 163 lg2::error("Error processing MCTP endpoint: Error:{ERROR}", "ERROR", 164 ec.message()); 165 return; 166 } 167 168 auto hasEid = endpoint.find("EID"); 169 uint8_t eid{}; 170 171 if (hasEid != endpoint.end()) 172 { 173 const auto* eidPtr = std::get_if<uint8_t>(&hasEid->second); 174 if (eidPtr != nullptr) 175 { 176 eid = *eidPtr; 177 } 178 else 179 { 180 lg2::error( 181 "Error processing MCTP endpoint: Property EID does not have valid type."); 182 return; 183 } 184 } 185 else 186 { 187 lg2::error( 188 "Error processing MCTP endpoint: Property EID not found in the configuration."); 189 return; 190 } 191 192 auto hasMctpTypes = endpoint.find("SupportedMessageTypes"); 193 std::vector<uint8_t> mctpTypes{}; 194 195 if (hasMctpTypes != endpoint.end()) 196 { 197 const auto* mctpTypePtr = 198 std::get_if<std::vector<uint8_t>>(&hasMctpTypes->second); 199 if (mctpTypePtr != nullptr) 200 { 201 mctpTypes = *mctpTypePtr; 202 } 203 else 204 { 205 lg2::error( 206 "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes does not have valid type.", 207 "EID", eid); 208 return; 209 } 210 } 211 else 212 { 213 lg2::error( 214 "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes not found in the configuration.", 215 "EID", eid); 216 return; 217 } 218 219 if (std::find(mctpTypes.begin(), mctpTypes.end(), 220 ocp::accelerator_management::messageType) != mctpTypes.end()) 221 { 222 lg2::info("Found OCP MCTP VDM Endpoint with ID {EID}", "EID", eid); 223 queryDeviceIdentification(io, objectServer, gpuDevices, smaDevices, 224 conn, mctpRequester, configs, path, eid); 225 } 226 } 227 228 void queryEndpoints( 229 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer, 230 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>& 231 gpuDevices, 232 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>& 233 smaDevices, 234 const std::shared_ptr<sdbusplus::asio::connection>& conn, 235 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs, 236 const std::string& path, const boost::system::error_code& ec, 237 const GetSubTreeType& ret) 238 { 239 if (ec) 240 { 241 lg2::error("Error processing MCTP endpoints: {ERROR}", "ERROR", 242 ec.message()); 243 return; 244 } 245 246 if (ret.empty()) 247 { 248 return; 249 } 250 251 for (const auto& [objPath, services] : ret) 252 { 253 for (const auto& [service, ifaces] : services) 254 { 255 for (const auto& iface : ifaces) 256 { 257 if (iface == "xyz.openbmc_project.MCTP.Endpoint") 258 { 259 conn->async_method_call( 260 [&io, &objectServer, &gpuDevices, &smaDevices, conn, 261 &mctpRequester, configs, 262 path](const boost::system::error_code& ec, 263 const SensorBaseConfigMap& endpoint) { 264 processEndpoint(io, objectServer, gpuDevices, 265 smaDevices, conn, mctpRequester, 266 configs, path, ec, endpoint); 267 }, 268 service, objPath, "org.freedesktop.DBus.Properties", 269 "GetAll", iface); 270 } 271 } 272 } 273 } 274 } 275 276 void discoverDevices( 277 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer, 278 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>& 279 gpuDevices, 280 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>& 281 smaDevices, 282 const std::shared_ptr<sdbusplus::asio::connection>& conn, 283 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs, 284 const std::string& path) 285 { 286 std::string searchPath{"/au/com/codeconstruct/"}; 287 std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}}; 288 289 conn->async_method_call( 290 [&io, &objectServer, &gpuDevices, &smaDevices, conn, &mctpRequester, 291 configs, 292 path](const boost::system::error_code& ec, const GetSubTreeType& ret) { 293 queryEndpoints(io, objectServer, gpuDevices, smaDevices, conn, 294 mctpRequester, configs, path, ec, ret); 295 }, 296 "xyz.openbmc_project.ObjectMapper", 297 "/xyz/openbmc_project/object_mapper", 298 "xyz.openbmc_project.ObjectMapper", "GetSubTree", searchPath, 0, 299 ifaceList); 300 } 301 302 void processSensorConfigs( 303 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer, 304 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>& 305 gpuDevices, 306 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>& 307 smaDevices, 308 const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection, 309 mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp) 310 { 311 for (const auto& [path, interfaces] : resp) 312 { 313 for (const auto& [intf, cfg] : interfaces) 314 { 315 if (intf != configInterfaceName(deviceType)) 316 { 317 continue; 318 } 319 320 SensorConfigs configs; 321 322 configs.name = loadVariant<std::string>(cfg, "Name"); 323 324 try 325 { 326 configs.pollRate = loadVariant<uint64_t>(cfg, "PollRate"); 327 } 328 catch (const std::invalid_argument&) 329 { 330 // PollRate is an optional config 331 configs.pollRate = sensorPollRateMs; 332 } 333 334 discoverDevices(io, objectServer, gpuDevices, smaDevices, 335 dbusConnection, mctpRequester, configs, path); 336 337 lg2::info( 338 "Detected configuration {NAME} of type {TYPE} at path: {PATH}.", 339 "NAME", configs.name, "TYPE", deviceType, "PATH", path); 340 } 341 } 342 } 343 344 void createSensors( 345 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer, 346 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>& 347 gpuDevices, 348 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>& 349 smaDevices, 350 const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection, 351 mctp::MctpRequester& mctpRequester) 352 { 353 if (!dbusConnection) 354 { 355 lg2::error("Connection not created"); 356 return; 357 } 358 dbusConnection->async_method_call( 359 [&gpuDevices, &smaDevices, &mctpRequester, dbusConnection, &io, 360 &objectServer](boost::system::error_code ec, 361 const ManagedObjectType& resp) { 362 if (ec) 363 { 364 lg2::error("Error contacting entity manager"); 365 return; 366 } 367 368 processSensorConfigs(io, objectServer, gpuDevices, smaDevices, 369 dbusConnection, mctpRequester, resp); 370 }, 371 entityManagerName, "/xyz/openbmc_project/inventory", 372 "org.freedesktop.DBus.ObjectManager", "GetManagedObjects"); 373 } 374 375 void interfaceRemoved( 376 sdbusplus::message_t& message, 377 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>& 378 gpuDevices, 379 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>& 380 smaDevices) 381 { 382 if (message.is_method_error()) 383 { 384 lg2::error("interfacesRemoved callback method error"); 385 return; 386 } 387 388 sdbusplus::message::object_path removedPath; 389 std::vector<std::string> interfaces; 390 391 message.read(removedPath, interfaces); 392 393 // If the xyz.openbmc_project.Confguration.X interface was removed 394 // for one or more sensors, delete those sensor objects. 395 auto sensorIt = gpuDevices.begin(); 396 while (sensorIt != gpuDevices.end()) 397 { 398 if ((sensorIt->second->getPath() == removedPath) && 399 (std::find(interfaces.begin(), interfaces.end(), 400 configInterfaceName(deviceType)) != interfaces.end())) 401 { 402 sensorIt = gpuDevices.erase(sensorIt); 403 } 404 else 405 { 406 sensorIt++; 407 } 408 } 409 410 auto smaSensorIt = smaDevices.begin(); 411 while (smaSensorIt != smaDevices.end()) 412 { 413 if ((smaSensorIt->second->getPath() == removedPath) && 414 (std::find(interfaces.begin(), interfaces.end(), 415 configInterfaceName(deviceType)) != interfaces.end())) 416 { 417 smaSensorIt = smaDevices.erase(smaSensorIt); 418 } 419 else 420 { 421 smaSensorIt++; 422 } 423 } 424 } 425