xref: /openbmc/dbus-sensors/src/nvidia-gpu/NvidiaPcieDevice.cpp (revision 7427aeef4225bf23715539b195a23bce10865265)
1 /*
2  * SPDX-FileCopyrightText: Copyright OpenBMC Authors
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #include "NvidiaPcieDevice.hpp"
7 
8 #include "NvidiaDeviceDiscovery.hpp"
9 #include "NvidiaEthPort.hpp"
10 #include "NvidiaGpuMctpVdm.hpp"
11 #include "NvidiaPcieInterface.hpp"
12 #include "NvidiaPciePort.hpp"
13 #include "NvidiaPciePortMetrics.hpp"
14 #include "Utils.hpp"
15 
16 #include <MctpRequester.hpp>
17 #include <OcpMctpVdm.hpp>
18 #include <boost/asio/io_context.hpp>
19 #include <phosphor-logging/lg2.hpp>
20 #include <sdbusplus/asio/connection.hpp>
21 #include <sdbusplus/asio/object_server.hpp>
22 #include <sdbusplus/message/native_types.hpp>
23 
24 #include <chrono>
25 #include <cstdint>
26 #include <format>
27 #include <memory>
28 #include <span>
29 #include <string>
30 #include <system_error>
31 #include <utility>
32 #include <vector>
33 
PcieDevice(const SensorConfigs & configs,const std::string & name,const std::string & path,const std::shared_ptr<sdbusplus::asio::connection> & conn,uint8_t eid,boost::asio::io_context & io,mctp::MctpRequester & mctpRequester,sdbusplus::asio::object_server & objectServer)34 PcieDevice::PcieDevice(const SensorConfigs& configs, const std::string& name,
35                        const std::string& path,
36                        const std::shared_ptr<sdbusplus::asio::connection>& conn,
37                        uint8_t eid, boost::asio::io_context& io,
38                        mctp::MctpRequester& mctpRequester,
39                        sdbusplus::asio::object_server& objectServer) :
40     eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
41     waitTimer(io, std::chrono::steady_clock::duration(0)),
42     mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
43     configs(configs), name(escapeName(name)), path(path)
44 {}
45 
init()46 void PcieDevice::init()
47 {
48     sdbusplus::message::object_path networkAdapterPath =
49         sdbusplus::message::object_path(nicPathPrefix) / (name + "_NIC");
50 
51     networkAdapterInterface = objectServer.add_interface(
52         networkAdapterPath,
53         "xyz.openbmc_project.Inventory.Item.NetworkAdapter");
54 
55     std::vector<Association> associations;
56     associations.emplace_back(
57         "contained_by", "containing",
58         sdbusplus::message::object_path(path).parent_path());
59 
60     networkAdapterAssociationInterface =
61         objectServer.add_interface(networkAdapterPath, association::interface);
62     networkAdapterAssociationInterface->register_property(
63         "Associations", associations);
64 
65     if (!networkAdapterInterface->initialize())
66     {
67         lg2::error(
68             "Failed to initialize network adapter interface for for eid {EID}",
69             "EID", eid);
70     }
71 
72     if (!networkAdapterAssociationInterface->initialize())
73     {
74         lg2::error(
75             "Error initializing Association Interface for Network Adapter for eid {EID}",
76             "EID", eid);
77     }
78 
79     getPciePortCounts();
80 
81     for (uint64_t k = 0; k < configs.nicNetworkPortCount; ++k)
82     {
83         getNetworkPortAddresses(static_cast<uint16_t>(k + 1));
84     }
85 }
86 
getPciePortCounts()87 void PcieDevice::getPciePortCounts()
88 {
89     const int rc = gpu::encodeListPciePortsRequest(0, getPciePortCountsRequest);
90 
91     if (rc != 0)
92     {
93         lg2::error(
94             "Error updating PCIe Port Counts: encode failed, rc={RC}, EID={EID}",
95             "RC", rc, "EID", eid);
96         return;
97     }
98 
99     mctpRequester.sendRecvMsg(
100         eid, getPciePortCountsRequest,
101         [weak{weak_from_this()}](const std::error_code& ec,
102                                  std::span<const uint8_t> buffer) {
103             std::shared_ptr<PcieDevice> self = weak.lock();
104             if (!self)
105             {
106                 lg2::error("Invalid reference to PcieDevice");
107                 return;
108             }
109             self->processPciePortCountsResponse(ec, buffer);
110         });
111 }
112 
processPciePortCountsResponse(const std::error_code & ec,std::span<const uint8_t> response)113 void PcieDevice::processPciePortCountsResponse(
114     const std::error_code& ec, std::span<const uint8_t> response)
115 {
116     if (ec)
117     {
118         lg2::error(
119             "Error processing PCIe Port Counts response: sending message over MCTP failed, rc={RC}, EID={EID}",
120             "RC", ec.message(), "EID", eid);
121         return;
122     }
123 
124     ocp::accelerator_management::CompletionCode cc{};
125     uint16_t reasonCode = 0;
126 
127     const int rc = gpu::decodeListPciePortsResponse(
128         response, cc, reasonCode, pcieDeviceInfo.numUpstreamPorts,
129         pcieDeviceInfo.numDownstreamPorts);
130 
131     if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
132     {
133         lg2::error(
134             "Error processing PCIe Port Counts response: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}, EID={EID}",
135             "RC", rc, "CC", static_cast<uint8_t>(cc), "RESC", reasonCode, "EID",
136             eid);
137         return;
138     }
139 
140     lg2::info("PCIe Device with eid {EID} has {UP} upstream ports.", "EID", eid,
141               "UP", pcieDeviceInfo.numUpstreamPorts);
142 
143     makeSensors();
144 }
145 
getNetworkPortAddresses(const uint16_t portNumber)146 void PcieDevice::getNetworkPortAddresses(const uint16_t portNumber)
147 {
148     const int rc = gpu::encodeGetPortNetworkAddressesRequest(
149         0, portNumber, getPortNetworkAddressesRequest);
150 
151     if (rc != 0)
152     {
153         lg2::error(
154             "Error updating Network Port Addresses: encode failed, rc={RC}, EID={EID}",
155             "RC", rc, "EID", eid);
156         return;
157     }
158 
159     mctpRequester.sendRecvMsg(
160         eid, getPortNetworkAddressesRequest,
161         [portNumber, weak{weak_from_this()}](const std::error_code& ec,
162                                              std::span<const uint8_t> buffer) {
163             std::shared_ptr<PcieDevice> self = weak.lock();
164             if (!self)
165             {
166                 lg2::error("Invalid reference to PcieDevice, EID={EID}", "EID",
167                            self->eid);
168                 return;
169             }
170             self->processGetNetworkPortAddressesResponse(portNumber, ec,
171                                                          buffer);
172         });
173 }
174 
/**
 * @brief Handle the reply to a "get port network addresses" request.
 *
 * If the exchange and decode succeed and the port's link type is Ethernet,
 * an NvidiaEthPortMetrics object named "Port_<portNumber>" is created under
 * "<name>_NIC" and appended to ethPortMetrics. Any other link type is
 * ignored. Transport errors, decode failures and non-SUCCESS completion
 * codes are logged and end processing.
 *
 * @param portNumber  Port the request was issued for.
 * @param ec          Result of the MCTP exchange.
 * @param response    Raw response bytes to decode.
 */
void PcieDevice::processGetNetworkPortAddressesResponse(
    const uint16_t portNumber, const std::error_code& ec,
    std::span<const uint8_t> response)
{
    using ocp::accelerator_management::CompletionCode;

    if (ec)
    {
        lg2::error(
            "Error processing Network Port Addresses response: sending message over MCTP failed, rc={RC}, EID={EID}",
            "RC", ec.message(), "EID", eid);
        return;
    }

    CompletionCode completionCode{};
    uint16_t reason = 0;
    gpu::NetworkPortLinkType portLinkType = gpu::NetworkPortLinkType::UNKNOWN;
    // Decoded but not used further here; only the link type is consumed.
    std::vector<std::pair<uint8_t, uint64_t>> portAddresses;

    const int decodeRc = gpu::decodeGetPortNetworkAddressesResponse(
        response, completionCode, reason, portLinkType, portAddresses);

    const bool decoded =
        (decodeRc == 0) && (completionCode == CompletionCode::SUCCESS);
    if (!decoded)
    {
        lg2::error(
            "Error processing Network Port Addresses response: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}, EID={EID}",
            "RC", decodeRc, "CC", static_cast<uint8_t>(completionCode), "RESC",
            reason, "EID", eid);
        return;
    }

    if (portLinkType != gpu::NetworkPortLinkType::ETHERNET)
    {
        return;
    }

    lg2::info("Port {PN} of PCIe Device with eid {EID} is of type Ethernet.",
              "EID", eid, "PN", portNumber);

    const std::string adapterName = name + "_NIC";
    const std::string ethPortName = std::format("Port_{}", portNumber);

    ethPortMetrics.emplace_back(std::make_shared<NvidiaEthPortMetrics>(
        conn, mctpRequester, ethPortName, adapterName, path, eid, portNumber,
        objectServer));
}
219 
/**
 * @brief Create the PCIe interface and all per-port objects, then start
 *        polling.
 *
 * For every upstream port "UP_<i>" and each of its downstream ports
 * "DOWN_<n>" (n counts across all upstream ports), a port-info object and
 * three metrics objects (errors, counters, L0-to-recovery count) are created
 * in that order. The function ends by calling read(), which performs the
 * first update and arms the poll timer.
 */
void PcieDevice::makeSensors()
{
    const std::string pcieDeviceName = name + "_PCIe";

    pcieInterface = std::make_shared<NvidiaPcieInterface>(
        conn, mctpRequester, pcieDeviceName, path, eid, objectServer);

    // Registers the info object followed by the three metrics objects for
    // one port; shared by the upstream and downstream cases.
    auto registerPort = [this, &pcieDeviceName](
                            const std::string& portName,
                            gpu::PciePortType portType, uint64_t upstreamIndex,
                            uint64_t portIndex) {
        pciePorts.emplace_back(std::make_shared<NvidiaPciePortInfo>(
            conn, mctpRequester, portName, pcieDeviceName, path, eid, portType,
            upstreamIndex, portIndex, objectServer));

        pciePortMetrics.emplace_back(makeNvidiaPciePortErrors(
            conn, mctpRequester, portName, pcieDeviceName, path, eid, portType,
            upstreamIndex, portIndex, objectServer));

        pciePortMetrics.emplace_back(makeNvidiaPciePortCounters(
            conn, mctpRequester, portName, pcieDeviceName, path, eid, portType,
            upstreamIndex, portIndex, objectServer));

        pciePortMetrics.emplace_back(makeNvidiaPciePortL0ToRecoveryCount(
            conn, mctpRequester, portName, pcieDeviceName, path, eid, portType,
            upstreamIndex, portIndex, objectServer));
    };

    // Downstream ports are numbered continuously across upstream ports.
    uint64_t nextDownstreamIndex = 0;

    for (uint64_t upstream = 0; upstream < pcieDeviceInfo.numUpstreamPorts;
         ++upstream)
    {
        registerPort(std::format("UP_{}", upstream),
                     gpu::PciePortType::UPSTREAM, upstream, upstream);

        const uint64_t downstreamCount =
            pcieDeviceInfo.numDownstreamPorts[upstream];
        for (uint64_t n = 0; n < downstreamCount; ++n)
        {
            registerPort(std::format("DOWN_{}", nextDownstreamIndex),
                         gpu::PciePortType::DOWNSTREAM, upstream,
                         nextDownstreamIndex);
            ++nextDownstreamIndex;
        }
    }

    lg2::info("Added PCIe {NAME} Sensors with chassis path: {PATH}.", "NAME",
              name, "PATH", path);

    read();
}
283 
read()284 void PcieDevice::read()
285 {
286     pcieInterface->update();
287 
288     for (auto& port : pciePorts)
289     {
290         port->update();
291     }
292 
293     for (auto& portMetrics : pciePortMetrics)
294     {
295         portMetrics->update();
296     }
297 
298     for (auto& ethPortMetric : ethPortMetrics)
299     {
300         ethPortMetric->update();
301     }
302 
303     waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
304     waitTimer.async_wait([this](const boost::system::error_code& ec) {
305         if (ec)
306         {
307             return;
308         }
309         read();
310     });
311 }
312