xref: /openbmc/dbus-sensors/src/nvidia-gpu/NvidiaPciePortMetrics.cpp (revision 1180ed47947904ceca7e4227582ad62209bbfe93)
1 /*
2  * SPDX-FileCopyrightText: Copyright OpenBMC Authors
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #include "NvidiaPciePortMetrics.hpp"
7 
8 #include "NvidiaUtils.hpp"
9 #include "Utils.hpp"
10 
11 #include <bits/basic_string.h>
12 
13 #include <MctpRequester.hpp>
14 #include <NvidiaGpuMctpVdm.hpp>
15 #include <NvidiaPcieDevice.hpp>
16 #include <OcpMctpVdm.hpp>
17 #include <phosphor-logging/lg2.hpp>
18 #include <sdbusplus/asio/connection.hpp>
19 #include <sdbusplus/asio/object_server.hpp>
20 #include <sdbusplus/message/native_types.hpp>
21 
22 #include <cstddef>
23 #include <cstdint>
24 #include <format>
25 #include <functional>
26 #include <memory>
27 #include <span>
28 #include <string>
29 #include <system_error>
30 #include <vector>
31 
32 using std::string;
33 
34 using namespace std::literals;
35 
// D-Bus interface name under which every metric value object is published.
constexpr const auto* metricInterface = "xyz.openbmc_project.Metric.Value";
37 
/**
 * @brief Publish the D-Bus objects for one scalar group of a PCIe port.
 *
 * For every entry of @p metricsInfo two interfaces are added on the object
 * path `metricPath + "port_<pcieDeviceName>_<name>" + <metric name>`:
 *  - `xyz.openbmc_project.Metric.Value` with `Unit` set to Count and `Value`
 *    initialised to 0.0;
 *  - the association interface, with a single
 *    ("measuring", "measured_by", <port object path>) entry, where the port
 *    object path is `pcieDevicePathPrefix / pcieDeviceName / name`.
 *
 * A failure to initialise either interface is logged and otherwise ignored.
 *
 * @param conn               D-Bus connection, stored in the object.
 * @param mctpRequester      MCTP requester, stored by reference.
 * @param name               Port name used to build the object paths.
 * @param pcieDeviceName     Name of the PCIe device the port belongs to.
 * @param path               Stored in the object; not used by this
 *                           constructor.
 * @param eid                MCTP endpoint id of the device.
 * @param portType           PCIe port type.
 * @param upstreamPortNumber Upstream port number.
 * @param portNumber         Port number.
 * @param objectServer       Object server the interfaces are added to.
 * @param scalarGroupId      Scalar group id this object is responsible for.
 * @param metricsInfo        (id, path suffix) pairs describing the metrics to
 *                           expose. NOTE(review): `id` is used directly as an
 *                           index into metricValueInterfaces /
 *                           metricAssociationInterfaces - confirm it is always
 *                           within the capacity declared in the header.
 */
NvidiaPciePortMetrics::NvidiaPciePortMetrics(
    std::shared_ptr<sdbusplus::asio::connection>& conn,
    mctp::MctpRequester& mctpRequester, const std::string& name,
    const std::string& pcieDeviceName, const std::string& path, uint8_t eid,
    gpu::PciePortType portType, uint8_t upstreamPortNumber, uint8_t portNumber,
    sdbusplus::asio::object_server& objectServer, uint8_t scalarGroupId,
    const std::vector<NvidiaMetricInfo>& metricsInfo) :
    eid(eid), portType(portType), upstreamPortNumber(upstreamPortNumber),
    portNumber(portNumber), scalarGroupId(scalarGroupId), path(path),
    conn(conn), mctpRequester(mctpRequester)
{
    const std::string metricsDbusPathPrefix =
        metricPath + std::format("port_{}_{}", pcieDeviceName, name);

    const sdbusplus::message::object_path portDbusPath =
        sdbusplus::message::object_path(pcieDevicePathPrefix) / pcieDeviceName /
        name;

    // Every metric of this port points back at the same port object, so the
    // association list is built once and reused for each metric.
    std::vector<Association> associations;
    associations.emplace_back("measuring", "measured_by", portDbusPath);

    // The binding is called metricName (not name) so that it does not shadow
    // the constructor's `name` parameter.
    for (const auto& [id, metricName] : metricsInfo)
    {
        const std::string metricsDbusPath = metricsDbusPathPrefix + metricName;

        auto& valueInterface = metricValueInterfaces[id];
        valueInterface =
            objectServer.add_interface(metricsDbusPath, metricInterface);
        valueInterface->register_property(
            "Unit", "xyz.openbmc_project.Metric.Value.Unit.Count"s);
        valueInterface->register_property("Value", 0.0);

        auto& associationInterface = metricAssociationInterfaces[id];
        associationInterface =
            objectServer.add_interface(metricsDbusPath, association::interface);
        associationInterface->register_property("Associations", associations);

        if (!valueInterface->initialize())
        {
            lg2::error(
                "Error initializing PCIe Port Metric Interface for EID={EID}, "
                "PortType={PT}, PortNumber={PN}, ScalarGroup={SG}, Metric={MN}",
                "EID", eid, "PT", static_cast<uint8_t>(portType), "PN",
                portNumber, "SG", scalarGroupId, "MN", metricName);
        }

        if (!associationInterface->initialize())
        {
            lg2::error(
                "Error initializing PCIe Port Metric Association Interface for EID={EID}, "
                "PortType={PT}, PortNumber={PN}, ScalarGroup={SG}, Metric={MN}",
                "EID", eid, "PT", static_cast<uint8_t>(portType), "PN",
                portNumber, "SG", scalarGroupId, "MN", metricName);
        }
    }
}
95 
processResponse(const std::error_code & sendRecvMsgResult,std::span<const uint8_t> response)96 void NvidiaPciePortMetrics::processResponse(
97     const std::error_code& sendRecvMsgResult, std::span<const uint8_t> response)
98 {
99     if (sendRecvMsgResult)
100     {
101         lg2::error(
102             "Error updating PCIe Port Metrics: sending message over MCTP failed, "
103             "rc={RC}, EID={EID}, PortType={PT}, PortNumber={PN}, ScalarGroup={SG}",
104             "RC", sendRecvMsgResult.message(), "EID", eid, "PT",
105             static_cast<uint8_t>(portType), "PN", portNumber, "SG",
106             scalarGroupId);
107         return;
108     }
109 
110     ocp::accelerator_management::CompletionCode cc{};
111     uint16_t reasonCode = 0;
112     size_t numTelemetryValue = 0;
113 
114     int rc = gpu::decodeQueryScalarGroupTelemetryV2Response(
115         response, cc, reasonCode, numTelemetryValue, telemetryValues);
116 
117     if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
118     {
119         lg2::error(
120             "Error updating PCIe Port Errors: decode failed, "
121             "rc={RC}, cc={CC}, reasonCode={RESC}, EID={EID}, PortType={PT}, PortNumber={PN}, ScalarGroup={SG}",
122             "RC", rc, "CC", static_cast<uint8_t>(cc), "RESC", reasonCode, "EID",
123             eid, "PT", static_cast<uint8_t>(portType), "PN", portNumber, "SG",
124             scalarGroupId);
125         return;
126     }
127 
128     for (size_t i = 0; i < numTelemetryValue; ++i)
129     {
130         if (metricValueInterfaces[i] != nullptr)
131         {
132             metricValueInterfaces[i]->set_property(
133                 "Value", static_cast<double>(telemetryValues[i]));
134         }
135     }
136 }
137 
update()138 void NvidiaPciePortMetrics::update()
139 {
140     auto rc = gpu::encodeQueryScalarGroupTelemetryV2Request(
141         0, portType, upstreamPortNumber, portNumber, scalarGroupId, request);
142 
143     if (rc != 0)
144     {
145         lg2::error(
146             "Error updating PCIe Port Errors: encode failed, rc={RC}, EID={EID}, PortType={PT}, PortNumber={PN}, ScalarGroup={SG}",
147             "RC", rc, "EID", eid, "PT", static_cast<uint8_t>(portType), "PN",
148             portNumber, "SG", scalarGroupId);
149         return;
150     }
151 
152     mctpRequester.sendRecvMsg(
153         eid, request,
154         [weak{weak_from_this()}](const std::error_code& ec,
155                                  std::span<const uint8_t> buffer) {
156             std::shared_ptr<NvidiaPciePortMetrics> self = weak.lock();
157             if (!self)
158             {
159                 lg2::error("Invalid reference to NvidiaPciePortErrors");
160                 return;
161             }
162             self->processResponse(ec, buffer);
163         });
164 }
165 
makeNvidiaPciePortErrors(std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const std::string & name,const std::string & pcieDeviceName,const std::string & path,uint8_t eid,gpu::PciePortType portType,uint8_t upstreamPortNumber,uint8_t portNumber,sdbusplus::asio::object_server & objectServer)166 std::shared_ptr<NvidiaPciePortMetrics> makeNvidiaPciePortErrors(
167     std::shared_ptr<sdbusplus::asio::connection>& conn,
168     mctp::MctpRequester& mctpRequester, const std::string& name,
169     const std::string& pcieDeviceName, const std::string& path, uint8_t eid,
170     gpu::PciePortType portType, uint8_t upstreamPortNumber, uint8_t portNumber,
171     sdbusplus::asio::object_server& objectServer)
172 {
173     static constexpr uint8_t nvidiaPciePortErrorScalarGroupId = 2;
174 
175     return std::make_shared<NvidiaPciePortMetrics>(
176         conn, mctpRequester, name, pcieDeviceName, path, eid, portType,
177         upstreamPortNumber, portNumber, objectServer,
178         nvidiaPciePortErrorScalarGroupId,
179         std::vector<NvidiaMetricInfo>{
180             {0, "/pcie/non_fatal_error_count"},
181             {1, "/pcie/fatal_error_count"},
182             {2, "/pcie/unsupported_request_count"},
183             {3, "/pcie/correctable_error_count"},
184         });
185 }
186 
makeNvidiaPciePortCounters(std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const std::string & name,const std::string & pcieDeviceName,const std::string & path,uint8_t eid,gpu::PciePortType portType,uint8_t upstreamPortNumber,uint8_t portNumber,sdbusplus::asio::object_server & objectServer)187 std::shared_ptr<NvidiaPciePortMetrics> makeNvidiaPciePortCounters(
188     std::shared_ptr<sdbusplus::asio::connection>& conn,
189     mctp::MctpRequester& mctpRequester, const std::string& name,
190     const std::string& pcieDeviceName, const std::string& path, uint8_t eid,
191     gpu::PciePortType portType, uint8_t upstreamPortNumber, uint8_t portNumber,
192     sdbusplus::asio::object_server& objectServer)
193 {
194     static constexpr uint8_t nvidiaPciePortCounterScalarGroupId = 4;
195 
196     return std::make_shared<NvidiaPciePortMetrics>(
197         conn, mctpRequester, name, pcieDeviceName, path, eid, portType,
198         upstreamPortNumber, portNumber, objectServer,
199         nvidiaPciePortCounterScalarGroupId,
200         std::vector<NvidiaMetricInfo>{
201             {1, "/pcie/nak_received_count"},
202             {2, "/pcie/nak_sent_count"},
203             {4, "/pcie/replay_rollover_count"},
204             {6, "/pcie/replay_count"},
205         });
206 }
207 
makeNvidiaPciePortL0ToRecoveryCount(std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const std::string & name,const std::string & pcieDeviceName,const std::string & path,uint8_t eid,gpu::PciePortType portType,uint8_t upstreamPortNumber,uint8_t portNumber,sdbusplus::asio::object_server & objectServer)208 std::shared_ptr<NvidiaPciePortMetrics> makeNvidiaPciePortL0ToRecoveryCount(
209     std::shared_ptr<sdbusplus::asio::connection>& conn,
210     mctp::MctpRequester& mctpRequester, const std::string& name,
211     const std::string& pcieDeviceName, const std::string& path, uint8_t eid,
212     gpu::PciePortType portType, uint8_t upstreamPortNumber, uint8_t portNumber,
213     sdbusplus::asio::object_server& objectServer)
214 {
215     static constexpr uint8_t nvidiaPciePortL0ToRecoveryCountScalarGroupId = 3;
216 
217     return std::make_shared<NvidiaPciePortMetrics>(
218         conn, mctpRequester, name, pcieDeviceName, path, eid, portType,
219         upstreamPortNumber, portNumber, objectServer,
220         nvidiaPciePortL0ToRecoveryCountScalarGroupId,
221         std::vector<NvidiaMetricInfo>{
222             {0, "/pcie/l0_to_recovery_count"},
223         });
224 }
225