1 /*
2 * SPDX-FileCopyrightText: Copyright OpenBMC Authors
3 * SPDX-License-Identifier: Apache-2.0
4 */
5
6 #include "NvidiaPciePortMetrics.hpp"
7
8 #include "NvidiaUtils.hpp"
9 #include "Utils.hpp"
10
11 #include <bits/basic_string.h>
12
13 #include <MctpRequester.hpp>
14 #include <NvidiaGpuMctpVdm.hpp>
15 #include <NvidiaPcieDevice.hpp>
16 #include <OcpMctpVdm.hpp>
17 #include <phosphor-logging/lg2.hpp>
18 #include <sdbusplus/asio/connection.hpp>
19 #include <sdbusplus/asio/object_server.hpp>
20 #include <sdbusplus/message/native_types.hpp>
21
22 #include <cstddef>
23 #include <cstdint>
24 #include <format>
25 #include <functional>
26 #include <memory>
27 #include <span>
28 #include <string>
29 #include <system_error>
30 #include <vector>
31
32 using std::string;
33
34 using namespace std::literals;
35
// D-Bus interface name added for every PCIe port metric object that the
// NvidiaPciePortMetrics constructor creates.
constexpr const char* metricInterface = "xyz.openbmc_project.Metric.Value";
37
/**
 * @brief Create the D-Bus objects for one scalar telemetry group of a PCIe
 *        port.
 *
 * For every entry of @p metricsInfo two interfaces are added on
 * @p objectServer at `<metricPath>port_<pcieDeviceName>_<name><metric name>`:
 *  - the metric value interface, with a `Unit` property (count) and a `Value`
 *    property initialised to 0.0;
 *  - an association interface with a single `measuring`/`measured_by`
 *    association to the port object
 *    `<pcieDevicePathPrefix>/<pcieDeviceName>/<name>`.
 *
 * A failure to initialize either interface is logged and does not abort
 * construction.
 *
 * @param conn                D-Bus connection, stored in the object.
 * @param mctpRequester       Requester used by update() to send the query.
 * @param name                Port name, used to build the D-Bus paths.
 * @param pcieDeviceName      Owning PCIe device name, used to build the D-Bus
 *                            paths.
 * @param path                Stored in the object; not used by the
 *                            constructor itself.
 * @param eid                 Endpoint ID handed to the MCTP requester by
 *                            update().
 * @param portType            Port type encoded into the telemetry request.
 * @param upstreamPortNumber  Upstream port number encoded into the request.
 * @param portNumber          Port number encoded into the request.
 * @param objectServer        Object server the interfaces are added to.
 * @param scalarGroupId       Scalar group encoded into the telemetry request.
 * @param metricsInfo         (id, name) pairs: `id` selects the slot in
 *                            metricValueInterfaces/metricAssociationInterfaces
 *                            and `name` is appended to the metric path prefix.
 */
NvidiaPciePortMetrics::NvidiaPciePortMetrics(
    std::shared_ptr<sdbusplus::asio::connection>& conn,
    mctp::MctpRequester& mctpRequester, const std::string& name,
    const std::string& pcieDeviceName, const std::string& path, uint8_t eid,
    gpu::PciePortType portType, uint8_t upstreamPortNumber, uint8_t portNumber,
    sdbusplus::asio::object_server& objectServer, uint8_t scalarGroupId,
    const std::vector<NvidiaMetricInfo>& metricsInfo) :
    eid(eid), portType(portType), upstreamPortNumber(upstreamPortNumber),
    portNumber(portNumber), scalarGroupId(scalarGroupId), path(path),
    conn(conn), mctpRequester(mctpRequester)
{
    const std::string metricsDbusPathPrefix =
        metricPath + std::format("port_{}_{}", pcieDeviceName, name);

    const sdbusplus::message::object_path portDbusPath =
        sdbusplus::message::object_path(pcieDevicePathPrefix) /
        pcieDeviceName / name;

    // Every metric of this port points back at the same port object, so the
    // association list is built once and reused for all metrics.
    std::vector<Association> associations;
    associations.emplace_back("measuring", "measured_by", portDbusPath);

    // The binding is called metricName (not name) so that it does not shadow
    // the constructor's port-name parameter.
    for (const auto& [id, metricName] : metricsInfo)
    {
        const std::string metricsDbusPath = metricsDbusPathPrefix + metricName;

        const auto valueInterface =
            objectServer.add_interface(metricsDbusPath, metricInterface);
        metricValueInterfaces[id] = valueInterface;
        valueInterface->register_property(
            "Unit", "xyz.openbmc_project.Metric.Value.Unit.Count"s);
        valueInterface->register_property("Value", 0.0);

        const auto associationInterface =
            objectServer.add_interface(metricsDbusPath, association::interface);
        metricAssociationInterfaces[id] = associationInterface;
        associationInterface->register_property("Associations", associations);

        if (!valueInterface->initialize())
        {
            lg2::error(
                "Error initializing PCIe Port Metric Interface for EID={EID}, "
                "PortType={PT}, PortNumber={PN}, ScalarGroup={SG}, Metric={MN}",
                "EID", eid, "PT", static_cast<uint8_t>(portType), "PN",
                portNumber, "SG", scalarGroupId, "MN", metricName);
        }

        if (!associationInterface->initialize())
        {
            lg2::error(
                "Error initializing PCIe Port Metric Association Interface for EID={EID}, "
                "PortType={PT}, PortNumber={PN}, ScalarGroup={SG}, Metric={MN}",
                "EID", eid, "PT", static_cast<uint8_t>(portType), "PN",
                portNumber, "SG", scalarGroupId, "MN", metricName);
        }
    }
}
95
processResponse(const std::error_code & sendRecvMsgResult,std::span<const uint8_t> response)96 void NvidiaPciePortMetrics::processResponse(
97 const std::error_code& sendRecvMsgResult, std::span<const uint8_t> response)
98 {
99 if (sendRecvMsgResult)
100 {
101 lg2::error(
102 "Error updating PCIe Port Metrics: sending message over MCTP failed, "
103 "rc={RC}, EID={EID}, PortType={PT}, PortNumber={PN}, ScalarGroup={SG}",
104 "RC", sendRecvMsgResult.message(), "EID", eid, "PT",
105 static_cast<uint8_t>(portType), "PN", portNumber, "SG",
106 scalarGroupId);
107 return;
108 }
109
110 ocp::accelerator_management::CompletionCode cc{};
111 uint16_t reasonCode = 0;
112 size_t numTelemetryValue = 0;
113
114 int rc = gpu::decodeQueryScalarGroupTelemetryV2Response(
115 response, cc, reasonCode, numTelemetryValue, telemetryValues);
116
117 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
118 {
119 lg2::error(
120 "Error updating PCIe Port Errors: decode failed, "
121 "rc={RC}, cc={CC}, reasonCode={RESC}, EID={EID}, PortType={PT}, PortNumber={PN}, ScalarGroup={SG}",
122 "RC", rc, "CC", static_cast<uint8_t>(cc), "RESC", reasonCode, "EID",
123 eid, "PT", static_cast<uint8_t>(portType), "PN", portNumber, "SG",
124 scalarGroupId);
125 return;
126 }
127
128 for (size_t i = 0; i < numTelemetryValue; ++i)
129 {
130 if (metricValueInterfaces[i] != nullptr)
131 {
132 metricValueInterfaces[i]->set_property(
133 "Value", static_cast<double>(telemetryValues[i]));
134 }
135 }
136 }
137
update()138 void NvidiaPciePortMetrics::update()
139 {
140 auto rc = gpu::encodeQueryScalarGroupTelemetryV2Request(
141 0, portType, upstreamPortNumber, portNumber, scalarGroupId, request);
142
143 if (rc != 0)
144 {
145 lg2::error(
146 "Error updating PCIe Port Errors: encode failed, rc={RC}, EID={EID}, PortType={PT}, PortNumber={PN}, ScalarGroup={SG}",
147 "RC", rc, "EID", eid, "PT", static_cast<uint8_t>(portType), "PN",
148 portNumber, "SG", scalarGroupId);
149 return;
150 }
151
152 mctpRequester.sendRecvMsg(
153 eid, request,
154 [weak{weak_from_this()}](const std::error_code& ec,
155 std::span<const uint8_t> buffer) {
156 std::shared_ptr<NvidiaPciePortMetrics> self = weak.lock();
157 if (!self)
158 {
159 lg2::error("Invalid reference to NvidiaPciePortErrors");
160 return;
161 }
162 self->processResponse(ec, buffer);
163 });
164 }
165
makeNvidiaPciePortErrors(std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const std::string & name,const std::string & pcieDeviceName,const std::string & path,uint8_t eid,gpu::PciePortType portType,uint8_t upstreamPortNumber,uint8_t portNumber,sdbusplus::asio::object_server & objectServer)166 std::shared_ptr<NvidiaPciePortMetrics> makeNvidiaPciePortErrors(
167 std::shared_ptr<sdbusplus::asio::connection>& conn,
168 mctp::MctpRequester& mctpRequester, const std::string& name,
169 const std::string& pcieDeviceName, const std::string& path, uint8_t eid,
170 gpu::PciePortType portType, uint8_t upstreamPortNumber, uint8_t portNumber,
171 sdbusplus::asio::object_server& objectServer)
172 {
173 static constexpr uint8_t nvidiaPciePortErrorScalarGroupId = 2;
174
175 return std::make_shared<NvidiaPciePortMetrics>(
176 conn, mctpRequester, name, pcieDeviceName, path, eid, portType,
177 upstreamPortNumber, portNumber, objectServer,
178 nvidiaPciePortErrorScalarGroupId,
179 std::vector<NvidiaMetricInfo>{
180 {0, "/pcie/non_fatal_error_count"},
181 {1, "/pcie/fatal_error_count"},
182 {2, "/pcie/unsupported_request_count"},
183 {3, "/pcie/correctable_error_count"},
184 });
185 }
186
makeNvidiaPciePortCounters(std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const std::string & name,const std::string & pcieDeviceName,const std::string & path,uint8_t eid,gpu::PciePortType portType,uint8_t upstreamPortNumber,uint8_t portNumber,sdbusplus::asio::object_server & objectServer)187 std::shared_ptr<NvidiaPciePortMetrics> makeNvidiaPciePortCounters(
188 std::shared_ptr<sdbusplus::asio::connection>& conn,
189 mctp::MctpRequester& mctpRequester, const std::string& name,
190 const std::string& pcieDeviceName, const std::string& path, uint8_t eid,
191 gpu::PciePortType portType, uint8_t upstreamPortNumber, uint8_t portNumber,
192 sdbusplus::asio::object_server& objectServer)
193 {
194 static constexpr uint8_t nvidiaPciePortCounterScalarGroupId = 4;
195
196 return std::make_shared<NvidiaPciePortMetrics>(
197 conn, mctpRequester, name, pcieDeviceName, path, eid, portType,
198 upstreamPortNumber, portNumber, objectServer,
199 nvidiaPciePortCounterScalarGroupId,
200 std::vector<NvidiaMetricInfo>{
201 {1, "/pcie/nak_received_count"},
202 {2, "/pcie/nak_sent_count"},
203 {4, "/pcie/replay_rollover_count"},
204 {6, "/pcie/replay_count"},
205 });
206 }
207
makeNvidiaPciePortL0ToRecoveryCount(std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const std::string & name,const std::string & pcieDeviceName,const std::string & path,uint8_t eid,gpu::PciePortType portType,uint8_t upstreamPortNumber,uint8_t portNumber,sdbusplus::asio::object_server & objectServer)208 std::shared_ptr<NvidiaPciePortMetrics> makeNvidiaPciePortL0ToRecoveryCount(
209 std::shared_ptr<sdbusplus::asio::connection>& conn,
210 mctp::MctpRequester& mctpRequester, const std::string& name,
211 const std::string& pcieDeviceName, const std::string& path, uint8_t eid,
212 gpu::PciePortType portType, uint8_t upstreamPortNumber, uint8_t portNumber,
213 sdbusplus::asio::object_server& objectServer)
214 {
215 static constexpr uint8_t nvidiaPciePortL0ToRecoveryCountScalarGroupId = 3;
216
217 return std::make_shared<NvidiaPciePortMetrics>(
218 conn, mctpRequester, name, pcieDeviceName, path, eid, portType,
219 upstreamPortNumber, portNumber, objectServer,
220 nvidiaPciePortL0ToRecoveryCountScalarGroupId,
221 std::vector<NvidiaMetricInfo>{
222 {0, "/pcie/l0_to_recovery_count"},
223 });
224 }
225