xref: /openbmc/dbus-sensors/src/nvidia-gpu/NvidiaPcieInterface.cpp (revision e0b80e1e58bddcf218369f2f9e3ba2002b59b6f9)
1*e0b80e1eSHarshit Aghera /*
2*e0b80e1eSHarshit Aghera  * SPDX-FileCopyrightText: Copyright OpenBMC Authors
3*e0b80e1eSHarshit Aghera  * SPDX-License-Identifier: Apache-2.0
4*e0b80e1eSHarshit Aghera  */
5*e0b80e1eSHarshit Aghera 
6*e0b80e1eSHarshit Aghera #include "NvidiaPcieInterface.hpp"
7*e0b80e1eSHarshit Aghera 
8*e0b80e1eSHarshit Aghera #include "Utils.hpp"
9*e0b80e1eSHarshit Aghera 
10*e0b80e1eSHarshit Aghera #include <bits/basic_string.h>
11*e0b80e1eSHarshit Aghera 
12*e0b80e1eSHarshit Aghera #include <MctpRequester.hpp>
13*e0b80e1eSHarshit Aghera #include <NvidiaGpuMctpVdm.hpp>
14*e0b80e1eSHarshit Aghera #include <NvidiaPcieDevice.hpp>
15*e0b80e1eSHarshit Aghera #include <OcpMctpVdm.hpp>
16*e0b80e1eSHarshit Aghera #include <phosphor-logging/lg2.hpp>
17*e0b80e1eSHarshit Aghera #include <sdbusplus/asio/connection.hpp>
18*e0b80e1eSHarshit Aghera #include <sdbusplus/asio/object_server.hpp>
19*e0b80e1eSHarshit Aghera 
20*e0b80e1eSHarshit Aghera #include <cmath>
21*e0b80e1eSHarshit Aghera #include <cstddef>
22*e0b80e1eSHarshit Aghera #include <cstdint>
23*e0b80e1eSHarshit Aghera #include <functional>
24*e0b80e1eSHarshit Aghera #include <limits>
25*e0b80e1eSHarshit Aghera #include <memory>
26*e0b80e1eSHarshit Aghera #include <span>
27*e0b80e1eSHarshit Aghera #include <string>
28*e0b80e1eSHarshit Aghera #include <system_error>
29*e0b80e1eSHarshit Aghera #include <vector>
30*e0b80e1eSHarshit Aghera 
31*e0b80e1eSHarshit Aghera using std::string;
32*e0b80e1eSHarshit Aghera 
33*e0b80e1eSHarshit Aghera using namespace std::literals;
34*e0b80e1eSHarshit Aghera 
NvidiaPcieInterface(std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const std::string & name,const std::string & path,uint8_t eid,sdbusplus::asio::object_server & objectServer)35*e0b80e1eSHarshit Aghera NvidiaPcieInterface::NvidiaPcieInterface(
36*e0b80e1eSHarshit Aghera     std::shared_ptr<sdbusplus::asio::connection>& conn,
37*e0b80e1eSHarshit Aghera     mctp::MctpRequester& mctpRequester, const std::string& name,
38*e0b80e1eSHarshit Aghera     const std::string& path, uint8_t eid,
39*e0b80e1eSHarshit Aghera     sdbusplus::asio::object_server& objectServer) :
40*e0b80e1eSHarshit Aghera     eid(eid), path(path), conn(conn), mctpRequester(mctpRequester)
41*e0b80e1eSHarshit Aghera {
42*e0b80e1eSHarshit Aghera     const std::string dbusPath = pcieDevicePathPrefix + escapeName(name);
43*e0b80e1eSHarshit Aghera 
44*e0b80e1eSHarshit Aghera     pcieDeviceInterface = objectServer.add_interface(
45*e0b80e1eSHarshit Aghera         dbusPath, "xyz.openbmc_project.Inventory.Item.PCIeDevice");
46*e0b80e1eSHarshit Aghera 
47*e0b80e1eSHarshit Aghera     switchInterface = objectServer.add_interface(
48*e0b80e1eSHarshit Aghera         dbusPath, "xyz.openbmc_project.Inventory.Item.PCIeSwitch");
49*e0b80e1eSHarshit Aghera 
50*e0b80e1eSHarshit Aghera     pcieDeviceInterface->register_property(
51*e0b80e1eSHarshit Aghera         "GenerationInUse",
52*e0b80e1eSHarshit Aghera         std::string(
53*e0b80e1eSHarshit Aghera             "xyz.openbmc_project.Inventory.Item.PCIeSlot.Generations.Unknown"));
54*e0b80e1eSHarshit Aghera 
55*e0b80e1eSHarshit Aghera     pcieDeviceInterface->register_property("LanesInUse",
56*e0b80e1eSHarshit Aghera                                            std::numeric_limits<size_t>::max());
57*e0b80e1eSHarshit Aghera 
58*e0b80e1eSHarshit Aghera     pcieDeviceInterface->register_property(
59*e0b80e1eSHarshit Aghera         "GenerationSupported",
60*e0b80e1eSHarshit Aghera         std::string(
61*e0b80e1eSHarshit Aghera             "xyz.openbmc_project.Inventory.Item.PCIeSlot.Generations.Unknown"));
62*e0b80e1eSHarshit Aghera 
63*e0b80e1eSHarshit Aghera     pcieDeviceInterface->register_property("MaxLanes", static_cast<size_t>(0));
64*e0b80e1eSHarshit Aghera 
65*e0b80e1eSHarshit Aghera     if (!pcieDeviceInterface->initialize())
66*e0b80e1eSHarshit Aghera     {
67*e0b80e1eSHarshit Aghera         lg2::error("Error initializing PCIe Device Interface for EID={EID}",
68*e0b80e1eSHarshit Aghera                    "EID", eid);
69*e0b80e1eSHarshit Aghera     }
70*e0b80e1eSHarshit Aghera 
71*e0b80e1eSHarshit Aghera     if (!switchInterface->initialize())
72*e0b80e1eSHarshit Aghera     {
73*e0b80e1eSHarshit Aghera         lg2::error("Error initializing Switch Interface for EID={EID}", "EID",
74*e0b80e1eSHarshit Aghera                    eid);
75*e0b80e1eSHarshit Aghera     }
76*e0b80e1eSHarshit Aghera }
77*e0b80e1eSHarshit Aghera 
mapPcieGeneration(uint32_t value)78*e0b80e1eSHarshit Aghera string NvidiaPcieInterface::mapPcieGeneration(uint32_t value)
79*e0b80e1eSHarshit Aghera {
80*e0b80e1eSHarshit Aghera     switch (value)
81*e0b80e1eSHarshit Aghera     {
82*e0b80e1eSHarshit Aghera         case 1:
83*e0b80e1eSHarshit Aghera             return "xyz.openbmc_project.Inventory.Item.PCIeSlot.Generations.Gen1";
84*e0b80e1eSHarshit Aghera         case 2:
85*e0b80e1eSHarshit Aghera             return "xyz.openbmc_project.Inventory.Item.PCIeSlot.Generations.Gen2";
86*e0b80e1eSHarshit Aghera         case 3:
87*e0b80e1eSHarshit Aghera             return "xyz.openbmc_project.Inventory.Item.PCIeSlot.Generations.Gen3";
88*e0b80e1eSHarshit Aghera         case 4:
89*e0b80e1eSHarshit Aghera             return "xyz.openbmc_project.Inventory.Item.PCIeSlot.Generations.Gen4";
90*e0b80e1eSHarshit Aghera         case 5:
91*e0b80e1eSHarshit Aghera             return "xyz.openbmc_project.Inventory.Item.PCIeSlot.Generations.Gen5";
92*e0b80e1eSHarshit Aghera         case 6:
93*e0b80e1eSHarshit Aghera             return "xyz.openbmc_project.Inventory.Item.PCIeSlot.Generations.Gen6";
94*e0b80e1eSHarshit Aghera         default:
95*e0b80e1eSHarshit Aghera             return "xyz.openbmc_project.Inventory.Item.PCIeSlot.Generations.Unknown";
96*e0b80e1eSHarshit Aghera     }
97*e0b80e1eSHarshit Aghera }
98*e0b80e1eSHarshit Aghera 
decodeLinkWidth(uint32_t value)99*e0b80e1eSHarshit Aghera size_t NvidiaPcieInterface::decodeLinkWidth(uint32_t value)
100*e0b80e1eSHarshit Aghera {
101*e0b80e1eSHarshit Aghera     return (value > 0) ? pow(2, value - 1) : 0;
102*e0b80e1eSHarshit Aghera }
103*e0b80e1eSHarshit Aghera 
processResponse(const std::error_code & ec,std::span<const uint8_t> response)104*e0b80e1eSHarshit Aghera void NvidiaPcieInterface::processResponse(const std::error_code& ec,
105*e0b80e1eSHarshit Aghera                                           std::span<const uint8_t> response)
106*e0b80e1eSHarshit Aghera {
107*e0b80e1eSHarshit Aghera     if (ec)
108*e0b80e1eSHarshit Aghera     {
109*e0b80e1eSHarshit Aghera         lg2::error(
110*e0b80e1eSHarshit Aghera             "Error updating PCIe Interface: sending message over MCTP failed, "
111*e0b80e1eSHarshit Aghera             "rc={RC}, EID={EID}",
112*e0b80e1eSHarshit Aghera             "RC", ec.value(), "EID", eid);
113*e0b80e1eSHarshit Aghera         return;
114*e0b80e1eSHarshit Aghera     }
115*e0b80e1eSHarshit Aghera 
116*e0b80e1eSHarshit Aghera     ocp::accelerator_management::CompletionCode cc{};
117*e0b80e1eSHarshit Aghera     uint16_t reasonCode = 0;
118*e0b80e1eSHarshit Aghera     size_t numTelemetryValue = 0;
119*e0b80e1eSHarshit Aghera 
120*e0b80e1eSHarshit Aghera     auto rc = gpu::decodeQueryScalarGroupTelemetryV2Response(
121*e0b80e1eSHarshit Aghera         response, cc, reasonCode, numTelemetryValue, telemetryValues);
122*e0b80e1eSHarshit Aghera 
123*e0b80e1eSHarshit Aghera     if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
124*e0b80e1eSHarshit Aghera     {
125*e0b80e1eSHarshit Aghera         lg2::error("Error updating PCIe Interface: decode failed, "
126*e0b80e1eSHarshit Aghera                    "rc={RC}, cc={CC}, reasonCode={RESC}, EID={EID}",
127*e0b80e1eSHarshit Aghera                    "RC", rc, "CC", static_cast<uint8_t>(cc), "RESC", reasonCode,
128*e0b80e1eSHarshit Aghera                    "EID", eid);
129*e0b80e1eSHarshit Aghera         return;
130*e0b80e1eSHarshit Aghera     }
131*e0b80e1eSHarshit Aghera 
132*e0b80e1eSHarshit Aghera     if (!telemetryValues.empty())
133*e0b80e1eSHarshit Aghera     {
134*e0b80e1eSHarshit Aghera         pcieDeviceInterface->set_property(
135*e0b80e1eSHarshit Aghera             "GenerationInUse", mapPcieGeneration(telemetryValues[0]));
136*e0b80e1eSHarshit Aghera     }
137*e0b80e1eSHarshit Aghera 
138*e0b80e1eSHarshit Aghera     if (telemetryValues.size() > 1)
139*e0b80e1eSHarshit Aghera     {
140*e0b80e1eSHarshit Aghera         pcieDeviceInterface->set_property(
141*e0b80e1eSHarshit Aghera             "LanesInUse",
142*e0b80e1eSHarshit Aghera             decodeLinkWidth(static_cast<size_t>(telemetryValues[1])));
143*e0b80e1eSHarshit Aghera     }
144*e0b80e1eSHarshit Aghera 
145*e0b80e1eSHarshit Aghera     if (telemetryValues.size() > 3)
146*e0b80e1eSHarshit Aghera     {
147*e0b80e1eSHarshit Aghera         pcieDeviceInterface->set_property(
148*e0b80e1eSHarshit Aghera             "GenerationSupported", mapPcieGeneration(telemetryValues[3]));
149*e0b80e1eSHarshit Aghera     }
150*e0b80e1eSHarshit Aghera 
151*e0b80e1eSHarshit Aghera     if (telemetryValues.size() > 4)
152*e0b80e1eSHarshit Aghera     {
153*e0b80e1eSHarshit Aghera         pcieDeviceInterface->set_property(
154*e0b80e1eSHarshit Aghera             "MaxLanes",
155*e0b80e1eSHarshit Aghera             decodeLinkWidth(static_cast<size_t>(telemetryValues[4])));
156*e0b80e1eSHarshit Aghera     }
157*e0b80e1eSHarshit Aghera }
158*e0b80e1eSHarshit Aghera 
update()159*e0b80e1eSHarshit Aghera void NvidiaPcieInterface::update()
160*e0b80e1eSHarshit Aghera {
161*e0b80e1eSHarshit Aghera     auto rc =
162*e0b80e1eSHarshit Aghera         gpu::encodeQueryScalarGroupTelemetryV2Request(0, {}, 0, 0, 1, request);
163*e0b80e1eSHarshit Aghera 
164*e0b80e1eSHarshit Aghera     if (rc != 0)
165*e0b80e1eSHarshit Aghera     {
166*e0b80e1eSHarshit Aghera         lg2::error("Error updating PCIe Interface: failed, rc={RC}, EID={EID}",
167*e0b80e1eSHarshit Aghera                    "RC", rc, "EID", eid);
168*e0b80e1eSHarshit Aghera         return;
169*e0b80e1eSHarshit Aghera     }
170*e0b80e1eSHarshit Aghera 
171*e0b80e1eSHarshit Aghera     mctpRequester.sendRecvMsg(
172*e0b80e1eSHarshit Aghera         eid, request,
173*e0b80e1eSHarshit Aghera         [weak{weak_from_this()}](const std::error_code& ec,
174*e0b80e1eSHarshit Aghera                                  std::span<const uint8_t> buffer) {
175*e0b80e1eSHarshit Aghera             std::shared_ptr<NvidiaPcieInterface> self = weak.lock();
176*e0b80e1eSHarshit Aghera             if (!self)
177*e0b80e1eSHarshit Aghera             {
178*e0b80e1eSHarshit Aghera                 lg2::error(
179*e0b80e1eSHarshit Aghera                     "Invalid reference to NvidiaPcieInterface for EID {EID}",
180*e0b80e1eSHarshit Aghera                     "EID", self->eid);
181*e0b80e1eSHarshit Aghera                 return;
182*e0b80e1eSHarshit Aghera             }
183*e0b80e1eSHarshit Aghera             self->processResponse(ec, buffer);
184*e0b80e1eSHarshit Aghera         });
185*e0b80e1eSHarshit Aghera }
186