xref: /openbmc/dbus-sensors/src/nvidia-gpu/NvidiaGpuMctpVdm.hpp (revision 7427aeef4225bf23715539b195a23bce10865265)
1 /*
2  * SPDX-FileCopyrightText: Copyright OpenBMC Authors
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
8 #include <OcpMctpVdm.hpp>
9 
10 #include <array>
11 #include <cstddef>
12 #include <cstdint>
13 #include <span>
14 #include <string>
15 #include <utility>
16 #include <variant>
17 #include <vector>
18 
19 namespace gpu
20 {
21 
/** Decoded inventory property value: either text or a raw byte sequence. */
using InventoryValue = std::variant<std::string, std::vector<uint8_t>>;

/**
 * Number of bytes in the data array of GetInventoryInformationResponse.
 *
 * Declared `inline` so that every translation unit including this header
 * shares a single definition instead of getting its own internal-linkage
 * copy.
 */
inline constexpr size_t maxInventoryDataSize = 256;

/** PCI vendor ID constant used for NVIDIA devices (0x10de). */
inline constexpr uint16_t nvidiaPciVendorId = 0x10de;
26 
/**
 * Message type codes. The four values correspond to the command families
 * whose command enums are declared in this header: device capability
 * discovery, network port, PCIe link and platform environmental.
 */
enum class MessageType : uint8_t
{
    DEVICE_CAPABILITY_DISCOVERY = 0x00,
    NETWORK_PORT = 0x01,
    PCIE_LINK = 0x02,
    PLATFORM_ENVIRONMENTAL = 0x03,
};
34 
/** Command codes of the device capability discovery family. */
enum class DeviceCapabilityDiscoveryCommands : uint8_t
{
    // Only command of this family declared here.
    QUERY_DEVICE_IDENTIFICATION = 0x09,
};
39 
/**
 * Command codes of the platform environmental family.
 *
 * The numbering is sparse (0x01, 0x05, 0x07-0x0B and 0x0D are not declared
 * here), so values must stay explicit.
 */
enum class PlatformEnvironmentalCommands : uint8_t
{
    // Thermal
    GET_TEMPERATURE_READING = 0x00,
    READ_THERMAL_PARAMETERS = 0x02,

    // Power and energy
    GET_CURRENT_POWER_DRAW = 0x03,
    GET_MAX_OBSERVED_POWER = 0x04,
    GET_CURRENT_ENERGY_COUNTER = 0x06,

    // Inventory, driver and voltage
    GET_INVENTORY_INFORMATION = 0x0C,
    GET_DRIVER_INFORMATION = 0x0E,
    GET_VOLTAGE = 0x0F,
};
51 
/** Command codes of the network port family. */
enum class NetworkPortCommands : uint8_t
{
    GetEthernetPortTelemetryCounters = 0x0F,
    GetPortNetworkAddresses = 0x11,
};
57 
/** Command codes of the PCIe link family. */
enum class PcieLinkCommands : uint8_t
{
    ListPCIePorts = 0x07,
    QueryScalarGroupTelemetryV2 = 0x24,
};
63 
/**
 * Device kind codes.
 *
 * NOTE(review): presumably the values carried in
 * QueryDeviceIdentificationResponse::device_identification - confirm
 * against the decoder. The numbering is sparse, so values stay explicit.
 */
enum class DeviceIdentification : uint8_t
{
    DEVICE_GPU = 0x00,
    DEVICE_PCIE = 0x02,
    DEVICE_SMA = 0x05,
};
70 
/**
 * Identifiers of the inventory properties (contiguous, 0 through 36).
 *
 * Values are spelled out explicitly rather than left to implicit
 * increment so that inserting or reordering an entry cannot silently
 * renumber the ones after it.
 */
enum class InventoryPropertyId : uint8_t
{
    // Identity and part information
    BOARD_PART_NUMBER = 0,
    SERIAL_NUMBER = 1,
    MARKETING_NAME = 2,
    DEVICE_PART_NUMBER = 3,
    FRU_PART_NUMBER = 4,
    MEMORY_VENDOR = 5,
    MEMORY_PART_NUMBER = 6,
    MAX_MEMORY_CAPACITY = 7,
    BUILD_DATE = 8,
    FIRMWARE_VERSION = 9,
    DEVICE_GUID = 10,
    INFOROM_VERSION = 11,

    // Physical dimensions
    PRODUCT_LENGTH = 12,
    PRODUCT_WIDTH = 13,
    PRODUCT_HEIGHT = 14,

    // Power limits
    RATED_DEVICE_POWER_LIMIT = 15,
    MIN_DEVICE_POWER_LIMIT = 16,
    MAX_DEVICE_POWER_LIMIT = 17,
    MAX_MODULE_POWER_LIMIT = 18,
    MIN_MODULE_POWER_LIMIT = 19,
    RATED_MODULE_POWER_LIMIT = 20,

    // Clocks and EDPp scaling
    DEFAULT_BOOST_CLOCKS = 21,
    DEFAULT_BASE_CLOCKS = 22,
    DEFAULT_EDPP_SCALING = 23,
    MIN_EDPP_SCALING = 24,
    MAX_EDPP_SCALING = 25,
    MIN_GRAPHICS_CLOCK = 26,
    MAX_GRAPHICS_CLOCK = 27,
    MIN_MEMORY_CLOCK = 28,
    MAX_MEMORY_CLOCK = 29,

    // Fabric / rack placement
    INFINIBAND_GUID = 30,
    RACK_GUID = 31,
    RACK_SLOT_NUMBER = 32,
    COMPUTE_SLOT_INDEX = 33,
    NODE_INDEX = 34,
    GPU_NODE_ID = 35,
    NVLINK_PEER_TYPE = 36,
};
111 
/**
 * Direction of a PCIe port. Taken as a parameter by
 * encodeQueryScalarGroupTelemetryV2Request, declared in this header.
 */
enum class PciePortType : uint8_t
{
    UPSTREAM = 0x00,
    DOWNSTREAM = 0x01,
};
117 
/**
 * Driver load state. Used as the type of
 * GetDriverInformationResponse::driverState in this header.
 */
enum class DriverState : uint8_t
{
    DRIVER_STATE_UNKNOWN = 0x00,
    DRIVER_STATE_NOT_LOADED = 0x01,
    DRIVER_STATE_LOADED = 0x02,
};
124 
/**
 * Link type of a network port. Produced as an output of
 * decodeGetPortNetworkAddressesResponse, declared in this header.
 */
enum class NetworkPortLinkType : uint8_t
{
    ETHERNET = 0x00,
    INFINIBAND = 0x01,
    UNKNOWN = 0xFF,
};
131 
132 struct QueryDeviceIdentificationRequest
133 {
134     ocp::accelerator_management::CommonRequest hdr;
135 } __attribute__((packed));
136 
137 struct QueryDeviceIdentificationResponse
138 {
139     ocp::accelerator_management::CommonResponse hdr;
140     uint8_t device_identification;
141     uint8_t instance_id;
142 } __attribute__((packed));
143 
144 struct GetNumericSensorReadingRequest
145 {
146     ocp::accelerator_management::CommonRequest hdr;
147     uint8_t sensor_id;
148 } __attribute__((packed));
149 
150 using GetTemperatureReadingRequest = GetNumericSensorReadingRequest;
151 
152 using ReadThermalParametersRequest = GetNumericSensorReadingRequest;
153 
154 struct GetPowerDrawRequest
155 {
156     ocp::accelerator_management::CommonRequest hdr;
157     uint8_t sensorId;
158     uint8_t averagingInterval;
159 } __attribute__((packed));
160 
161 using GetCurrentEnergyCounterRequest = GetNumericSensorReadingRequest;
162 
163 using GetVoltageRequest = GetNumericSensorReadingRequest;
164 
165 struct QueryScalarGroupTelemetryV2Request
166 {
167     ocp::accelerator_management::CommonRequest hdr;
168     uint8_t upstreamPortNumber;
169     uint8_t portNumber;
170     uint8_t groupId;
171 } __attribute__((packed));
172 
173 struct GetPortNetworkAddressesRequest
174 {
175     ocp::accelerator_management::CommonRequest hdr;
176     uint16_t portNumber;
177 } __attribute__((packed));
178 
179 struct GetEthernetPortTelemetryCountersRequest
180 {
181     ocp::accelerator_management::CommonRequest hdr;
182     uint16_t portNumber;
183 } __attribute__((packed));
184 
185 struct GetTemperatureReadingResponse
186 {
187     ocp::accelerator_management::CommonResponse hdr;
188     int32_t reading;
189 } __attribute__((packed));
190 
191 struct ReadThermalParametersResponse
192 {
193     ocp::accelerator_management::CommonResponse hdr;
194     int32_t threshold;
195 } __attribute__((packed));
196 
197 struct GetPowerDrawResponse
198 {
199     ocp::accelerator_management::CommonResponse hdr;
200     uint32_t power;
201 } __attribute__((packed));
202 
203 struct GetCurrentEnergyCounterResponse
204 {
205     ocp::accelerator_management::CommonResponse hdr;
206     uint64_t energy;
207 } __attribute__((packed));
208 
209 struct GetVoltageResponse
210 {
211     ocp::accelerator_management::CommonResponse hdr;
212     uint32_t voltage;
213 } __attribute__((packed));
214 
215 struct ListPCIePortsResponse
216 {
217     ocp::accelerator_management::CommonResponse hdr;
218     uint16_t numUpstreamPorts;
219 } __attribute__((packed));
220 
/**
 * Two-byte packed record holding an "is internal" flag byte and a count
 * byte.
 *
 * NOTE(review): presumably one record per upstream port in the List PCIe
 * Ports response, with count being that port's number of downstream
 * ports - confirm against the decoder.
 */
struct __attribute__((packed)) ListPCIePortsDownstreamPortsData
{
    uint8_t isInternal;
    uint8_t count;
};
226 
227 struct GetDriverInformationResponse
228 {
229     ocp::accelerator_management::CommonResponse hdr;
230     DriverState driverState;
231     char driverVersion;
232 } __attribute__((packed));
233 
234 struct GetInventoryInformationRequest
235 {
236     ocp::accelerator_management::CommonRequest hdr;
237     uint8_t property_id;
238 } __attribute__((packed));
239 
240 struct GetInventoryInformationResponse
241 {
242     ocp::accelerator_management::CommonResponse hdr;
243     std::array<uint8_t, maxInventoryDataSize> data;
244 } __attribute__((packed));
245 
246 int packHeader(const ocp::accelerator_management::BindingPciVidInfo& hdr,
247                ocp::accelerator_management::BindingPciVid& msg);
248 
249 int encodeQueryDeviceIdentificationRequest(uint8_t instanceId,
250                                            std::span<uint8_t> buf);
251 
252 int decodeQueryDeviceIdentificationResponse(
253     std::span<const uint8_t> buf,
254     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
255     uint8_t& deviceIdentification, uint8_t& deviceInstance);
256 
257 int encodeGetTemperatureReadingRequest(uint8_t instanceId, uint8_t sensorId,
258                                        std::span<uint8_t> buf);
259 
260 int decodeGetTemperatureReadingResponse(
261     std::span<const uint8_t> buf,
262     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
263     double& temperatureReading);
264 
265 int encodeReadThermalParametersRequest(uint8_t instanceId, uint8_t sensorId,
266                                        std::span<uint8_t> buf);
267 
268 int decodeReadThermalParametersResponse(
269     std::span<const uint8_t> buf,
270     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
271     int32_t& threshold);
272 
273 int encodeGetPowerDrawRequest(
274     PlatformEnvironmentalCommands commandCode, uint8_t instanceId,
275     uint8_t sensorId, uint8_t averagingInterval, std::span<uint8_t> buf);
276 
277 int decodeGetPowerDrawResponse(std::span<const uint8_t> buf,
278                                ocp::accelerator_management::CompletionCode& cc,
279                                uint16_t& reasonCode, uint32_t& power);
280 
281 int encodeGetCurrentEnergyCounterRequest(uint8_t instanceId, uint8_t sensorId,
282                                          std::span<uint8_t> buf);
283 
284 int decodeGetCurrentEnergyCounterResponse(
285     std::span<const uint8_t> buf,
286     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
287     uint64_t& energy);
288 
289 int encodeGetVoltageRequest(uint8_t instanceId, uint8_t sensorId,
290                             std::span<uint8_t> buf);
291 
292 int decodeGetVoltageResponse(std::span<const uint8_t> buf,
293                              ocp::accelerator_management::CompletionCode& cc,
294                              uint16_t& reasonCode, uint32_t& voltage);
295 
296 int encodeGetDriverInformationRequest(uint8_t instanceId,
297                                       std::span<uint8_t> buf);
298 
299 int decodeGetDriverInformationResponse(
300     std::span<const uint8_t> buf,
301     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
302     DriverState& driverState, std::string& driverVersion);
303 
304 int encodeGetInventoryInformationRequest(uint8_t instanceId, uint8_t propertyId,
305                                          std::span<uint8_t> buf);
306 
307 int decodeGetInventoryInformationResponse(
308     std::span<const uint8_t> buf,
309     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
310     InventoryPropertyId propertyId, InventoryValue& value);
311 
312 int encodeQueryScalarGroupTelemetryV2Request(
313     uint8_t instanceId, PciePortType portType, uint8_t upstreamPortNumber,
314     uint8_t portNumber, uint8_t groupId, std::span<uint8_t> buf);
315 
316 int decodeQueryScalarGroupTelemetryV2Response(
317     std::span<const uint8_t> buf,
318     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
319     size_t& numTelemetryValues, std::vector<uint32_t>& telemetryValues);
320 
321 int encodeListPciePortsRequest(uint8_t instanceId, std::span<uint8_t> buf);
322 
323 int decodeListPciePortsResponse(
324     std::span<const uint8_t> buf,
325     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
326     uint16_t& numUpstreamPorts, std::vector<uint8_t>& numDownstreamPorts);
327 
328 int encodeGetPortNetworkAddressesRequest(
329     uint8_t instanceId, uint16_t portNumber, std::span<uint8_t> buf);
330 
331 int decodeGetPortNetworkAddressesResponse(
332     std::span<const uint8_t> buf,
333     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
334     NetworkPortLinkType& linkType,
335     std::vector<std::pair<uint8_t, uint64_t>>& addresses);
336 
337 int encodeGetEthernetPortTelemetryCountersRequest(
338     uint8_t instanceId, uint16_t portNumber, std::span<uint8_t> buf);
339 
340 int decodeGetEthernetPortTelemetryCountersResponse(
341     std::span<const uint8_t> buf,
342     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
343     std::vector<std::pair<uint8_t, uint64_t>>& telemetryValues);
344 } // namespace gpu
345