xref: /openbmc/dbus-sensors/src/nvidia-gpu/NvidiaGpuMctpVdm.hpp (revision b139302ca2dd5f55253fb05bdc72ecef0b2280d8)
1 /*
2  * SPDX-FileCopyrightText: Copyright OpenBMC Authors
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
8 #include <OcpMctpVdm.hpp>
9 
10 #include <array>
11 #include <cstddef>
12 #include <cstdint>
13 #include <span>
14 #include <string>
15 #include <utility>
16 #include <variant>
17 #include <vector>
18 
19 namespace gpu
20 {
21 
// Value of a single inventory property: a string, a raw byte vector, or a
// 32-bit unsigned integer. This is the type of the `value` parameter of
// decodeGetInventoryInformationResponse().
22 using InventoryValue =
23     std::variant<std::string, std::vector<uint8_t>, uint32_t>;
// Element count (bytes) of the `data` array in GetInventoryInformationResponse.
24 constexpr size_t maxInventoryDataSize = 256;
25 
// PCI vendor ID value 0x10de (NVIDIA).
// NOTE(review): presumably placed in the PCI-vendor-ID binding header built by
// packHeader(); no use is visible in this header - confirm.
26 constexpr uint16_t nvidiaPciVendorId = 0x10de;
27 
// One-byte message type codes (0-3). The enumerator names parallel the
// per-group command enums declared in this header
// (DeviceCapabilityDiscoveryCommands, NetworkPortCommands, PcieLinkCommands,
// PlatformEnvironmentalCommands).
28 enum class MessageType : uint8_t
29 {
30     DEVICE_CAPABILITY_DISCOVERY = 0,
31     NETWORK_PORT = 1,
32     PCIE_LINK = 2,
33     PLATFORM_ENVIRONMENTAL = 3
34 };
35 
// One-byte command codes of the device-capability-discovery group. Only
// QUERY_DEVICE_IDENTIFICATION (0x09) is defined here.
// NOTE(review): presumably sent with
// MessageType::DEVICE_CAPABILITY_DISCOVERY - confirm against the encoder.
36 enum class DeviceCapabilityDiscoveryCommands : uint8_t
37 {
38     QUERY_DEVICE_IDENTIFICATION = 0x09,
39 };
40 
// One-byte command codes of the platform-environmental group. The values are
// not contiguous: 0x01, 0x05, 0x07-0x0B and 0x0D are not defined here.
// encodeGetPowerDrawRequest() takes one of these codes as its `commandCode`
// argument.
// NOTE(review): presumably only GET_CURRENT_POWER_DRAW / GET_MAX_OBSERVED_POWER
// are meaningful for that encoder - confirm against the implementation.
41 enum class PlatformEnvironmentalCommands : uint8_t
42 {
43     GET_TEMPERATURE_READING = 0x00,
44     READ_THERMAL_PARAMETERS = 0x02,
45     GET_CURRENT_POWER_DRAW = 0x03,
46     GET_MAX_OBSERVED_POWER = 0x04,
47     GET_CURRENT_ENERGY_COUNTER = 0x06,
48     GET_INVENTORY_INFORMATION = 0x0C,
49     GET_DRIVER_INFORMATION = 0x0E,
50     GET_VOLTAGE = 0x0F,
51 };
52 
// One-byte command codes of the network-port group.
// NOTE(review): enumerators here use PascalCase while the enums above use
// UPPER_SNAKE_CASE; renaming would break callers, so it is only noted.
53 enum class NetworkPortCommands : uint8_t
54 {
55     GetEthernetPortTelemetryCounters = 0x0F,
56     GetPortNetworkAddresses = 0x11,
57 };
58 
// One-byte command codes of the PCIe-link group.
// NOTE(review): enumerators here use PascalCase, unlike the UPPER_SNAKE_CASE
// enumerators of PlatformEnvironmentalCommands; noted only, not changed.
59 enum class PcieLinkCommands : uint8_t
60 {
61     ListPCIePorts = 0x07,
62     QueryScalarGroupTelemetryV2 = 0x24,
63 };
64 
// One-byte device kind codes. Values 1, 3 and 4 are not defined here.
// NOTE(review): QueryDeviceIdentificationResponse::device_identification is a
// raw uint8_t that presumably carries one of these values - confirm.
65 enum class DeviceIdentification : uint8_t
66 {
67     DEVICE_GPU = 0,
68     DEVICE_PCIE = 2,
69     DEVICE_SMA = 5
70 };
71 
// One-byte identifiers of inventory properties, numbered contiguously 0-36.
// decodeGetInventoryInformationResponse() takes one of these as `propertyId`;
// encodeGetInventoryInformationRequest() takes the property as a raw uint8_t.
// NOTE(review): which InventoryValue alternative (string / bytes / uint32_t)
// each property decodes to is decided by the decoder implementation, which is
// not part of this header.
72 enum class InventoryPropertyId : uint8_t
73 {
74     BOARD_PART_NUMBER = 0,
75     SERIAL_NUMBER = 1,
76     MARKETING_NAME = 2,
77     DEVICE_PART_NUMBER = 3,
78     FRU_PART_NUMBER = 4,
79     MEMORY_VENDOR = 5,
80     MEMORY_PART_NUMBER = 6,
81     MAX_MEMORY_CAPACITY = 7,
82     BUILD_DATE = 8,
83     FIRMWARE_VERSION = 9,
84     DEVICE_GUID = 10,
85     INFOROM_VERSION = 11,
86     PRODUCT_LENGTH = 12,
87     PRODUCT_WIDTH = 13,
88     PRODUCT_HEIGHT = 14,
89     RATED_DEVICE_POWER_LIMIT = 15,
90     MIN_DEVICE_POWER_LIMIT = 16,
91     MAX_DEVICE_POWER_LIMIT = 17,
92     MAX_MODULE_POWER_LIMIT = 18,
93     MIN_MODULE_POWER_LIMIT = 19,
94     RATED_MODULE_POWER_LIMIT = 20,
95     DEFAULT_BOOST_CLOCKS = 21,
96     DEFAULT_BASE_CLOCKS = 22,
97     DEFAULT_EDPP_SCALING = 23,
98     MIN_EDPP_SCALING = 24,
99     MAX_EDPP_SCALING = 25,
100     MIN_GRAPHICS_CLOCK = 26,
101     MAX_GRAPHICS_CLOCK = 27,
102     MIN_MEMORY_CLOCK = 28,
103     MAX_MEMORY_CLOCK = 29,
104     INFINIBAND_GUID = 30,
105     RACK_GUID = 31,
106     RACK_SLOT_NUMBER = 32,
107     COMPUTE_SLOT_INDEX = 33,
108     NODE_INDEX = 34,
109     GPU_NODE_ID = 35,
110     NVLINK_PEER_TYPE = 36
111 };
112 
// One-byte PCIe port direction selector (upstream = 0, downstream = 1).
// Used as the `portType` argument of
// encodeQueryScalarGroupTelemetryV2Request().
113 enum class PciePortType : uint8_t
114 {
115     UPSTREAM = 0,
116     DOWNSTREAM = 1,
117 };
118 
// One-byte driver state code. It is the type of
// GetDriverInformationResponse::driverState and of the `driverState`
// parameter of decodeGetDriverInformationResponse().
119 enum class DriverState : uint8_t
120 {
121     DRIVER_STATE_UNKNOWN = 0,
122     DRIVER_STATE_NOT_LOADED = 1,
123     DRIVER_STATE_LOADED = 2,
124 };
125 
// One-byte link type of a network port; UNKNOWN uses the value 0xFF.
// It is the type of the `linkType` parameter of
// decodeGetPortNetworkAddressesResponse().
126 enum class NetworkPortLinkType : uint8_t
127 {
128     ETHERNET = 0,
129     INFINIBAND = 1,
130     UNKNOWN = 0xFF,
131 };
132 
// Packed request layout consisting solely of the common request header; it
// carries no payload fields.
133 struct QueryDeviceIdentificationRequest
134 {
135     ocp::accelerator_management::CommonRequest hdr;
136 } __attribute__((packed));
137 
// Packed response layout: common response header followed by two one-byte
// fields. The matching decoder exposes them as `deviceIdentification` and
// `deviceInstance`.
// NOTE(review): device_identification presumably holds a DeviceIdentification
// value - confirm against the decoder.
138 struct QueryDeviceIdentificationResponse
139 {
140     ocp::accelerator_management::CommonResponse hdr;
141     uint8_t device_identification;
142     uint8_t instance_id;
143 } __attribute__((packed));
144 
// Packed request layout: common request header followed by a one-byte sensor
// id. This header reuses the type via aliases for the temperature-reading,
// thermal-parameters, energy-counter and voltage requests.
145 struct GetNumericSensorReadingRequest
146 {
147     ocp::accelerator_management::CommonRequest hdr;
148     uint8_t sensor_id;
149 } __attribute__((packed));
150 
// The temperature-reading request uses the generic header + sensor-id layout.
151 using GetTemperatureReadingRequest = GetNumericSensorReadingRequest;
152 
// The thermal-parameters request uses the generic header + sensor-id layout.
153 using ReadThermalParametersRequest = GetNumericSensorReadingRequest;
154 
// Packed request layout: common request header, one-byte sensor id, one-byte
// averaging interval.
// NOTE(review): the unit/encoding of averagingInterval is not visible in this
// header. Field names here are camelCase while GetNumericSensorReadingRequest
// uses snake_case (sensor_id); noted only, since renaming would break callers.
155 struct GetPowerDrawRequest
156 {
157     ocp::accelerator_management::CommonRequest hdr;
158     uint8_t sensorId;
159     uint8_t averagingInterval;
160 } __attribute__((packed));
161 
// The energy-counter request uses the generic header + sensor-id layout.
162 using GetCurrentEnergyCounterRequest = GetNumericSensorReadingRequest;
163 
// The voltage request uses the generic header + sensor-id layout.
164 using GetVoltageRequest = GetNumericSensorReadingRequest;
165 
// Packed request layout: common request header followed by three one-byte
// fields (upstream port number, port number, telemetry group id).
// NOTE(review): encodeQueryScalarGroupTelemetryV2Request() also takes a
// PciePortType, which has no dedicated field here; how it is folded into
// these bytes is defined by the encoder implementation - confirm there.
166 struct QueryScalarGroupTelemetryV2Request
167 {
168     ocp::accelerator_management::CommonRequest hdr;
169     uint8_t upstreamPortNumber;
170     uint8_t portNumber;
171     uint8_t groupId;
172 } __attribute__((packed));
173 
// Packed request layout: common request header followed by a 16-bit port
// number.
// NOTE(review): any byte-order conversion of portNumber happens in the
// encoder, not here - confirm against the implementation.
174 struct GetPortNetworkAddressesRequest
175 {
176     ocp::accelerator_management::CommonRequest hdr;
177     uint16_t portNumber;
178 } __attribute__((packed));
179 
// Packed request layout: common request header followed by a 16-bit port
// number. The field layout is the same as GetPortNetworkAddressesRequest, but
// it is declared as a distinct type.
180 struct GetEthernetPortTelemetryCountersRequest
181 {
182     ocp::accelerator_management::CommonRequest hdr;
183     uint16_t portNumber;
184 } __attribute__((packed));
185 
// Packed response layout: common response header followed by a signed 32-bit
// raw reading.
// NOTE(review): decodeGetTemperatureReadingResponse() reports the temperature
// as a double; the scaling from this raw integer is done in the decoder and is
// not visible in this header.
186 struct GetTemperatureReadingResponse
187 {
188     ocp::accelerator_management::CommonResponse hdr;
189     int32_t reading;
190 } __attribute__((packed));
191 
// Packed response layout: common response header followed by a signed 32-bit
// threshold. The matching decoder also exposes the threshold as int32_t.
// NOTE(review): the unit of the threshold is not stated in this header.
192 struct ReadThermalParametersResponse
193 {
194     ocp::accelerator_management::CommonResponse hdr;
195     int32_t threshold;
196 } __attribute__((packed));
197 
// Packed response layout: common response header followed by an unsigned
// 32-bit power value.
// NOTE(review): the unit of `power` is not stated in this header.
198 struct GetPowerDrawResponse
199 {
200     ocp::accelerator_management::CommonResponse hdr;
201     uint32_t power;
202 } __attribute__((packed));
203 
// Packed response layout: common response header followed by an unsigned
// 64-bit energy counter.
// NOTE(review): the unit of `energy` is not stated in this header.
204 struct GetCurrentEnergyCounterResponse
205 {
206     ocp::accelerator_management::CommonResponse hdr;
207     uint64_t energy;
208 } __attribute__((packed));
209 
// Packed response layout: common response header followed by an unsigned
// 32-bit voltage value.
// NOTE(review): the unit of `voltage` is not stated in this header.
210 struct GetVoltageResponse
211 {
212     ocp::accelerator_management::CommonResponse hdr;
213     uint32_t voltage;
214 } __attribute__((packed));
215 
// Packed layout of the fixed leading part of the list-PCIe-ports response:
// common response header followed by a 16-bit upstream port count.
// NOTE(review): decodeListPciePortsResponse() also yields a per-port vector,
// so ListPCIePortsDownstreamPortsData entries presumably follow this struct
// in the message - confirm against the decoder.
216 struct ListPCIePortsResponse
217 {
218     ocp::accelerator_management::CommonResponse hdr;
219     uint16_t numUpstreamPorts;
220 } __attribute__((packed));
221 
// Packed two-byte record with an `isInternal` flag byte and a `count` byte.
// It has no message header of its own.
// NOTE(review): presumably repeated after ListPCIePortsResponse, one or more
// records per upstream port - confirm against decodeListPciePortsResponse().
222 struct ListPCIePortsDownstreamPortsData
223 {
224     uint8_t isInternal;
225     uint8_t count;
226 } __attribute__((packed));
227 
// Packed response layout: common response header, one-byte DriverState, then
// a single `char`.
// NOTE(review): the decoder returns the version as a std::string, so
// `driverVersion` presumably marks the first byte of a variable-length string
// that extends past this struct - confirm against
// decodeGetDriverInformationResponse() before relying on sizeof().
228 struct GetDriverInformationResponse
229 {
230     ocp::accelerator_management::CommonResponse hdr;
231     DriverState driverState;
232     char driverVersion;
233 } __attribute__((packed));
234 
// Packed request layout: common request header followed by a one-byte
// property id.
// NOTE(review): property_id presumably carries an InventoryPropertyId value
// as its underlying uint8_t - confirm against the encoder.
235 struct GetInventoryInformationRequest
236 {
237     ocp::accelerator_management::CommonRequest hdr;
238     uint8_t property_id;
239 } __attribute__((packed));
240 
// Packed response layout: common response header followed by a fixed
// maxInventoryDataSize-byte (256) data array.
// NOTE(review): this struct does not say how many bytes of `data` are valid
// for a given response; presumably the header or the message length carries
// that - confirm against the decoder.
241 struct GetInventoryInformationResponse
242 {
243     ocp::accelerator_management::CommonResponse hdr;
244     std::array<uint8_t, maxInventoryDataSize> data;
245 } __attribute__((packed));
246 
// Declaration only; the definition is not in this header.
// Takes header information `hdr` by const reference and a binding header
// `msg` by non-const reference (presumably filled from `hdr` - confirm).
// Returns an int status whose meaning is defined by the implementation
// (presumably 0 on success - confirm).
247 int packHeader(const ocp::accelerator_management::BindingPciVidInfo& hdr,
248                ocp::accelerator_management::BindingPciVid& msg);
249 
// Declaration only. Encodes a query-device-identification request for
// `instanceId` into the caller-provided byte span `buf`.
// Returns an int status defined by the implementation (presumably 0 on
// success and non-zero when `buf` is too small - confirm).
250 int encodeQueryDeviceIdentificationRequest(uint8_t instanceId,
251                                            std::span<uint8_t> buf);
252 
// Declaration only. Decodes a query-device-identification response from the
// read-only byte span `buf`.
// Results are passed back through the non-const references: completion code
// `cc`, `reasonCode`, `deviceIdentification` and `deviceInstance`.
// Returns an int status defined by the implementation (presumably 0 on
// success - confirm).
253 int decodeQueryDeviceIdentificationResponse(
254     std::span<const uint8_t> buf,
255     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
256     uint8_t& deviceIdentification, uint8_t& deviceInstance);
257 
// Declaration only. Encodes a get-temperature-reading request for sensor
// `sensorId` (message instance `instanceId`) into the byte span `buf`.
// Returns an int status defined by the implementation (presumably 0 on
// success - confirm).
258 int encodeGetTemperatureReadingRequest(uint8_t instanceId, uint8_t sensorId,
259                                        std::span<uint8_t> buf);
260 
// Declaration only. Decodes a get-temperature-reading response from `buf`.
// Results are passed back through `cc`, `reasonCode` and `temperatureReading`.
// NOTE(review): the wire struct stores an int32_t reading while the output is
// a double; the scaling is applied by the implementation and the resulting
// unit is not stated in this header.
// Returns an int status defined by the implementation.
261 int decodeGetTemperatureReadingResponse(
262     std::span<const uint8_t> buf,
263     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
264     double& temperatureReading);
265 
// Declaration only. Encodes a read-thermal-parameters request for sensor
// `sensorId` (message instance `instanceId`) into the byte span `buf`.
// Returns an int status defined by the implementation (presumably 0 on
// success - confirm).
266 int encodeReadThermalParametersRequest(uint8_t instanceId, uint8_t sensorId,
267                                        std::span<uint8_t> buf);
268 
// Declaration only. Decodes a read-thermal-parameters response from `buf`.
// Results are passed back through `cc`, `reasonCode` and `threshold`
// (int32_t, the same width as the wire field).
// Returns an int status defined by the implementation.
269 int decodeReadThermalParametersResponse(
270     std::span<const uint8_t> buf,
271     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
272     int32_t& threshold);
273 
// Declaration only. Encodes a power-draw request into `buf`. The command is
// selected by `commandCode`, so one encoder can serve more than one
// PlatformEnvironmentalCommands value (presumably GET_CURRENT_POWER_DRAW and
// GET_MAX_OBSERVED_POWER - confirm).
// `sensorId` and `averagingInterval` match the payload fields of
// GetPowerDrawRequest.
// Returns an int status defined by the implementation.
274 int encodeGetPowerDrawRequest(
275     PlatformEnvironmentalCommands commandCode, uint8_t instanceId,
276     uint8_t sensorId, uint8_t averagingInterval, std::span<uint8_t> buf);
277 
// Declaration only. Decodes a power-draw response from `buf`.
// Results are passed back through `cc`, `reasonCode` and `power` (uint32_t;
// unit not stated in this header).
// Returns an int status defined by the implementation.
278 int decodeGetPowerDrawResponse(std::span<const uint8_t> buf,
279                                ocp::accelerator_management::CompletionCode& cc,
280                                uint16_t& reasonCode, uint32_t& power);
281 
// Declaration only. Encodes a get-current-energy-counter request for sensor
// `sensorId` (message instance `instanceId`) into the byte span `buf`.
// Returns an int status defined by the implementation (presumably 0 on
// success - confirm).
282 int encodeGetCurrentEnergyCounterRequest(uint8_t instanceId, uint8_t sensorId,
283                                          std::span<uint8_t> buf);
284 
// Declaration only. Decodes a get-current-energy-counter response from `buf`.
// Results are passed back through `cc`, `reasonCode` and `energy` (uint64_t;
// unit not stated in this header).
// Returns an int status defined by the implementation.
285 int decodeGetCurrentEnergyCounterResponse(
286     std::span<const uint8_t> buf,
287     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
288     uint64_t& energy);
289 
// Declaration only. Encodes a get-voltage request for sensor `sensorId`
// (message instance `instanceId`) into the byte span `buf`.
// Returns an int status defined by the implementation (presumably 0 on
// success - confirm).
290 int encodeGetVoltageRequest(uint8_t instanceId, uint8_t sensorId,
291                             std::span<uint8_t> buf);
292 
// Declaration only. Decodes a get-voltage response from `buf`.
// Results are passed back through `cc`, `reasonCode` and `voltage` (uint32_t;
// unit not stated in this header).
// Returns an int status defined by the implementation.
293 int decodeGetVoltageResponse(std::span<const uint8_t> buf,
294                              ocp::accelerator_management::CompletionCode& cc,
295                              uint16_t& reasonCode, uint32_t& voltage);
296 
// Declaration only. Encodes a get-driver-information request (message
// instance `instanceId`, no further arguments) into the byte span `buf`.
// Returns an int status defined by the implementation (presumably 0 on
// success - confirm).
297 int encodeGetDriverInformationRequest(uint8_t instanceId,
298                                       std::span<uint8_t> buf);
299 
// Declaration only. Decodes a get-driver-information response from `buf`.
// Results are passed back through `cc`, `reasonCode`, `driverState` and the
// version string `driverVersion`.
// NOTE(review): how the version string's length is determined (message size,
// NUL terminator, ...) is up to the implementation - confirm there.
// Returns an int status defined by the implementation.
300 int decodeGetDriverInformationResponse(
301     std::span<const uint8_t> buf,
302     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
303     DriverState& driverState, std::string& driverVersion);
304 
// Declaration only. Encodes a get-inventory-information request for property
// `propertyId` into `buf`. The property is taken as a raw uint8_t here, while
// the matching decoder takes an InventoryPropertyId.
// Returns an int status defined by the implementation (presumably 0 on
// success - confirm).
305 int encodeGetInventoryInformationRequest(uint8_t instanceId, uint8_t propertyId,
306                                          std::span<uint8_t> buf);
307 
// Declaration only. Decodes a get-inventory-information response from `buf`.
// `propertyId` is passed by value, so it is an input (presumably it selects
// how the payload is interpreted - confirm).
// Results are passed back through `cc`, `reasonCode` and `value`, an
// InventoryValue holding a string, a byte vector or a uint32_t.
// Returns an int status defined by the implementation.
308 int decodeGetInventoryInformationResponse(
309     std::span<const uint8_t> buf,
310     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
311     InventoryPropertyId propertyId, InventoryValue& value);
312 
// Declaration only. Encodes a query-scalar-group-telemetry-v2 request into
// `buf` from the port type, upstream port number, port number and group id.
// NOTE(review): QueryScalarGroupTelemetryV2Request has no separate field for
// `portType`; how it is packed is defined by the implementation - confirm.
// Returns an int status defined by the implementation.
313 int encodeQueryScalarGroupTelemetryV2Request(
314     uint8_t instanceId, PciePortType portType, uint8_t upstreamPortNumber,
315     uint8_t portNumber, uint8_t groupId, std::span<uint8_t> buf);
316 
// Declaration only. Decodes a query-scalar-group-telemetry-v2 response from
// `buf`.
// Results are passed back through `cc`, `reasonCode`, `numTelemetryValues`
// and `telemetryValues` (32-bit values).
// NOTE(review): whether `telemetryValues` is resized/cleared by the decoder
// or must be pre-sized by the caller is not visible here - confirm.
// Returns an int status defined by the implementation.
317 int decodeQueryScalarGroupTelemetryV2Response(
318     std::span<const uint8_t> buf,
319     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
320     size_t& numTelemetryValues, std::vector<uint32_t>& telemetryValues);
321 
// Declaration only. Encodes a list-PCIe-ports request (message instance
// `instanceId`, no further arguments) into the byte span `buf`.
// Returns an int status defined by the implementation (presumably 0 on
// success - confirm).
322 int encodeListPciePortsRequest(uint8_t instanceId, std::span<uint8_t> buf);
323 
// Declaration only. Decodes a list-PCIe-ports response from `buf`.
// Results are passed back through `cc`, `reasonCode`, `numUpstreamPorts` and
// the vector `numDownstreamPorts`.
// NOTE(review): presumably `numDownstreamPorts` gets one entry per upstream
// port - confirm against the implementation.
// Returns an int status defined by the implementation.
324 int decodeListPciePortsResponse(
325     std::span<const uint8_t> buf,
326     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
327     uint16_t& numUpstreamPorts, std::vector<uint8_t>& numDownstreamPorts);
328 
// Declaration only. Encodes a get-port-network-addresses request for the
// 16-bit `portNumber` (message instance `instanceId`) into `buf`.
// Returns an int status defined by the implementation (presumably 0 on
// success - confirm).
329 int encodeGetPortNetworkAddressesRequest(
330     uint8_t instanceId, uint16_t portNumber, std::span<uint8_t> buf);
331 
// Declaration only. Decodes a get-port-network-addresses response from `buf`.
// Results are passed back through `cc`, `reasonCode`, `linkType` and
// `addresses`, a list of (uint8_t, uint64_t) pairs.
// NOTE(review): presumably each pair is (tag/address kind, address value) -
// the meaning of the two members is defined by the implementation; confirm.
// Returns an int status defined by the implementation.
332 int decodeGetPortNetworkAddressesResponse(
333     std::span<const uint8_t> buf,
334     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
335     NetworkPortLinkType& linkType,
336     std::vector<std::pair<uint8_t, uint64_t>>& addresses);
337 
// Declaration only. Encodes a get-ethernet-port-telemetry-counters request
// for the 16-bit `portNumber` (message instance `instanceId`) into `buf`.
// Returns an int status defined by the implementation (presumably 0 on
// success - confirm).
338 int encodeGetEthernetPortTelemetryCountersRequest(
339     uint8_t instanceId, uint16_t portNumber, std::span<uint8_t> buf);
340 
// Declaration only. Decodes a get-ethernet-port-telemetry-counters response
// from `buf`.
// Results are passed back through `cc`, `reasonCode` and `telemetryValues`, a
// list of (uint8_t, uint64_t) pairs.
// NOTE(review): presumably each pair is (counter tag, counter value) - the
// meaning of the two members is defined by the implementation; confirm.
// Returns an int status defined by the implementation.
341 int decodeGetEthernetPortTelemetryCountersResponse(
342     std::span<const uint8_t> buf,
343     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
344     std::vector<std::pair<uint8_t, uint64_t>>& telemetryValues);
345 } // namespace gpu
346