// xref: /openbmc/dbus-sensors/src/nvidia-gpu/NvidiaGpuMctpVdm.hpp (revision 86786b6c21320f3da413b9deed07d2f5360edabd)
/*
 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
 * AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 */
6 
7 #pragma once
8 
9 #include <OcpMctpVdm.hpp>
10 
11 #include <array>
12 #include <cstddef>
13 #include <cstdint>
14 #include <span>
15 #include <string>
16 #include <variant>
17 #include <vector>
18 
19 namespace gpu
20 {
21 
/**
 * @brief Decoded inventory value: either text or a raw byte sequence.
 *
 * This is the output type of decodeGetInventoryInformationResponse().
 */
using InventoryValue = std::variant<std::string, std::vector<uint8_t>>;

/**
 * @brief Size, in bytes, of the data field of
 *        GetInventoryInformationResponse.
 */
inline constexpr size_t maxInventoryDataSize = 256;

/** @brief PCI vendor ID value used for NVIDIA (0x10de). */
inline constexpr uint16_t nvidiaPciVendorId = 0x10de;
26 
/**
 * @brief Message type codes for the message families declared in this header.
 *
 * The underlying type is a single byte. The command codes that belong to each
 * type are listed in DeviceCapabilityDiscoveryCommands and
 * PlatformEnvironmentalCommands.
 */
enum class MessageType : uint8_t
{
    DEVICE_CAPABILITY_DISCOVERY = 0,
    PLATFORM_ENVIRONMENTAL = 3
};
32 
/**
 * @brief Command codes of the MessageType::DEVICE_CAPABILITY_DISCOVERY
 *        message type.
 *
 * Only the command used by this header is listed.
 */
enum class DeviceCapabilityDiscoveryCommands : uint8_t
{
    QUERY_DEVICE_IDENTIFICATION = 0x09,
};
37 
/**
 * @brief Command codes of the MessageType::PLATFORM_ENVIRONMENTAL message
 *        type.
 *
 * Each enumerator has a matching encode/decode function pair declared further
 * down in this header. The values are not contiguous.
 */
enum class PlatformEnvironmentalCommands : uint8_t
{
    GET_TEMPERATURE_READING = 0x00,
    READ_THERMAL_PARAMETERS = 0x02,
    GET_CURRENT_POWER_DRAW = 0x03,
    GET_CURRENT_ENERGY_COUNTER = 0x06,
    GET_INVENTORY_INFORMATION = 0x0C,
    GET_VOLTAGE = 0x0F,
};
47 
/**
 * @brief Device kind codes.
 *
 * NOTE(review): QueryDeviceIdentificationResponse::device_identification
 * presumably carries one of these values - confirm against the
 * implementation.
 */
enum class DeviceIdentification : uint8_t
{
    DEVICE_GPU = 0,
    DEVICE_SMA = 5
};
53 
/**
 * @brief Identifiers of the inventory properties known to this header.
 *
 * decodeGetInventoryInformationResponse() takes one of these values together
 * with the response buffer. The values are contiguous, from 0 to 36, and are
 * spelled out explicitly so that the numeric value of each property is
 * visible at a glance.
 */
enum class InventoryPropertyId : uint8_t
{
    BOARD_PART_NUMBER = 0,
    SERIAL_NUMBER = 1,
    MARKETING_NAME = 2,
    DEVICE_PART_NUMBER = 3,
    FRU_PART_NUMBER = 4,
    MEMORY_VENDOR = 5,
    MEMORY_PART_NUMBER = 6,
    MAX_MEMORY_CAPACITY = 7,
    BUILD_DATE = 8,
    FIRMWARE_VERSION = 9,
    DEVICE_GUID = 10,
    INFOROM_VERSION = 11,
    PRODUCT_LENGTH = 12,
    PRODUCT_WIDTH = 13,
    PRODUCT_HEIGHT = 14,
    RATED_DEVICE_POWER_LIMIT = 15,
    MIN_DEVICE_POWER_LIMIT = 16,
    MAX_DEVICE_POWER_LIMIT = 17,
    MAX_MODULE_POWER_LIMIT = 18,
    MIN_MODULE_POWER_LIMIT = 19,
    RATED_MODULE_POWER_LIMIT = 20,
    DEFAULT_BOOST_CLOCKS = 21,
    DEFAULT_BASE_CLOCKS = 22,
    DEFAULT_EDPP_SCALING = 23,
    MIN_EDPP_SCALING = 24,
    MAX_EDPP_SCALING = 25,
    MIN_GRAPHICS_CLOCK = 26,
    MAX_GRAPHICS_CLOCK = 27,
    MIN_MEMORY_CLOCK = 28,
    MAX_MEMORY_CLOCK = 29,
    INFINIBAND_GUID = 30,
    RACK_GUID = 31,
    RACK_SLOT_NUMBER = 32,
    COMPUTE_SLOT_INDEX = 33,
    NODE_INDEX = 34,
    GPU_NODE_ID = 35,
    NVLINK_PEER_TYPE = 36
};
94 
95 struct QueryDeviceIdentificationRequest
96 {
97     ocp::accelerator_management::CommonRequest hdr;
98 } __attribute__((packed));
99 
100 struct QueryDeviceIdentificationResponse
101 {
102     ocp::accelerator_management::CommonResponse hdr;
103     uint8_t device_identification;
104     uint8_t instance_id;
105 } __attribute__((packed));
106 
107 struct GetNumericSensorReadingRequest
108 {
109     ocp::accelerator_management::CommonRequest hdr;
110     uint8_t sensor_id;
111 } __attribute__((packed));
112 
113 using GetTemperatureReadingRequest = GetNumericSensorReadingRequest;
114 
115 using ReadThermalParametersRequest = GetNumericSensorReadingRequest;
116 
117 struct GetCurrentPowerDrawRequest
118 {
119     ocp::accelerator_management::CommonRequest hdr;
120     uint8_t sensorId;
121     uint8_t averagingInterval;
122 } __attribute__((packed));
123 
124 using GetCurrentEnergyCounterRequest = GetNumericSensorReadingRequest;
125 
126 using GetVoltageRequest = GetNumericSensorReadingRequest;
127 
128 struct GetTemperatureReadingResponse
129 {
130     ocp::accelerator_management::CommonResponse hdr;
131     int32_t reading;
132 } __attribute__((packed));
133 
134 struct ReadThermalParametersResponse
135 {
136     ocp::accelerator_management::CommonResponse hdr;
137     int32_t threshold;
138 } __attribute__((packed));
139 
140 struct GetCurrentPowerDrawResponse
141 {
142     ocp::accelerator_management::CommonResponse hdr;
143     uint32_t power;
144 } __attribute__((packed));
145 
146 struct GetCurrentEnergyCounterResponse
147 {
148     ocp::accelerator_management::CommonResponse hdr;
149     uint64_t energy;
150 } __attribute__((packed));
151 
152 struct GetVoltageResponse
153 {
154     ocp::accelerator_management::CommonResponse hdr;
155     uint32_t voltage;
156 } __attribute__((packed));
157 
158 struct GetInventoryInformationRequest
159 {
160     ocp::accelerator_management::CommonRequest hdr;
161     uint8_t property_id;
162 } __attribute__((packed));
163 
164 struct GetInventoryInformationResponse
165 {
166     ocp::accelerator_management::CommonResponse hdr;
167     std::array<uint8_t, maxInventoryDataSize> data;
168 } __attribute__((packed));
169 
170 int packHeader(const ocp::accelerator_management::BindingPciVidInfo& hdr,
171                ocp::accelerator_management::BindingPciVid& msg);
172 
173 int encodeQueryDeviceIdentificationRequest(uint8_t instanceId,
174                                            std::span<uint8_t> buf);
175 
176 int decodeQueryDeviceIdentificationResponse(
177     std::span<const uint8_t> buf,
178     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
179     uint8_t& deviceIdentification, uint8_t& deviceInstance);
180 
181 int encodeGetTemperatureReadingRequest(uint8_t instanceId, uint8_t sensorId,
182                                        std::span<uint8_t> buf);
183 
184 int decodeGetTemperatureReadingResponse(
185     std::span<const uint8_t> buf,
186     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
187     double& temperatureReading);
188 
189 int encodeReadThermalParametersRequest(uint8_t instanceId, uint8_t sensorId,
190                                        std::span<uint8_t> buf);
191 
192 int decodeReadThermalParametersResponse(
193     std::span<const uint8_t> buf,
194     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
195     int32_t& threshold);
196 
197 int encodeGetCurrentPowerDrawRequest(uint8_t instanceId, uint8_t sensorId,
198                                      uint8_t averagingInterval,
199                                      std::span<uint8_t> buf);
200 
201 int decodeGetCurrentPowerDrawResponse(
202     std::span<const uint8_t> buf,
203     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
204     uint32_t& power);
205 
206 int encodeGetCurrentEnergyCounterRequest(uint8_t instanceId, uint8_t sensorId,
207                                          std::span<uint8_t> buf);
208 
209 int decodeGetCurrentEnergyCounterResponse(
210     std::span<const uint8_t> buf,
211     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
212     uint64_t& energy);
213 
214 int encodeGetVoltageRequest(uint8_t instanceId, uint8_t sensorId,
215                             std::span<uint8_t> buf);
216 
217 int decodeGetVoltageResponse(std::span<const uint8_t> buf,
218                              ocp::accelerator_management::CompletionCode& cc,
219                              uint16_t& reasonCode, uint32_t& voltage);
220 
221 int encodeGetInventoryInformationRequest(uint8_t instanceId, uint8_t propertyId,
222                                          std::span<uint8_t> buf);
223 
224 int decodeGetInventoryInformationResponse(
225     std::span<const uint8_t> buf,
226     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
227     InventoryPropertyId propertyId, InventoryValue& value);
228 
229 } // namespace gpu
230