1 /* 2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & 3 * AFFILIATES. All rights reserved. 4 * SPDX-License-Identifier: Apache-2.0 5 */ 6 7 #pragma once 8 9 #include <OcpMctpVdm.hpp> 10 11 #include <array> 12 #include <cstddef> 13 #include <cstdint> 14 #include <span> 15 #include <string> 16 #include <variant> 17 #include <vector> 18 19 namespace gpu 20 { 21 22 using InventoryValue = std::variant<std::string, std::vector<uint8_t>>; 23 constexpr size_t maxInventoryDataSize = 256; 24 25 constexpr uint16_t nvidiaPciVendorId = 0x10de; 26 27 enum class MessageType : uint8_t 28 { 29 DEVICE_CAPABILITY_DISCOVERY = 0, 30 PLATFORM_ENVIRONMENTAL = 3 31 }; 32 33 enum class DeviceCapabilityDiscoveryCommands : uint8_t 34 { 35 QUERY_DEVICE_IDENTIFICATION = 0x09, 36 }; 37 38 enum class PlatformEnvironmentalCommands : uint8_t 39 { 40 GET_TEMPERATURE_READING = 0x00, 41 READ_THERMAL_PARAMETERS = 0x02, 42 GET_CURRENT_POWER_DRAW = 0x03, 43 GET_CURRENT_ENERGY_COUNTER = 0x06, 44 GET_INVENTORY_INFORMATION = 0x0C, 45 GET_VOLTAGE = 0x0F, 46 }; 47 48 enum class DeviceIdentification : uint8_t 49 { 50 DEVICE_GPU = 0, 51 DEVICE_SMA = 5 52 }; 53 54 enum class InventoryPropertyId : uint8_t 55 { 56 BOARD_PART_NUMBER = 0, 57 SERIAL_NUMBER = 1, 58 MARKETING_NAME = 2, 59 DEVICE_PART_NUMBER = 3, 60 FRU_PART_NUMBER = 4, 61 MEMORY_VENDOR = 5, 62 MEMORY_PART_NUMBER = 6, 63 MAX_MEMORY_CAPACITY = 7, 64 BUILD_DATE = 8, 65 FIRMWARE_VERSION = 9, 66 DEVICE_GUID = 10, 67 INFOROM_VERSION = 11, 68 PRODUCT_LENGTH = 12, 69 PRODUCT_WIDTH = 13, 70 PRODUCT_HEIGHT = 14, 71 RATED_DEVICE_POWER_LIMIT = 15, 72 MIN_DEVICE_POWER_LIMIT = 16, 73 MAX_DEVICE_POWER_LIMIT = 17, 74 MAX_MODULE_POWER_LIMIT = 18, 75 MIN_MODULE_POWER_LIMIT = 19, 76 RATED_MODULE_POWER_LIMIT = 20, 77 DEFAULT_BOOST_CLOCKS = 21, 78 DEFAULT_BASE_CLOCKS = 22, 79 DEFAULT_EDPP_SCALING = 23, 80 MIN_EDPP_SCALING = 24, 81 MAX_EDPP_SCALING = 25, 82 MIN_GRAPHICS_CLOCK = 26, 83 MAX_GRAPHICS_CLOCK = 27, 84 MIN_MEMORY_CLOCK = 28, 85 MAX_MEMORY_CLOCK = 29, 86 INFINIBAND_GUID = 30, 87 RACK_GUID = 31, 88 RACK_SLOT_NUMBER = 32, 89 COMPUTE_SLOT_INDEX = 33, 90 NODE_INDEX = 34, 91 GPU_NODE_ID = 35, 92 NVLINK_PEER_TYPE = 36 93 }; 94 95 struct QueryDeviceIdentificationRequest 96 { 97 ocp::accelerator_management::CommonRequest hdr; 98 } __attribute__((packed)); 99 100 struct QueryDeviceIdentificationResponse 101 { 102 ocp::accelerator_management::CommonResponse hdr; 103 uint8_t device_identification; 104 uint8_t instance_id; 105 } __attribute__((packed)); 106 107 struct GetNumericSensorReadingRequest 108 { 109 ocp::accelerator_management::CommonRequest hdr; 110 uint8_t sensor_id; 111 } __attribute__((packed)); 112 113 using GetTemperatureReadingRequest = GetNumericSensorReadingRequest; 114 115 using ReadThermalParametersRequest = GetNumericSensorReadingRequest; 116 117 struct GetCurrentPowerDrawRequest 118 { 119 ocp::accelerator_management::CommonRequest hdr; 120 uint8_t sensorId; 121 uint8_t averagingInterval; 122 } __attribute__((packed)); 123 124 using GetCurrentEnergyCounterRequest = GetNumericSensorReadingRequest; 125 126 using GetVoltageRequest = GetNumericSensorReadingRequest; 127 128 struct GetTemperatureReadingResponse 129 { 130 ocp::accelerator_management::CommonResponse hdr; 131 int32_t reading; 132 } __attribute__((packed)); 133 134 struct ReadThermalParametersResponse 135 { 136 ocp::accelerator_management::CommonResponse hdr; 137 int32_t threshold; 138 } __attribute__((packed)); 139 140 struct GetCurrentPowerDrawResponse 141 { 142 ocp::accelerator_management::CommonResponse hdr; 143 uint32_t power; 144 } __attribute__((packed)); 145 146 struct GetCurrentEnergyCounterResponse 147 { 148 ocp::accelerator_management::CommonResponse hdr; 149 uint64_t energy; 150 } __attribute__((packed)); 151 152 struct GetVoltageResponse 153 { 154 ocp::accelerator_management::CommonResponse hdr; 155 uint32_t voltage; 156 } __attribute__((packed)); 157 158 struct GetInventoryInformationRequest 159 { 160 ocp::accelerator_management::CommonRequest hdr; 161 uint8_t property_id; 162 } __attribute__((packed)); 163 164 struct GetInventoryInformationResponse 165 { 166 ocp::accelerator_management::CommonResponse hdr; 167 std::array<uint8_t, maxInventoryDataSize> data; 168 } __attribute__((packed)); 169 170 int packHeader(const ocp::accelerator_management::BindingPciVidInfo& hdr, 171 ocp::accelerator_management::BindingPciVid& msg); 172 173 int encodeQueryDeviceIdentificationRequest(uint8_t instanceId, 174 std::span<uint8_t> buf); 175 176 int decodeQueryDeviceIdentificationResponse( 177 std::span<const uint8_t> buf, 178 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode, 179 uint8_t& deviceIdentification, uint8_t& deviceInstance); 180 181 int encodeGetTemperatureReadingRequest(uint8_t instanceId, uint8_t sensorId, 182 std::span<uint8_t> buf); 183 184 int decodeGetTemperatureReadingResponse( 185 std::span<const uint8_t> buf, 186 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode, 187 double& temperatureReading); 188 189 int encodeReadThermalParametersRequest(uint8_t instanceId, uint8_t sensorId, 190 std::span<uint8_t> buf); 191 192 int decodeReadThermalParametersResponse( 193 std::span<const uint8_t> buf, 194 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode, 195 int32_t& threshold); 196 197 int encodeGetCurrentPowerDrawRequest(uint8_t instanceId, uint8_t sensorId, 198 uint8_t averagingInterval, 199 std::span<uint8_t> buf); 200 201 int decodeGetCurrentPowerDrawResponse( 202 std::span<const uint8_t> buf, 203 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode, 204 uint32_t& power); 205 206 int encodeGetCurrentEnergyCounterRequest(uint8_t instanceId, uint8_t sensorId, 207 std::span<uint8_t> buf); 208 209 int decodeGetCurrentEnergyCounterResponse( 210 std::span<const uint8_t> buf, 211 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode, 212 uint64_t& energy); 213 214 int encodeGetVoltageRequest(uint8_t instanceId, uint8_t sensorId, 215 std::span<uint8_t> buf); 216 217 int decodeGetVoltageResponse(std::span<const uint8_t> buf, 218 ocp::accelerator_management::CompletionCode& cc, 219 uint16_t& reasonCode, uint32_t& voltage); 220 221 int encodeGetInventoryInformationRequest(uint8_t instanceId, uint8_t propertyId, 222 std::span<uint8_t> buf); 223 224 int decodeGetInventoryInformationResponse( 225 std::span<const uint8_t> buf, 226 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode, 227 InventoryPropertyId propertyId, InventoryValue& value); 228 229 } // namespace gpu 230