1 /* 2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & 3 * AFFILIATES. All rights reserved. 4 * SPDX-License-Identifier: Apache-2.0 5 */ 6 7 #pragma once 8 9 #include <OcpMctpVdm.hpp> 10 11 #include <array> 12 #include <cstddef> 13 #include <cstdint> 14 #include <span> 15 #include <string> 16 #include <variant> 17 #include <vector> 18 19 namespace gpu 20 { 21 22 using InventoryValue = std::variant<std::string, std::vector<uint8_t>>; 23 constexpr size_t maxInventoryDataSize = 256; 24 25 constexpr uint16_t nvidiaPciVendorId = 0x10de; 26 27 enum class MessageType : uint8_t 28 { 29 DEVICE_CAPABILITY_DISCOVERY = 0, 30 PLATFORM_ENVIRONMENTAL = 3 31 }; 32 33 enum class DeviceCapabilityDiscoveryCommands : uint8_t 34 { 35 QUERY_DEVICE_IDENTIFICATION = 0x09, 36 }; 37 38 enum class PlatformEnvironmentalCommands : uint8_t 39 { 40 GET_TEMPERATURE_READING = 0x00, 41 READ_THERMAL_PARAMETERS = 0x02, 42 GET_CURRENT_POWER_DRAW = 0x03, 43 GET_MAX_OBSERVED_POWER = 0x04, 44 GET_CURRENT_ENERGY_COUNTER = 0x06, 45 GET_INVENTORY_INFORMATION = 0x0C, 46 GET_VOLTAGE = 0x0F, 47 }; 48 49 enum class DeviceIdentification : uint8_t 50 { 51 DEVICE_GPU = 0, 52 DEVICE_SMA = 5 53 }; 54 55 enum class InventoryPropertyId : uint8_t 56 { 57 BOARD_PART_NUMBER = 0, 58 SERIAL_NUMBER = 1, 59 MARKETING_NAME = 2, 60 DEVICE_PART_NUMBER = 3, 61 FRU_PART_NUMBER = 4, 62 MEMORY_VENDOR = 5, 63 MEMORY_PART_NUMBER = 6, 64 MAX_MEMORY_CAPACITY = 7, 65 BUILD_DATE = 8, 66 FIRMWARE_VERSION = 9, 67 DEVICE_GUID = 10, 68 INFOROM_VERSION = 11, 69 PRODUCT_LENGTH = 12, 70 PRODUCT_WIDTH = 13, 71 PRODUCT_HEIGHT = 14, 72 RATED_DEVICE_POWER_LIMIT = 15, 73 MIN_DEVICE_POWER_LIMIT = 16, 74 MAX_DEVICE_POWER_LIMIT = 17, 75 MAX_MODULE_POWER_LIMIT = 18, 76 MIN_MODULE_POWER_LIMIT = 19, 77 RATED_MODULE_POWER_LIMIT = 20, 78 DEFAULT_BOOST_CLOCKS = 21, 79 DEFAULT_BASE_CLOCKS = 22, 80 DEFAULT_EDPP_SCALING = 23, 81 MIN_EDPP_SCALING = 24, 82 MAX_EDPP_SCALING = 25, 83 MIN_GRAPHICS_CLOCK = 26, 84 MAX_GRAPHICS_CLOCK = 27, 85 MIN_MEMORY_CLOCK = 28, 86 MAX_MEMORY_CLOCK = 29, 87 INFINIBAND_GUID = 30, 88 RACK_GUID = 31, 89 RACK_SLOT_NUMBER = 32, 90 COMPUTE_SLOT_INDEX = 33, 91 NODE_INDEX = 34, 92 GPU_NODE_ID = 35, 93 NVLINK_PEER_TYPE = 36 94 }; 95 96 struct QueryDeviceIdentificationRequest 97 { 98 ocp::accelerator_management::CommonRequest hdr; 99 } __attribute__((packed)); 100 101 struct QueryDeviceIdentificationResponse 102 { 103 ocp::accelerator_management::CommonResponse hdr; 104 uint8_t device_identification; 105 uint8_t instance_id; 106 } __attribute__((packed)); 107 108 struct GetNumericSensorReadingRequest 109 { 110 ocp::accelerator_management::CommonRequest hdr; 111 uint8_t sensor_id; 112 } __attribute__((packed)); 113 114 using GetTemperatureReadingRequest = GetNumericSensorReadingRequest; 115 116 using ReadThermalParametersRequest = GetNumericSensorReadingRequest; 117 118 struct GetPowerDrawRequest 119 { 120 ocp::accelerator_management::CommonRequest hdr; 121 uint8_t sensorId; 122 uint8_t averagingInterval; 123 } __attribute__((packed)); 124 125 using GetCurrentEnergyCounterRequest = GetNumericSensorReadingRequest; 126 127 using GetVoltageRequest = GetNumericSensorReadingRequest; 128 129 struct GetTemperatureReadingResponse 130 { 131 ocp::accelerator_management::CommonResponse hdr; 132 int32_t reading; 133 } __attribute__((packed)); 134 135 struct ReadThermalParametersResponse 136 { 137 ocp::accelerator_management::CommonResponse hdr; 138 int32_t threshold; 139 } __attribute__((packed)); 140 141 struct GetPowerDrawResponse 142 { 143 ocp::accelerator_management::CommonResponse hdr; 144 uint32_t power; 145 } __attribute__((packed)); 146 147 struct GetCurrentEnergyCounterResponse 148 { 149 ocp::accelerator_management::CommonResponse hdr; 150 uint64_t energy; 151 } __attribute__((packed)); 152 153 struct GetVoltageResponse 154 { 155 ocp::accelerator_management::CommonResponse hdr; 156 uint32_t voltage; 157 } __attribute__((packed)); 158 159 struct GetInventoryInformationRequest 160 { 161 ocp::accelerator_management::CommonRequest hdr; 162 uint8_t property_id; 163 } __attribute__((packed)); 164 165 struct GetInventoryInformationResponse 166 { 167 ocp::accelerator_management::CommonResponse hdr; 168 std::array<uint8_t, maxInventoryDataSize> data; 169 } __attribute__((packed)); 170 171 int packHeader(const ocp::accelerator_management::BindingPciVidInfo& hdr, 172 ocp::accelerator_management::BindingPciVid& msg); 173 174 int encodeQueryDeviceIdentificationRequest(uint8_t instanceId, 175 std::span<uint8_t> buf); 176 177 int decodeQueryDeviceIdentificationResponse( 178 std::span<const uint8_t> buf, 179 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode, 180 uint8_t& deviceIdentification, uint8_t& deviceInstance); 181 182 int encodeGetTemperatureReadingRequest(uint8_t instanceId, uint8_t sensorId, 183 std::span<uint8_t> buf); 184 185 int decodeGetTemperatureReadingResponse( 186 std::span<const uint8_t> buf, 187 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode, 188 double& temperatureReading); 189 190 int encodeReadThermalParametersRequest(uint8_t instanceId, uint8_t sensorId, 191 std::span<uint8_t> buf); 192 193 int decodeReadThermalParametersResponse( 194 std::span<const uint8_t> buf, 195 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode, 196 int32_t& threshold); 197 198 int encodeGetPowerDrawRequest( 199 PlatformEnvironmentalCommands commandCode, uint8_t instanceId, 200 uint8_t sensorId, uint8_t averagingInterval, std::span<uint8_t> buf); 201 202 int decodeGetPowerDrawResponse(std::span<const uint8_t> buf, 203 ocp::accelerator_management::CompletionCode& cc, 204 uint16_t& reasonCode, uint32_t& power); 205 206 int encodeGetCurrentEnergyCounterRequest(uint8_t instanceId, uint8_t sensorId, 207 std::span<uint8_t> buf); 208 209 int decodeGetCurrentEnergyCounterResponse( 210 std::span<const uint8_t> buf, 211 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode, 212 uint64_t& energy); 213 214 int encodeGetVoltageRequest(uint8_t instanceId, uint8_t sensorId, 215 std::span<uint8_t> buf); 216 217 int decodeGetVoltageResponse(std::span<const uint8_t> buf, 218 ocp::accelerator_management::CompletionCode& cc, 219 uint16_t& reasonCode, uint32_t& voltage); 220 221 int encodeGetInventoryInformationRequest(uint8_t instanceId, uint8_t propertyId, 222 std::span<uint8_t> buf); 223 224 int decodeGetInventoryInformationResponse( 225 std::span<const uint8_t> buf, 226 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode, 227 InventoryPropertyId propertyId, InventoryValue& value); 228 229 } // namespace gpu 230