1 /*
2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
3 * AFFILIATES. All rights reserved.
4 * SPDX-License-Identifier: Apache-2.0
5 */
6
7 #include "NvidiaGpuMctpVdm.hpp"
8
9 #include "OcpMctpVdm.hpp"
10
11 #include <endian.h>
12
13 #include <cerrno>
14 #include <cstdint>
15 #include <span>
16 #include <vector>
17
18 namespace gpu
19 {
20 // These functions encode/decode data communicated over the network
21 // The use of reinterpret_cast enables direct memory access to raw byte buffers
22 // without doing unnecessary data copying
23 // NOLINTBEGIN(cppcoreguidelines-pro-type-reinterpret-cast)
packHeader(const ocp::accelerator_management::BindingPciVidInfo & hdr,ocp::accelerator_management::BindingPciVid & msg)24 int packHeader(const ocp::accelerator_management::BindingPciVidInfo& hdr,
25 ocp::accelerator_management::BindingPciVid& msg)
26 {
27 return ocp::accelerator_management::packHeader(nvidiaPciVendorId, hdr, msg);
28 }
29
encodeQueryDeviceIdentificationRequest(uint8_t instanceId,const std::span<uint8_t> buf)30 int encodeQueryDeviceIdentificationRequest(uint8_t instanceId,
31 const std::span<uint8_t> buf)
32 {
33 if (buf.size() < sizeof(QueryDeviceIdentificationRequest))
34 {
35 return EINVAL;
36 }
37
38 auto* msg = reinterpret_cast<QueryDeviceIdentificationRequest*>(buf.data());
39
40 ocp::accelerator_management::BindingPciVidInfo header{};
41
42 header.ocp_accelerator_management_msg_type =
43 static_cast<uint8_t>(ocp::accelerator_management::MessageType::REQUEST);
44 header.instance_id = instanceId &
45 ocp::accelerator_management::instanceIdBitMask;
46 header.msg_type =
47 static_cast<uint8_t>(MessageType::DEVICE_CAPABILITY_DISCOVERY);
48
49 auto rc = packHeader(header, msg->hdr.msgHdr.hdr);
50
51 if (rc != 0)
52 {
53 return rc;
54 }
55
56 msg->hdr.command = static_cast<uint8_t>(
57 DeviceCapabilityDiscoveryCommands::QUERY_DEVICE_IDENTIFICATION);
58 msg->hdr.data_size = 0;
59
60 return 0;
61 }
62
decodeQueryDeviceIdentificationResponse(const std::span<const uint8_t> buf,ocp::accelerator_management::CompletionCode & cc,uint16_t & reasonCode,uint8_t & deviceIdentification,uint8_t & deviceInstance)63 int decodeQueryDeviceIdentificationResponse(
64 const std::span<const uint8_t> buf,
65 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
66 uint8_t& deviceIdentification, uint8_t& deviceInstance)
67 {
68 auto rc =
69 ocp::accelerator_management::decodeReasonCodeAndCC(buf, cc, reasonCode);
70
71 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
72 {
73 return rc;
74 }
75
76 if (buf.size() < sizeof(QueryDeviceIdentificationResponse))
77 {
78 return EINVAL;
79 }
80
81 const auto* response =
82 reinterpret_cast<const QueryDeviceIdentificationResponse*>(buf.data());
83
84 deviceIdentification = response->device_identification;
85 deviceInstance = response->instance_id;
86
87 return 0;
88 }
89
encodeGetTemperatureReadingRequest(uint8_t instanceId,uint8_t sensorId,std::span<uint8_t> buf)90 int encodeGetTemperatureReadingRequest(uint8_t instanceId, uint8_t sensorId,
91 std::span<uint8_t> buf)
92 {
93 if (buf.size() < sizeof(GetTemperatureReadingRequest))
94 {
95 return EINVAL;
96 }
97
98 auto* msg = reinterpret_cast<GetTemperatureReadingRequest*>(buf.data());
99
100 ocp::accelerator_management::BindingPciVidInfo header{};
101 header.ocp_accelerator_management_msg_type =
102 static_cast<uint8_t>(ocp::accelerator_management::MessageType::REQUEST);
103 header.instance_id = instanceId &
104 ocp::accelerator_management::instanceIdBitMask;
105 header.msg_type = static_cast<uint8_t>(MessageType::PLATFORM_ENVIRONMENTAL);
106
107 auto rc = packHeader(header, msg->hdr.msgHdr.hdr);
108
109 if (rc != 0)
110 {
111 return rc;
112 }
113
114 msg->hdr.command = static_cast<uint8_t>(
115 PlatformEnvironmentalCommands::GET_TEMPERATURE_READING);
116 msg->hdr.data_size = sizeof(sensorId);
117 msg->sensor_id = sensorId;
118
119 return 0;
120 }
121
decodeGetTemperatureReadingResponse(const std::span<const uint8_t> buf,ocp::accelerator_management::CompletionCode & cc,uint16_t & reasonCode,double & temperatureReading)122 int decodeGetTemperatureReadingResponse(
123 const std::span<const uint8_t> buf,
124 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
125 double& temperatureReading)
126 {
127 auto rc =
128 ocp::accelerator_management::decodeReasonCodeAndCC(buf, cc, reasonCode);
129
130 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
131 {
132 return rc;
133 }
134
135 if (buf.size() < sizeof(GetTemperatureReadingResponse))
136 {
137 return EINVAL;
138 }
139
140 const auto* response =
141 reinterpret_cast<const GetTemperatureReadingResponse*>(buf.data());
142
143 uint16_t dataSize = le16toh(response->hdr.data_size);
144
145 if (dataSize != sizeof(int32_t))
146 {
147 return EINVAL;
148 }
149
150 int32_t reading = le32toh(response->reading);
151 temperatureReading = reading / static_cast<double>(1 << 8);
152
153 return 0;
154 }
155
encodeReadThermalParametersRequest(uint8_t instanceId,uint8_t sensorId,std::span<uint8_t> buf)156 int encodeReadThermalParametersRequest(uint8_t instanceId, uint8_t sensorId,
157 std::span<uint8_t> buf)
158 {
159 if (buf.size() < sizeof(ReadThermalParametersRequest))
160 {
161 return EINVAL;
162 }
163
164 auto* msg = reinterpret_cast<ReadThermalParametersRequest*>(buf.data());
165
166 ocp::accelerator_management::BindingPciVidInfo header{};
167 header.ocp_accelerator_management_msg_type =
168 static_cast<uint8_t>(ocp::accelerator_management::MessageType::REQUEST);
169 header.instance_id = instanceId &
170 ocp::accelerator_management::instanceIdBitMask;
171 header.msg_type = static_cast<uint8_t>(MessageType::PLATFORM_ENVIRONMENTAL);
172
173 auto rc = packHeader(header, msg->hdr.msgHdr.hdr);
174
175 if (rc != 0)
176 {
177 return rc;
178 }
179
180 msg->hdr.command = static_cast<uint8_t>(
181 PlatformEnvironmentalCommands::READ_THERMAL_PARAMETERS);
182 msg->hdr.data_size = sizeof(sensorId);
183 msg->sensor_id = sensorId;
184
185 return 0;
186 }
187
decodeReadThermalParametersResponse(std::span<const uint8_t> buf,ocp::accelerator_management::CompletionCode & cc,uint16_t & reasonCode,int32_t & threshold)188 int decodeReadThermalParametersResponse(
189 std::span<const uint8_t> buf,
190 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
191 int32_t& threshold)
192 {
193 auto rc =
194 ocp::accelerator_management::decodeReasonCodeAndCC(buf, cc, reasonCode);
195
196 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
197 {
198 return rc;
199 }
200
201 if (buf.size() < sizeof(ReadThermalParametersResponse))
202 {
203 return EINVAL;
204 }
205
206 const auto* response =
207 reinterpret_cast<const ReadThermalParametersResponse*>(buf.data());
208
209 uint16_t dataSize = le16toh(response->hdr.data_size);
210
211 if (dataSize != sizeof(int32_t))
212 {
213 return EINVAL;
214 }
215
216 threshold = le32toh(response->threshold);
217
218 return 0;
219 }
220
encodeGetPowerDrawRequest(PlatformEnvironmentalCommands commandCode,uint8_t instanceId,uint8_t sensorId,uint8_t averagingInterval,std::span<uint8_t> buf)221 int encodeGetPowerDrawRequest(PlatformEnvironmentalCommands commandCode,
222 uint8_t instanceId, uint8_t sensorId,
223 uint8_t averagingInterval, std::span<uint8_t> buf)
224 {
225 if (buf.size() < sizeof(GetPowerDrawRequest))
226 {
227 return EINVAL;
228 }
229
230 auto* msg = reinterpret_cast<GetPowerDrawRequest*>(buf.data());
231
232 ocp::accelerator_management::BindingPciVidInfo header{};
233 header.ocp_accelerator_management_msg_type =
234 static_cast<uint8_t>(ocp::accelerator_management::MessageType::REQUEST);
235 header.instance_id = instanceId &
236 ocp::accelerator_management::instanceIdBitMask;
237 header.msg_type = static_cast<uint8_t>(MessageType::PLATFORM_ENVIRONMENTAL);
238
239 auto rc = packHeader(header, msg->hdr.msgHdr.hdr);
240
241 if (rc != 0)
242 {
243 return rc;
244 }
245
246 msg->hdr.command = static_cast<uint8_t>(commandCode);
247 msg->hdr.data_size = sizeof(sensorId) + sizeof(averagingInterval);
248 msg->sensorId = sensorId;
249 msg->averagingInterval = averagingInterval;
250
251 return 0;
252 }
253
decodeGetPowerDrawResponse(std::span<const uint8_t> buf,ocp::accelerator_management::CompletionCode & cc,uint16_t & reasonCode,uint32_t & power)254 int decodeGetPowerDrawResponse(std::span<const uint8_t> buf,
255 ocp::accelerator_management::CompletionCode& cc,
256 uint16_t& reasonCode, uint32_t& power)
257 {
258 auto rc =
259 ocp::accelerator_management::decodeReasonCodeAndCC(buf, cc, reasonCode);
260
261 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
262 {
263 return rc;
264 }
265
266 if (buf.size() < sizeof(GetPowerDrawResponse))
267 {
268 return EINVAL;
269 }
270
271 const auto* response =
272 reinterpret_cast<const GetPowerDrawResponse*>(buf.data());
273
274 const uint16_t dataSize = le16toh(response->hdr.data_size);
275
276 if (dataSize != sizeof(uint32_t))
277 {
278 return EINVAL;
279 }
280
281 power = le32toh(response->power);
282
283 return 0;
284 }
285
encodeGetCurrentEnergyCounterRequest(uint8_t instanceId,uint8_t sensorId,std::span<uint8_t> buf)286 int encodeGetCurrentEnergyCounterRequest(uint8_t instanceId, uint8_t sensorId,
287 std::span<uint8_t> buf)
288 {
289 if (buf.size() < sizeof(GetTemperatureReadingRequest))
290 {
291 return EINVAL;
292 }
293
294 auto* msg = reinterpret_cast<GetCurrentEnergyCounterRequest*>(buf.data());
295
296 ocp::accelerator_management::BindingPciVidInfo header{};
297 header.ocp_accelerator_management_msg_type =
298 static_cast<uint8_t>(ocp::accelerator_management::MessageType::REQUEST);
299 header.instance_id = instanceId &
300 ocp::accelerator_management::instanceIdBitMask;
301 header.msg_type = static_cast<uint8_t>(MessageType::PLATFORM_ENVIRONMENTAL);
302
303 auto rc = packHeader(header, msg->hdr.msgHdr.hdr);
304
305 if (rc != 0)
306 {
307 return rc;
308 }
309
310 msg->hdr.command = static_cast<uint8_t>(
311 PlatformEnvironmentalCommands::GET_CURRENT_ENERGY_COUNTER);
312 msg->hdr.data_size = sizeof(sensorId);
313 msg->sensor_id = sensorId;
314
315 return 0;
316 }
317
decodeGetCurrentEnergyCounterResponse(std::span<const uint8_t> buf,ocp::accelerator_management::CompletionCode & cc,uint16_t & reasonCode,uint64_t & energy)318 int decodeGetCurrentEnergyCounterResponse(
319 std::span<const uint8_t> buf,
320 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
321 uint64_t& energy)
322 {
323 auto rc =
324 ocp::accelerator_management::decodeReasonCodeAndCC(buf, cc, reasonCode);
325
326 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
327 {
328 return rc;
329 }
330
331 if (buf.size() < sizeof(GetPowerDrawResponse))
332 {
333 return EINVAL;
334 }
335
336 const auto* response =
337 reinterpret_cast<const GetCurrentEnergyCounterResponse*>(buf.data());
338
339 const uint16_t dataSize = le16toh(response->hdr.data_size);
340
341 if (dataSize != sizeof(uint64_t))
342 {
343 return EINVAL;
344 }
345
346 energy = le32toh(response->energy);
347
348 return 0;
349 }
350
encodeGetVoltageRequest(uint8_t instanceId,uint8_t sensorId,std::span<uint8_t> buf)351 int encodeGetVoltageRequest(uint8_t instanceId, uint8_t sensorId,
352 std::span<uint8_t> buf)
353 {
354 if (buf.size() < sizeof(GetVoltageRequest))
355 {
356 return EINVAL;
357 }
358
359 auto* msg = reinterpret_cast<GetVoltageRequest*>(buf.data());
360
361 ocp::accelerator_management::BindingPciVidInfo header{};
362 header.ocp_accelerator_management_msg_type =
363 static_cast<uint8_t>(ocp::accelerator_management::MessageType::REQUEST);
364 header.instance_id = instanceId &
365 ocp::accelerator_management::instanceIdBitMask;
366 header.msg_type = static_cast<uint8_t>(MessageType::PLATFORM_ENVIRONMENTAL);
367
368 auto rc = packHeader(header, msg->hdr.msgHdr.hdr);
369
370 if (rc != 0)
371 {
372 return rc;
373 }
374
375 msg->hdr.command =
376 static_cast<uint8_t>(PlatformEnvironmentalCommands::GET_VOLTAGE);
377 msg->hdr.data_size = sizeof(sensorId);
378 msg->sensor_id = sensorId;
379
380 return 0;
381 }
382
decodeGetVoltageResponse(std::span<const uint8_t> buf,ocp::accelerator_management::CompletionCode & cc,uint16_t & reasonCode,uint32_t & voltage)383 int decodeGetVoltageResponse(std::span<const uint8_t> buf,
384 ocp::accelerator_management::CompletionCode& cc,
385 uint16_t& reasonCode, uint32_t& voltage)
386 {
387 auto rc =
388 ocp::accelerator_management::decodeReasonCodeAndCC(buf, cc, reasonCode);
389
390 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
391 {
392 return rc;
393 }
394
395 if (buf.size() < sizeof(GetVoltageResponse))
396 {
397 return EINVAL;
398 }
399
400 const auto* response =
401 reinterpret_cast<const GetVoltageResponse*>(buf.data());
402
403 const uint16_t dataSize = le16toh(response->hdr.data_size);
404
405 if (dataSize != sizeof(uint32_t))
406 {
407 return EINVAL;
408 }
409
410 voltage = le32toh(response->voltage);
411
412 return 0;
413 }
414
encodeGetInventoryInformationRequest(uint8_t instanceId,uint8_t propertyId,std::span<uint8_t> buf)415 int encodeGetInventoryInformationRequest(uint8_t instanceId, uint8_t propertyId,
416 std::span<uint8_t> buf)
417 {
418 if (buf.size() < sizeof(GetInventoryInformationRequest))
419 {
420 return EINVAL;
421 }
422
423 auto* msg = reinterpret_cast<GetInventoryInformationRequest*>(buf.data());
424
425 ocp::accelerator_management::BindingPciVidInfo header{};
426 header.ocp_accelerator_management_msg_type =
427 static_cast<uint8_t>(ocp::accelerator_management::MessageType::REQUEST);
428 header.instance_id = instanceId &
429 ocp::accelerator_management::instanceIdBitMask;
430 header.msg_type = static_cast<uint8_t>(MessageType::PLATFORM_ENVIRONMENTAL);
431
432 auto rc = packHeader(header, msg->hdr.msgHdr.hdr);
433
434 if (rc != 0)
435 {
436 return rc;
437 }
438
439 msg->hdr.command = static_cast<uint8_t>(
440 PlatformEnvironmentalCommands::GET_INVENTORY_INFORMATION);
441 msg->hdr.data_size = sizeof(propertyId);
442 msg->property_id = propertyId;
443
444 return 0;
445 }
446
decodeGetInventoryInformationResponse(std::span<const uint8_t> buf,ocp::accelerator_management::CompletionCode & cc,uint16_t & reasonCode,InventoryPropertyId propertyId,InventoryValue & value)447 int decodeGetInventoryInformationResponse(
448 std::span<const uint8_t> buf,
449 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
450 InventoryPropertyId propertyId, InventoryValue& value)
451 {
452 auto rc =
453 ocp::accelerator_management::decodeReasonCodeAndCC(buf, cc, reasonCode);
454 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
455 {
456 return rc;
457 }
458 // Expect at least one byte of inventory response data after common response
459 if (buf.size() < (sizeof(ocp::accelerator_management::CommonResponse) + 1))
460 {
461 return EINVAL;
462 }
463
464 const auto* response =
465 reinterpret_cast<const GetInventoryInformationResponse*>(buf.data());
466 uint16_t dataSize = le16toh(response->hdr.data_size);
467
468 if (dataSize == 0 || dataSize > maxInventoryDataSize)
469 {
470 return EINVAL;
471 }
472
473 const uint8_t* dataPtr = response->data.data();
474
475 switch (propertyId)
476 {
477 case InventoryPropertyId::BOARD_PART_NUMBER:
478 case InventoryPropertyId::SERIAL_NUMBER:
479 case InventoryPropertyId::MARKETING_NAME:
480 case InventoryPropertyId::DEVICE_PART_NUMBER:
481 value =
482 std::string(reinterpret_cast<const char*>(dataPtr), dataSize);
483 break;
484 case InventoryPropertyId::DEVICE_GUID:
485 value = std::vector<uint8_t>(dataPtr, dataPtr + dataSize);
486 break;
487 default:
488 return EINVAL;
489 }
490 return 0;
491 }
492
493 // NOLINTEND(cppcoreguidelines-pro-type-reinterpret-cast)
494 } // namespace gpu
495