1 /*
2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
3 * AFFILIATES. All rights reserved.
4 * SPDX-License-Identifier: Apache-2.0
5 */
6
7 #include "NvidiaGpuMctpVdm.hpp"
8
9 #include "OcpMctpVdm.hpp"
10
11 #include <endian.h>
12
13 #include <cerrno>
14 #include <cstdint>
15 #include <span>
16 #include <vector>
17
18 namespace gpu
19 {
20 // These functions encode/decode data communicated over the network
21 // The use of reinterpret_cast enables direct memory access to raw byte buffers
22 // without doing unnecessary data copying
23 // NOLINTBEGIN(cppcoreguidelines-pro-type-reinterpret-cast)
packHeader(const ocp::accelerator_management::BindingPciVidInfo & hdr,ocp::accelerator_management::BindingPciVid & msg)24 int packHeader(const ocp::accelerator_management::BindingPciVidInfo& hdr,
25 ocp::accelerator_management::BindingPciVid& msg)
26 {
27 return ocp::accelerator_management::packHeader(nvidiaPciVendorId, hdr, msg);
28 }
29
encodeQueryDeviceIdentificationRequest(uint8_t instanceId,const std::span<uint8_t> buf)30 int encodeQueryDeviceIdentificationRequest(uint8_t instanceId,
31 const std::span<uint8_t> buf)
32 {
33 if (buf.size() < sizeof(QueryDeviceIdentificationRequest))
34 {
35 return EINVAL;
36 }
37
38 auto* msg = reinterpret_cast<QueryDeviceIdentificationRequest*>(buf.data());
39
40 ocp::accelerator_management::BindingPciVidInfo header{};
41
42 header.ocp_accelerator_management_msg_type =
43 static_cast<uint8_t>(ocp::accelerator_management::MessageType::REQUEST);
44 header.instance_id = instanceId &
45 ocp::accelerator_management::instanceIdBitMask;
46 header.msg_type =
47 static_cast<uint8_t>(MessageType::DEVICE_CAPABILITY_DISCOVERY);
48
49 auto rc = packHeader(header, msg->hdr.msgHdr.hdr);
50
51 if (rc != 0)
52 {
53 return rc;
54 }
55
56 msg->hdr.command = static_cast<uint8_t>(
57 DeviceCapabilityDiscoveryCommands::QUERY_DEVICE_IDENTIFICATION);
58 msg->hdr.data_size = 0;
59
60 return 0;
61 }
62
decodeQueryDeviceIdentificationResponse(const std::span<const uint8_t> buf,ocp::accelerator_management::CompletionCode & cc,uint16_t & reasonCode,uint8_t & deviceIdentification,uint8_t & deviceInstance)63 int decodeQueryDeviceIdentificationResponse(
64 const std::span<const uint8_t> buf,
65 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
66 uint8_t& deviceIdentification, uint8_t& deviceInstance)
67 {
68 auto rc =
69 ocp::accelerator_management::decodeReasonCodeAndCC(buf, cc, reasonCode);
70
71 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
72 {
73 return rc;
74 }
75
76 if (buf.size() < sizeof(QueryDeviceIdentificationResponse))
77 {
78 return EINVAL;
79 }
80
81 const auto* response =
82 reinterpret_cast<const QueryDeviceIdentificationResponse*>(buf.data());
83
84 deviceIdentification = response->device_identification;
85 deviceInstance = response->instance_id;
86
87 return 0;
88 }
89
encodeGetTemperatureReadingRequest(uint8_t instanceId,uint8_t sensorId,std::span<uint8_t> buf)90 int encodeGetTemperatureReadingRequest(uint8_t instanceId, uint8_t sensorId,
91 std::span<uint8_t> buf)
92 {
93 if (buf.size() < sizeof(GetTemperatureReadingRequest))
94 {
95 return EINVAL;
96 }
97
98 auto* msg = reinterpret_cast<GetTemperatureReadingRequest*>(buf.data());
99
100 ocp::accelerator_management::BindingPciVidInfo header{};
101 header.ocp_accelerator_management_msg_type =
102 static_cast<uint8_t>(ocp::accelerator_management::MessageType::REQUEST);
103 header.instance_id = instanceId &
104 ocp::accelerator_management::instanceIdBitMask;
105 header.msg_type = static_cast<uint8_t>(MessageType::PLATFORM_ENVIRONMENTAL);
106
107 auto rc = packHeader(header, msg->hdr.msgHdr.hdr);
108
109 if (rc != 0)
110 {
111 return rc;
112 }
113
114 msg->hdr.command = static_cast<uint8_t>(
115 PlatformEnvironmentalCommands::GET_TEMPERATURE_READING);
116 msg->hdr.data_size = sizeof(sensorId);
117 msg->sensor_id = sensorId;
118
119 return 0;
120 }
121
decodeGetTemperatureReadingResponse(const std::span<const uint8_t> buf,ocp::accelerator_management::CompletionCode & cc,uint16_t & reasonCode,double & temperatureReading)122 int decodeGetTemperatureReadingResponse(
123 const std::span<const uint8_t> buf,
124 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
125 double& temperatureReading)
126 {
127 auto rc =
128 ocp::accelerator_management::decodeReasonCodeAndCC(buf, cc, reasonCode);
129
130 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
131 {
132 return rc;
133 }
134
135 if (buf.size() < sizeof(GetTemperatureReadingResponse))
136 {
137 return EINVAL;
138 }
139
140 const auto* response =
141 reinterpret_cast<const GetTemperatureReadingResponse*>(buf.data());
142
143 uint16_t dataSize = le16toh(response->hdr.data_size);
144
145 if (dataSize != sizeof(int32_t))
146 {
147 return EINVAL;
148 }
149
150 int32_t reading = le32toh(response->reading);
151 temperatureReading = reading / static_cast<double>(1 << 8);
152
153 return 0;
154 }
155
encodeReadThermalParametersRequest(uint8_t instanceId,uint8_t sensorId,std::span<uint8_t> buf)156 int encodeReadThermalParametersRequest(uint8_t instanceId, uint8_t sensorId,
157 std::span<uint8_t> buf)
158 {
159 if (buf.size() < sizeof(ReadThermalParametersRequest))
160 {
161 return EINVAL;
162 }
163
164 auto* msg = reinterpret_cast<ReadThermalParametersRequest*>(buf.data());
165
166 ocp::accelerator_management::BindingPciVidInfo header{};
167 header.ocp_accelerator_management_msg_type =
168 static_cast<uint8_t>(ocp::accelerator_management::MessageType::REQUEST);
169 header.instance_id = instanceId &
170 ocp::accelerator_management::instanceIdBitMask;
171 header.msg_type = static_cast<uint8_t>(MessageType::PLATFORM_ENVIRONMENTAL);
172
173 auto rc = packHeader(header, msg->hdr.msgHdr.hdr);
174
175 if (rc != 0)
176 {
177 return rc;
178 }
179
180 msg->hdr.command = static_cast<uint8_t>(
181 PlatformEnvironmentalCommands::READ_THERMAL_PARAMETERS);
182 msg->hdr.data_size = sizeof(sensorId);
183 msg->sensor_id = sensorId;
184
185 return 0;
186 }
187
decodeReadThermalParametersResponse(std::span<const uint8_t> buf,ocp::accelerator_management::CompletionCode & cc,uint16_t & reasonCode,int32_t & threshold)188 int decodeReadThermalParametersResponse(
189 std::span<const uint8_t> buf,
190 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
191 int32_t& threshold)
192 {
193 auto rc =
194 ocp::accelerator_management::decodeReasonCodeAndCC(buf, cc, reasonCode);
195
196 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
197 {
198 return rc;
199 }
200
201 if (buf.size() < sizeof(ReadThermalParametersResponse))
202 {
203 return EINVAL;
204 }
205
206 const auto* response =
207 reinterpret_cast<const ReadThermalParametersResponse*>(buf.data());
208
209 uint16_t dataSize = le16toh(response->hdr.data_size);
210
211 if (dataSize != sizeof(int32_t))
212 {
213 return EINVAL;
214 }
215
216 threshold = le32toh(response->threshold);
217
218 return 0;
219 }
220
encodeGetCurrentPowerDrawRequest(uint8_t instanceId,uint8_t sensorId,uint8_t averagingInterval,std::span<uint8_t> buf)221 int encodeGetCurrentPowerDrawRequest(uint8_t instanceId, uint8_t sensorId,
222 uint8_t averagingInterval,
223 std::span<uint8_t> buf)
224 {
225 if (buf.size() < sizeof(GetCurrentPowerDrawRequest))
226 {
227 return EINVAL;
228 }
229
230 auto* msg = reinterpret_cast<GetCurrentPowerDrawRequest*>(buf.data());
231
232 ocp::accelerator_management::BindingPciVidInfo header{};
233 header.ocp_accelerator_management_msg_type =
234 static_cast<uint8_t>(ocp::accelerator_management::MessageType::REQUEST);
235 header.instance_id = instanceId &
236 ocp::accelerator_management::instanceIdBitMask;
237 header.msg_type = static_cast<uint8_t>(MessageType::PLATFORM_ENVIRONMENTAL);
238
239 auto rc = packHeader(header, msg->hdr.msgHdr.hdr);
240
241 if (rc != 0)
242 {
243 return rc;
244 }
245
246 msg->hdr.command = static_cast<uint8_t>(
247 PlatformEnvironmentalCommands::GET_CURRENT_POWER_DRAW);
248 msg->hdr.data_size = sizeof(sensorId) + sizeof(averagingInterval);
249 msg->sensorId = sensorId;
250 msg->averagingInterval = averagingInterval;
251
252 return 0;
253 }
254
decodeGetCurrentPowerDrawResponse(std::span<const uint8_t> buf,ocp::accelerator_management::CompletionCode & cc,uint16_t & reasonCode,uint32_t & power)255 int decodeGetCurrentPowerDrawResponse(
256 std::span<const uint8_t> buf,
257 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
258 uint32_t& power)
259 {
260 auto rc =
261 ocp::accelerator_management::decodeReasonCodeAndCC(buf, cc, reasonCode);
262
263 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
264 {
265 return rc;
266 }
267
268 if (buf.size() < sizeof(GetCurrentPowerDrawResponse))
269 {
270 return EINVAL;
271 }
272
273 const auto* response =
274 reinterpret_cast<const GetCurrentPowerDrawResponse*>(buf.data());
275
276 const uint16_t dataSize = le16toh(response->hdr.data_size);
277
278 if (dataSize != sizeof(uint32_t))
279 {
280 return EINVAL;
281 }
282
283 power = le32toh(response->power);
284
285 return 0;
286 }
287
encodeGetCurrentEnergyCounterRequest(uint8_t instanceId,uint8_t sensorId,std::span<uint8_t> buf)288 int encodeGetCurrentEnergyCounterRequest(uint8_t instanceId, uint8_t sensorId,
289 std::span<uint8_t> buf)
290 {
291 if (buf.size() < sizeof(GetTemperatureReadingRequest))
292 {
293 return EINVAL;
294 }
295
296 auto* msg = reinterpret_cast<GetCurrentEnergyCounterRequest*>(buf.data());
297
298 ocp::accelerator_management::BindingPciVidInfo header{};
299 header.ocp_accelerator_management_msg_type =
300 static_cast<uint8_t>(ocp::accelerator_management::MessageType::REQUEST);
301 header.instance_id = instanceId &
302 ocp::accelerator_management::instanceIdBitMask;
303 header.msg_type = static_cast<uint8_t>(MessageType::PLATFORM_ENVIRONMENTAL);
304
305 auto rc = packHeader(header, msg->hdr.msgHdr.hdr);
306
307 if (rc != 0)
308 {
309 return rc;
310 }
311
312 msg->hdr.command = static_cast<uint8_t>(
313 PlatformEnvironmentalCommands::GET_CURRENT_ENERGY_COUNTER);
314 msg->hdr.data_size = sizeof(sensorId);
315 msg->sensor_id = sensorId;
316
317 return 0;
318 }
319
decodeGetCurrentEnergyCounterResponse(std::span<const uint8_t> buf,ocp::accelerator_management::CompletionCode & cc,uint16_t & reasonCode,uint64_t & energy)320 int decodeGetCurrentEnergyCounterResponse(
321 std::span<const uint8_t> buf,
322 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
323 uint64_t& energy)
324 {
325 auto rc =
326 ocp::accelerator_management::decodeReasonCodeAndCC(buf, cc, reasonCode);
327
328 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
329 {
330 return rc;
331 }
332
333 if (buf.size() < sizeof(GetCurrentPowerDrawResponse))
334 {
335 return EINVAL;
336 }
337
338 const auto* response =
339 reinterpret_cast<const GetCurrentEnergyCounterResponse*>(buf.data());
340
341 const uint16_t dataSize = le16toh(response->hdr.data_size);
342
343 if (dataSize != sizeof(uint64_t))
344 {
345 return EINVAL;
346 }
347
348 energy = le32toh(response->energy);
349
350 return 0;
351 }
352
encodeGetVoltageRequest(uint8_t instanceId,uint8_t sensorId,std::span<uint8_t> buf)353 int encodeGetVoltageRequest(uint8_t instanceId, uint8_t sensorId,
354 std::span<uint8_t> buf)
355 {
356 if (buf.size() < sizeof(GetVoltageRequest))
357 {
358 return EINVAL;
359 }
360
361 auto* msg = reinterpret_cast<GetVoltageRequest*>(buf.data());
362
363 ocp::accelerator_management::BindingPciVidInfo header{};
364 header.ocp_accelerator_management_msg_type =
365 static_cast<uint8_t>(ocp::accelerator_management::MessageType::REQUEST);
366 header.instance_id = instanceId &
367 ocp::accelerator_management::instanceIdBitMask;
368 header.msg_type = static_cast<uint8_t>(MessageType::PLATFORM_ENVIRONMENTAL);
369
370 auto rc = packHeader(header, msg->hdr.msgHdr.hdr);
371
372 if (rc != 0)
373 {
374 return rc;
375 }
376
377 msg->hdr.command =
378 static_cast<uint8_t>(PlatformEnvironmentalCommands::GET_VOLTAGE);
379 msg->hdr.data_size = sizeof(sensorId);
380 msg->sensor_id = sensorId;
381
382 return 0;
383 }
384
decodeGetVoltageResponse(std::span<const uint8_t> buf,ocp::accelerator_management::CompletionCode & cc,uint16_t & reasonCode,uint32_t & voltage)385 int decodeGetVoltageResponse(std::span<const uint8_t> buf,
386 ocp::accelerator_management::CompletionCode& cc,
387 uint16_t& reasonCode, uint32_t& voltage)
388 {
389 auto rc =
390 ocp::accelerator_management::decodeReasonCodeAndCC(buf, cc, reasonCode);
391
392 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
393 {
394 return rc;
395 }
396
397 if (buf.size() < sizeof(GetVoltageResponse))
398 {
399 return EINVAL;
400 }
401
402 const auto* response =
403 reinterpret_cast<const GetVoltageResponse*>(buf.data());
404
405 const uint16_t dataSize = le16toh(response->hdr.data_size);
406
407 if (dataSize != sizeof(uint32_t))
408 {
409 return EINVAL;
410 }
411
412 voltage = le32toh(response->voltage);
413
414 return 0;
415 }
416
encodeGetInventoryInformationRequest(uint8_t instanceId,uint8_t propertyId,std::span<uint8_t> buf)417 int encodeGetInventoryInformationRequest(uint8_t instanceId, uint8_t propertyId,
418 std::span<uint8_t> buf)
419 {
420 if (buf.size() < sizeof(GetInventoryInformationRequest))
421 {
422 return EINVAL;
423 }
424
425 auto* msg = reinterpret_cast<GetInventoryInformationRequest*>(buf.data());
426
427 ocp::accelerator_management::BindingPciVidInfo header{};
428 header.ocp_accelerator_management_msg_type =
429 static_cast<uint8_t>(ocp::accelerator_management::MessageType::REQUEST);
430 header.instance_id = instanceId &
431 ocp::accelerator_management::instanceIdBitMask;
432 header.msg_type = static_cast<uint8_t>(MessageType::PLATFORM_ENVIRONMENTAL);
433
434 auto rc = packHeader(header, msg->hdr.msgHdr.hdr);
435
436 if (rc != 0)
437 {
438 return rc;
439 }
440
441 msg->hdr.command = static_cast<uint8_t>(
442 PlatformEnvironmentalCommands::GET_INVENTORY_INFORMATION);
443 msg->hdr.data_size = sizeof(propertyId);
444 msg->property_id = propertyId;
445
446 return 0;
447 }
448
decodeGetInventoryInformationResponse(std::span<const uint8_t> buf,ocp::accelerator_management::CompletionCode & cc,uint16_t & reasonCode,InventoryPropertyId propertyId,InventoryValue & value)449 int decodeGetInventoryInformationResponse(
450 std::span<const uint8_t> buf,
451 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
452 InventoryPropertyId propertyId, InventoryValue& value)
453 {
454 auto rc =
455 ocp::accelerator_management::decodeReasonCodeAndCC(buf, cc, reasonCode);
456 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
457 {
458 return rc;
459 }
460 // Expect at least one byte of inventory response data after common response
461 if (buf.size() < (sizeof(ocp::accelerator_management::CommonResponse) + 1))
462 {
463 return EINVAL;
464 }
465
466 const auto* response =
467 reinterpret_cast<const GetInventoryInformationResponse*>(buf.data());
468 uint16_t dataSize = le16toh(response->hdr.data_size);
469
470 if (dataSize == 0 || dataSize > maxInventoryDataSize)
471 {
472 return EINVAL;
473 }
474
475 const uint8_t* dataPtr = response->data.data();
476
477 switch (propertyId)
478 {
479 case InventoryPropertyId::BOARD_PART_NUMBER:
480 case InventoryPropertyId::SERIAL_NUMBER:
481 case InventoryPropertyId::MARKETING_NAME:
482 case InventoryPropertyId::DEVICE_PART_NUMBER:
483 value =
484 std::string(reinterpret_cast<const char*>(dataPtr), dataSize);
485 break;
486 case InventoryPropertyId::DEVICE_GUID:
487 value = std::vector<uint8_t>(dataPtr, dataPtr + dataSize);
488 break;
489 default:
490 return EINVAL;
491 }
492 return 0;
493 }
494
495 // NOLINTEND(cppcoreguidelines-pro-type-reinterpret-cast)
496 } // namespace gpu
497