1 /* 2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & 3 * AFFILIATES. All rights reserved. 4 * SPDX-License-Identifier: Apache-2.0 5 */ 6 7 #include "NvidiaGpuDevice.hpp" 8 9 #include "Inventory.hpp" 10 #include "NvidiaDeviceDiscovery.hpp" 11 #include "NvidiaGpuSensor.hpp" 12 #include "Thresholds.hpp" 13 #include "Utils.hpp" 14 15 #include <bits/basic_string.h> 16 17 #include <MctpRequester.hpp> 18 #include <NvidiaGpuEnergySensor.hpp> 19 #include <NvidiaGpuMctpVdm.hpp> 20 #include <NvidiaGpuPowerSensor.hpp> 21 #include <NvidiaGpuThresholds.hpp> 22 #include <NvidiaGpuVoltageSensor.hpp> 23 #include <boost/asio/io_context.hpp> 24 #include <phosphor-logging/lg2.hpp> 25 #include <sdbusplus/asio/connection.hpp> 26 #include <sdbusplus/asio/object_server.hpp> 27 28 #include <chrono> 29 #include <cstdint> 30 #include <functional> 31 #include <memory> 32 #include <string> 33 #include <utility> 34 #include <vector> 35 36 GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name, 37 const std::string& path, 38 const std::shared_ptr<sdbusplus::asio::connection>& conn, 39 uint8_t eid, boost::asio::io_context& io, 40 mctp::MctpRequester& mctpRequester, 41 sdbusplus::asio::object_server& objectServer) : 42 eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}), 43 waitTimer(io, std::chrono::steady_clock::duration(0)), 44 mctpRequester(mctpRequester), conn(conn), objectServer(objectServer), 45 configs(configs), name(escapeName(name)), path(path) 46 { 47 inventory = std::make_shared<Inventory>( 48 conn, objectServer, name, mctpRequester, 49 gpu::DeviceIdentification::DEVICE_GPU, eid, io); 50 makeSensors(); 51 } 52 53 void GpuDevice::makeSensors() 54 { 55 tempSensor = std::make_shared<NvidiaGpuTempSensor>( 56 conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId, 57 objectServer, std::vector<thresholds::Threshold>{}); 58 59 readThermalParameters( 60 eid, 61 std::vector<gpuThresholdId>{gpuTLimitWarnringThresholdId, 62 gpuTLimitCriticalThresholdId, 63 gpuTLimitHardshutDownThresholdId}, 64 mctpRequester, 65 std::bind_front(&GpuDevice::processTLimitThresholds, this)); 66 67 dramTempSensor = std::make_shared<NvidiaGpuTempSensor>( 68 conn, mctpRequester, name + "_DRAM_0_TEMP_0", path, eid, 69 gpuDramTempSensorId, objectServer, 70 std::vector<thresholds::Threshold>{thresholds::Threshold{ 71 thresholds::Level::CRITICAL, thresholds::Direction::HIGH, 95.0}}); 72 73 powerSensor = std::make_shared<NvidiaGpuPowerSensor>( 74 conn, mctpRequester, name + "_Power_0", path, eid, gpuPowerSensorId, 75 objectServer, std::vector<thresholds::Threshold>{}); 76 77 energySensor = std::make_shared<NvidiaGpuEnergySensor>( 78 conn, mctpRequester, name + "_Energy_0", path, eid, gpuEnergySensorId, 79 objectServer, std::vector<thresholds::Threshold>{}); 80 81 voltageSensor = std::make_shared<NvidiaGpuVoltageSensor>( 82 conn, mctpRequester, name + "_Voltage_0", path, eid, gpuVoltageSensorId, 83 objectServer, std::vector<thresholds::Threshold>{}); 84 85 lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME", 86 name, "PATH", path); 87 88 read(); 89 } 90 91 void GpuDevice::processTLimitThresholds(uint8_t rc, 92 const std::vector<int32_t>& thresholds) 93 { 94 std::vector<thresholds::Threshold> tLimitThresholds{}; 95 if (rc == 0) 96 { 97 tLimitThresholds = { 98 thresholds::Threshold{thresholds::Level::WARNING, 99 thresholds::Direction::LOW, 100 static_cast<double>(thresholds[0])}, 101 thresholds::Threshold{thresholds::Level::CRITICAL, 102 thresholds::Direction::LOW, 103 static_cast<double>(thresholds[1])}, 104 thresholds::Threshold{thresholds::Level::HARDSHUTDOWN, 105 thresholds::Direction::LOW, 106 static_cast<double>(thresholds[2])}}; 107 } 108 109 tLimitSensor = std::make_shared<NvidiaGpuTempSensor>( 110 conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId, 111 objectServer, std::move(tLimitThresholds)); 112 } 113 114 void GpuDevice::read() 115 { 116 tempSensor->update(); 117 if (tLimitSensor) 118 { 119 tLimitSensor->update(); 120 } 121 dramTempSensor->update(); 122 powerSensor->update(); 123 energySensor->update(); 124 voltageSensor->update(); 125 126 waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs)); 127 waitTimer.async_wait([this](const boost::system::error_code& ec) { 128 if (ec) 129 { 130 return; 131 } 132 read(); 133 }); 134 } 135