/*
 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
 * AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 */

#include "NvidiaGpuDevice.hpp"

#include "Inventory.hpp"
#include "NvidiaDeviceDiscovery.hpp"
#include "NvidiaGpuSensor.hpp"
#include "Thresholds.hpp"
#include "Utils.hpp"

// NOTE(review): in this copy of the file the include directives below have no
// header name, and several template argument lists further down (on
// std::shared_ptr, std::make_shared, std::vector and static_cast) are absent.
// Presumably everything between angle brackets was lost in transit; restore
// it from the authoritative copy before building. The tokens are kept exactly
// as found rather than guessed.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/**
 * @brief Construct a GPU device, create its inventory object and its sensors.
 *
 * @param configs        Sensor configuration; configs.pollRate is converted to
 *                       milliseconds and used as the polling period.
 * @param name           Device name. The member copy is passed through
 *                       escapeName().
 * @param path           Path handed to every sensor (logged as the chassis
 *                       path).
 * @param conn           D-Bus connection shared with the sensors.
 * @param eid            Endpoint ID used for every request to this device.
 * @param io             I/O context that drives the polling timer.
 * @param mctpRequester  Requester used by the inventory and the sensors.
 * @param objectServer   Object server the sensors are created with.
 */
GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
                     const std::string& path,
                     const std::shared_ptr& conn, uint8_t eid,
                     boost::asio::io_context& io,
                     mctp::MctpRequester& mctpRequester,
                     sdbusplus::asio::object_server& objectServer) :
    eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
    waitTimer(io, std::chrono::steady_clock::duration(0)),
    mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
    configs(configs), name(escapeName(name)), path(path)
{
    // NOTE(review): inside this body `name` is the constructor parameter (the
    // unescaped string), not the escaped member that makeSensors() uses.
    // Confirm that the inventory is meant to receive the unescaped name.
    inventory = std::make_shared(
        conn, objectServer, name, mctpRequester,
        gpu::DeviceIdentification::DEVICE_GPU, eid, io);

    makeSensors();
}

/**
 * @brief Create the GPU sensors and start the polling loop.
 *
 * Creates the temperature, DRAM temperature, power, energy and voltage
 * sensors, requests the three TLimit thresholds, and finally calls read().
 * The TLimit sensor itself is created later, in processTLimitThresholds().
 */
void GpuDevice::makeSensors()
{
    tempSensor = std::make_shared(
        conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
        objectServer, std::vector{});

    // The reply is delivered to processTLimitThresholds(). The callback binds
    // the raw `this` pointer.
    // NOTE(review): this presumably relies on the device outliving any
    // outstanding request - confirm against readThermalParameters().
    readThermalParameters(
        eid,
        std::vector{gpuTLimitWarnringThresholdId,
                    gpuTLimitCriticalThresholdId,
                    gpuTLimitHardshutDownThresholdId},
        mctpRequester,
        std::bind_front(&GpuDevice::processTLimitThresholds, this));

    // The DRAM temperature sensor is the only one created with a fixed
    // threshold: CRITICAL / HIGH at 95.0.
    dramTempSensor = std::make_shared(
        conn, mctpRequester, name + "_DRAM_0_TEMP_0", path, eid,
        gpuDramTempSensorId, objectServer,
        std::vector{thresholds::Threshold{
            thresholds::Level::CRITICAL, thresholds::Direction::HIGH, 95.0}});

    powerSensor = std::make_shared(
        conn, mctpRequester, name + "_Power_0", path, eid, gpuPowerSensorId,
        objectServer, std::vector{});

    energySensor = std::make_shared(
        conn, mctpRequester, name + "_Energy_0", path, eid, gpuEnergySensorId,
        objectServer, std::vector{});

    voltageSensor = std::make_shared(
        conn, mctpRequester, name + "_Voltage_0", path, eid,
        gpuVoltageSensorId, objectServer, std::vector{});

    lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
              name, "PATH", path);

    read();
}

/**
 * @brief Create the TLimit sensor from the result of the threshold query.
 *
 * The sensor is always created. It gets the WARNING, CRITICAL and
 * HARDSHUTDOWN thresholds (all with Direction::LOW) only when the query
 * succeeded and returned at least three values; otherwise it is created with
 * an empty threshold list.
 *
 * @param rc          Completion code of the query; 0 is treated as success.
 * @param thresholds  Returned values, read in the order warning, critical,
 *                    hard shutdown (the order in which makeSensors() requests
 *                    them).
 */
void GpuDevice::processTLimitThresholds(uint8_t rc,
                                        const std::vector& thresholds)
{
    std::vector tLimitThresholds{};

    if (rc == 0)
    {
        // Guard the fixed indices below: operator[] is unchecked, so a reply
        // with fewer than three values would otherwise read out of bounds.
        if (thresholds.size() >= 3)
        {
            tLimitThresholds = {
                thresholds::Threshold{thresholds::Level::WARNING,
                                      thresholds::Direction::LOW,
                                      static_cast(thresholds[0])},
                thresholds::Threshold{thresholds::Level::CRITICAL,
                                      thresholds::Direction::LOW,
                                      static_cast(thresholds[1])},
                thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
                                      thresholds::Direction::LOW,
                                      static_cast(thresholds[2])}};
        }
        else
        {
            lg2::warning(
                "GPU {NAME}: expected 3 TLimit thresholds but received {COUNT}; creating TLimit sensor without thresholds",
                "NAME", name, "COUNT", thresholds.size());
        }
    }

    tLimitSensor = std::make_shared(
        conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId,
        objectServer, std::move(tLimitThresholds));
}

/**
 * @brief Update every sensor once and schedule the next polling cycle.
 *
 * The TLimit sensor is skipped until processTLimitThresholds() has created
 * it. After the updates the timer is re-armed for sensorPollMs; when it
 * completes without error, read() runs again.
 */
void GpuDevice::read()
{
    tempSensor->update();
    if (tLimitSensor)
    {
        tLimitSensor->update();
    }
    dramTempSensor->update();
    powerSensor->update();
    energySensor->update();
    voltageSensor->update();

    waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
    waitTimer.async_wait([this](const boost::system::error_code& ec) {
        if (ec)
        {
            // Any error on the wait ends the polling loop; nothing is
            // rescheduled.
            return;
        }
        read();
    });
}