14ecdfaaaSHarshit Aghera /* 24ecdfaaaSHarshit Aghera * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & 34ecdfaaaSHarshit Aghera * AFFILIATES. All rights reserved. 44ecdfaaaSHarshit Aghera * SPDX-License-Identifier: Apache-2.0 54ecdfaaaSHarshit Aghera */ 64ecdfaaaSHarshit Aghera 74ecdfaaaSHarshit Aghera #include "NvidiaGpuDevice.hpp" 84ecdfaaaSHarshit Aghera 94ecdfaaaSHarshit Aghera #include "NvidiaDeviceDiscovery.hpp" 104ecdfaaaSHarshit Aghera #include "NvidiaGpuSensor.hpp" 114ecdfaaaSHarshit Aghera #include "Thresholds.hpp" 124ecdfaaaSHarshit Aghera #include "Utils.hpp" 134ecdfaaaSHarshit Aghera 144ecdfaaaSHarshit Aghera #include <bits/basic_string.h> 154ecdfaaaSHarshit Aghera 164ecdfaaaSHarshit Aghera #include <MctpRequester.hpp> 17*5e7deccdSHarshit Aghera #include <NvidiaGpuThresholds.hpp> 184ecdfaaaSHarshit Aghera #include <boost/asio/io_context.hpp> 194ecdfaaaSHarshit Aghera #include <phosphor-logging/lg2.hpp> 204ecdfaaaSHarshit Aghera #include <sdbusplus/asio/connection.hpp> 214ecdfaaaSHarshit Aghera #include <sdbusplus/asio/object_server.hpp> 224ecdfaaaSHarshit Aghera 234ecdfaaaSHarshit Aghera #include <chrono> 244ecdfaaaSHarshit Aghera #include <cstdint> 25*5e7deccdSHarshit Aghera #include <functional> 264ecdfaaaSHarshit Aghera #include <memory> 274ecdfaaaSHarshit Aghera #include <string> 28*5e7deccdSHarshit Aghera #include <utility> 294ecdfaaaSHarshit Aghera #include <vector> 304ecdfaaaSHarshit Aghera 314ecdfaaaSHarshit Aghera GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name, 324ecdfaaaSHarshit Aghera const std::string& path, 334ecdfaaaSHarshit Aghera const std::shared_ptr<sdbusplus::asio::connection>& conn, 344ecdfaaaSHarshit Aghera uint8_t eid, boost::asio::io_context& io, 354ecdfaaaSHarshit Aghera mctp::MctpRequester& mctpRequester, 364ecdfaaaSHarshit Aghera sdbusplus::asio::object_server& objectServer) : 374ecdfaaaSHarshit Aghera eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}), 384ecdfaaaSHarshit Aghera waitTimer(io, std::chrono::steady_clock::duration(0)), 394ecdfaaaSHarshit Aghera mctpRequester(mctpRequester), conn(conn), objectServer(objectServer), 404ecdfaaaSHarshit Aghera configs(configs), name(escapeName(name)), path(path) 414ecdfaaaSHarshit Aghera { 424ecdfaaaSHarshit Aghera makeSensors(); 434ecdfaaaSHarshit Aghera } 444ecdfaaaSHarshit Aghera 454ecdfaaaSHarshit Aghera void GpuDevice::makeSensors() 464ecdfaaaSHarshit Aghera { 474ecdfaaaSHarshit Aghera tempSensor = std::make_shared<NvidiaGpuTempSensor>( 48ba138daeSHarshit Aghera conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId, 49ba138daeSHarshit Aghera objectServer, std::vector<thresholds::Threshold>{}); 50ba138daeSHarshit Aghera 51*5e7deccdSHarshit Aghera readThermalParameters( 52*5e7deccdSHarshit Aghera eid, 53*5e7deccdSHarshit Aghera std::vector<gpuThresholdId>{gpuTLimitWarnringThresholdId, 54*5e7deccdSHarshit Aghera gpuTLimitCriticalThresholdId, 55*5e7deccdSHarshit Aghera gpuTLimitHardshutDownThresholdId}, 56*5e7deccdSHarshit Aghera mctpRequester, 57*5e7deccdSHarshit Aghera std::bind_front(&GpuDevice::processTLimitThresholds, this)); 584ecdfaaaSHarshit Aghera 594ecdfaaaSHarshit Aghera lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME", 604ecdfaaaSHarshit Aghera name, "PATH", path); 614ecdfaaaSHarshit Aghera 624ecdfaaaSHarshit Aghera read(); 634ecdfaaaSHarshit Aghera } 644ecdfaaaSHarshit Aghera 65*5e7deccdSHarshit Aghera void GpuDevice::processTLimitThresholds(uint8_t rc, 66*5e7deccdSHarshit Aghera const std::vector<int32_t>& thresholds) 67*5e7deccdSHarshit Aghera { 68*5e7deccdSHarshit Aghera std::vector<thresholds::Threshold> tLimitThresholds{}; 69*5e7deccdSHarshit Aghera if (rc == 0) 70*5e7deccdSHarshit Aghera { 71*5e7deccdSHarshit Aghera tLimitThresholds = { 72*5e7deccdSHarshit Aghera thresholds::Threshold{thresholds::Level::WARNING, 73*5e7deccdSHarshit Aghera thresholds::Direction::LOW, 74*5e7deccdSHarshit Aghera static_cast<double>(thresholds[0])}, 75*5e7deccdSHarshit Aghera thresholds::Threshold{thresholds::Level::CRITICAL, 76*5e7deccdSHarshit Aghera thresholds::Direction::LOW, 77*5e7deccdSHarshit Aghera static_cast<double>(thresholds[1])}, 78*5e7deccdSHarshit Aghera thresholds::Threshold{thresholds::Level::HARDSHUTDOWN, 79*5e7deccdSHarshit Aghera thresholds::Direction::LOW, 80*5e7deccdSHarshit Aghera static_cast<double>(thresholds[2])}}; 81*5e7deccdSHarshit Aghera } 82*5e7deccdSHarshit Aghera 83*5e7deccdSHarshit Aghera tLimitSensor = std::make_shared<NvidiaGpuTempSensor>( 84*5e7deccdSHarshit Aghera conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId, 85*5e7deccdSHarshit Aghera objectServer, std::move(tLimitThresholds)); 86*5e7deccdSHarshit Aghera } 87*5e7deccdSHarshit Aghera 884ecdfaaaSHarshit Aghera void GpuDevice::read() 894ecdfaaaSHarshit Aghera { 904ecdfaaaSHarshit Aghera tempSensor->update(); 91*5e7deccdSHarshit Aghera if (tLimitSensor) 92*5e7deccdSHarshit Aghera { 93ba138daeSHarshit Aghera tLimitSensor->update(); 94*5e7deccdSHarshit Aghera } 954ecdfaaaSHarshit Aghera 964ecdfaaaSHarshit Aghera waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs)); 974ecdfaaaSHarshit Aghera waitTimer.async_wait([this](const boost::system::error_code& ec) { 984ecdfaaaSHarshit Aghera if (ec) 994ecdfaaaSHarshit Aghera { 1004ecdfaaaSHarshit Aghera return; 1014ecdfaaaSHarshit Aghera } 1024ecdfaaaSHarshit Aghera read(); 1034ecdfaaaSHarshit Aghera }); 1044ecdfaaaSHarshit Aghera } 105