xref: /openbmc/dbus-sensors/src/nvidia-gpu/NvidiaGpuPowerSensor.cpp (revision 4c0a0b452cfd9afe56cf3ac1bfa824dfba9ca166)
1 /*
2  * SPDX-FileCopyrightText: Copyright OpenBMC Authors
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #include "NvidiaGpuPowerSensor.hpp"
7 
8 #include "MctpRequester.hpp"
9 #include "NvidiaSensorUtils.hpp"
10 #include "SensorPaths.hpp"
11 #include "Thresholds.hpp"
12 #include "Utils.hpp"
13 #include "sensor.hpp"
14 
15 #include <bits/basic_string.h>
16 
17 #include <NvidiaDeviceDiscovery.hpp>
18 #include <NvidiaGpuMctpVdm.hpp>
19 #include <OcpMctpVdm.hpp>
20 #include <phosphor-logging/lg2.hpp>
21 #include <sdbusplus/asio/connection.hpp>
22 #include <sdbusplus/asio/object_server.hpp>
23 
24 #include <cstddef>
25 #include <cstdint>
26 #include <functional>
27 #include <limits>
28 #include <memory>
29 #include <optional>
30 #include <span>
31 #include <string>
32 #include <system_error>
33 #include <utility>
34 #include <vector>
35 
36 using namespace std::literals;
37 
// Reading bounds handed to the Sensor base-class constructor below. The
// sensor publishes its value in watts, so both bounds are expressed in watts.
static constexpr double gpuPowerSensorMaxReading = 5000.0;
static constexpr double gpuPowerSensorMinReading =
    static_cast<double>(std::numeric_limits<uint32_t>::min());
41 
NvidiaGpuPowerSensor(std::shared_ptr<sdbusplus::asio::connection> & conn,mctp::MctpRequester & mctpRequester,const std::string & name,const std::string & sensorConfiguration,uint8_t eid,uint8_t sensorId,sdbusplus::asio::object_server & objectServer,std::vector<thresholds::Threshold> && thresholdData,const gpu::DeviceIdentification deviceType)42 NvidiaGpuPowerSensor::NvidiaGpuPowerSensor(
43     std::shared_ptr<sdbusplus::asio::connection>& conn,
44     mctp::MctpRequester& mctpRequester, const std::string& name,
45     const std::string& sensorConfiguration, uint8_t eid, uint8_t sensorId,
46     sdbusplus::asio::object_server& objectServer,
47     std::vector<thresholds::Threshold>&& thresholdData,
48     const gpu::DeviceIdentification deviceType) :
49     Sensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
50            "power", false, true, gpuPowerSensorMaxReading,
51            gpuPowerSensorMinReading, conn),
52     eid(eid), sensorId{sensorId},
53 
54     mctpRequester(mctpRequester), objectServer(objectServer)
55 
56 {
57     std::string dbusPath = sensorPathPrefix + "power/"s + escapeName(name);
58 
59     sensorInterface = objectServer.add_interface(
60         dbusPath, "xyz.openbmc_project.Sensor.Value");
61 
62     for (const auto& threshold : thresholds)
63     {
64         std::string interface = thresholds::getInterface(threshold.level);
65         thresholdInterfaces[static_cast<size_t>(threshold.level)] =
66             objectServer.add_interface(dbusPath, interface);
67     }
68 
69     association = objectServer.add_interface(dbusPath, association::interface);
70 
71     setInitialProperties(sensor_paths::unitWatts);
72 
73     const std::optional<std::string> physicalContext =
74         nvidia_sensor_utils::deviceTypeToPhysicalContext(deviceType);
75 
76     if (physicalContext)
77     {
78         commonPhysicalContextInterface = objectServer.add_interface(
79             dbusPath, "xyz.openbmc_project.Common.PhysicalContext");
80 
81         commonPhysicalContextInterface->register_property("Type",
82                                                           *physicalContext);
83 
84         if (!commonPhysicalContextInterface->initialize())
85         {
86             lg2::error(
87                 "Error initializing PhysicalContext Interface for Power Sensor for eid {EID} and sensor id {SID}",
88                 "EID", eid, "SID", sensorId);
89         }
90     }
91 }
92 
~NvidiaGpuPowerSensor()93 NvidiaGpuPowerSensor::~NvidiaGpuPowerSensor()
94 {
95     for (const auto& iface : thresholdInterfaces)
96     {
97         objectServer.remove_interface(iface);
98     }
99     objectServer.remove_interface(association);
100     objectServer.remove_interface(sensorInterface);
101     if (commonPhysicalContextInterface)
102     {
103         objectServer.remove_interface(commonPhysicalContextInterface);
104     }
105 }
106 
checkThresholds()107 void NvidiaGpuPowerSensor::checkThresholds()
108 {
109     thresholds::checkThresholds(this);
110 }
111 
processResponse(const std::error_code & ec,std::span<const uint8_t> buffer)112 void NvidiaGpuPowerSensor::processResponse(const std::error_code& ec,
113                                            std::span<const uint8_t> buffer)
114 {
115     if (ec)
116     {
117         lg2::error(
118             "Error updating Power Sensor for eid {EID} and sensor id {SID} : sending message over MCTP failed, rc={RC}",
119             "EID", eid, "SID", sensorId, "RC", ec.message());
120         return;
121     }
122 
123     ocp::accelerator_management::CompletionCode cc{};
124     uint16_t reasonCode = 0;
125     uint32_t power = 0;
126 
127     const int rc =
128         gpu::decodeGetPowerDrawResponse(buffer, cc, reasonCode, power);
129 
130     if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
131     {
132         lg2::error(
133             "Error updating Power Sensor eid {EID} and sensor id {SID} : decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
134             "EID", eid, "SID", sensorId, "RC", rc, "CC", cc, "RESC",
135             reasonCode);
136         return;
137     }
138 
139     // Reading from the device is in milliwatts and unit set on the dbus
140     // is watts.
141     updateValue(power / 1000.0);
142 }
143 
update()144 void NvidiaGpuPowerSensor::update()
145 {
146     const int rc = gpu::encodeGetPowerDrawRequest(
147         gpu::PlatformEnvironmentalCommands::GET_CURRENT_POWER_DRAW, 0, sensorId,
148         averagingInterval, request);
149 
150     if (rc != 0)
151     {
152         lg2::error(
153             "Error updating Temperature Sensor for eid {EID} and sensor id {SID} : encode failed, rc={RC}",
154             "EID", eid, "SID", sensorId, "RC", rc);
155     }
156 
157     mctpRequester.sendRecvMsg(
158         eid, request,
159         [weak{weak_from_this()}](const std::error_code& ec,
160                                  std::span<const uint8_t> buffer) {
161             std::shared_ptr<NvidiaGpuPowerSensor> self = weak.lock();
162             if (!self)
163             {
164                 lg2::error("Invalid reference to NvidiaGpuPowerSensor");
165                 return;
166             }
167             self->processResponse(ec, buffer);
168         });
169 }
170