xref: /openbmc/phosphor-pid-control/dbus/dbuspassive.cpp (revision f8b6e55147148c3cfb42327ff267197a460b411c)
1 /**
2  * Copyright 2017 Google Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "dbuspassive.hpp"
18 
19 #include "conf.hpp"
20 #include "dbushelper_interface.hpp"
21 #include "dbuspassiveredundancy.hpp"
22 #include "dbusutil.hpp"
23 #include "failsafeloggers/failsafe_logger_utility.hpp"
24 #include "interfaces.hpp"
25 #include "util.hpp"
26 
27 #include <systemd/sd-bus.h>
28 
29 #include <sdbusplus/bus.hpp>
30 #include <sdbusplus/message.hpp>
31 
32 #include <chrono>
33 #include <cmath>
34 #include <cstdint>
35 #include <exception>
36 #include <limits>
37 #include <map>
38 #include <memory>
39 #include <mutex>
40 #include <set>
41 #include <string>
42 #include <utility>
43 #include <variant>
44 
45 #include "failsafeloggers/failsafe_logger.cpp"
46 
47 namespace pid_control
48 {
49 
createDbusPassive(sdbusplus::bus_t & bus,const std::string & type,const std::string & id,std::unique_ptr<DbusHelperInterface> helper,const conf::SensorConfig * info,const std::shared_ptr<DbusPassiveRedundancy> & redundancy)50 std::unique_ptr<ReadInterface> DbusPassive::createDbusPassive(
51     sdbusplus::bus_t& bus, const std::string& type, const std::string& id,
52     std::unique_ptr<DbusHelperInterface> helper, const conf::SensorConfig* info,
53     const std::shared_ptr<DbusPassiveRedundancy>& redundancy)
54 {
55     if (helper == nullptr)
56     {
57         return nullptr;
58     }
59     if (!validType(type))
60     {
61         return nullptr;
62     }
63 
64     /* Need to get the scale and initial value */
65     /* service == busname */
66     std::string path;
67     if (info->readPath.empty())
68     {
69         path = getSensorPath(type, id);
70     }
71     else
72     {
73         path = info->readPath;
74     }
75 
76     SensorProperties settings;
77     bool failed;
78     std::string service;
79 
80     try
81     {
82         service = helper->getService(sensorintf, path);
83     }
84     catch (const std::exception& e)
85     {
86 #ifndef HANDLE_MISSING_OBJECT_PATHS
87         return nullptr;
88 #else
89         // CASE1: The sensor is not on DBus, but as it is not in the
90         // MissingIsAcceptable list, the sensor should be built with a failed
91         // state to send the zone to failsafe mode. Everything will recover if
92         // all important sensors are back to DBus. swampd will be informed
93         // through InterfacesAdded signals and the sensors will be built again.
94 
95         // CASE2: The sensor is on D-Bus (getService succeeds) but getProperties
96         // fails (e.g., D-Bus error or property fetch failure). In this case,
97         // handle-missing-object-paths does not apply. The sensor build fails,
98         // and the control loop will keep restarting until getProperties
99         // succeeds.
100 
101         // Only CASE1 may send the zone to failsafe mode if the sensor is not
102         // in MissingIsAcceptable. CASE2 results in continuous restart until
103         // recovery.
104 
105         failed = true;
106         settings.value = std::numeric_limits<double>::quiet_NaN();
107         settings.unit = getSensorUnit(type);
108         settings.available = false;
109         settings.unavailableAsFailed = true;
110         if (info->ignoreDbusMinMax)
111         {
112             settings.min = 0;
113             settings.max = 0;
114         }
115         std::cerr << "DbusPassive: Sensor " << path
116                   << " is missing from D-Bus, build this sensor as failed\n";
117         return std::make_unique<DbusPassive>(
118             bus, type, id, std::move(helper), settings, failed, path,
119             redundancy);
120 #endif
121     }
122 
123     try
124     {
125         helper->getProperties(service, path, &settings);
126         failed = helper->thresholdsAsserted(service, path);
127     }
128     catch (const std::exception& e)
129     {
130         return nullptr;
131     }
132 
133     /* if these values are zero, they're ignored. */
134     if (info->ignoreDbusMinMax)
135     {
136         settings.min = 0;
137         settings.max = 0;
138     }
139 
140     settings.unavailableAsFailed = info->unavailableAsFailed;
141 
142     return std::make_unique<DbusPassive>(bus, type, id, std::move(helper),
143                                          settings, failed, path, redundancy);
144 }
145 
DbusPassive(sdbusplus::bus_t & bus,const std::string & type,const std::string & id,std::unique_ptr<DbusHelperInterface> helper,const SensorProperties & settings,bool failed,const std::string & path,const std::shared_ptr<DbusPassiveRedundancy> & redundancy)146 DbusPassive::DbusPassive(
147     sdbusplus::bus_t& bus, const std::string& type, const std::string& id,
148     std::unique_ptr<DbusHelperInterface> helper,
149     const SensorProperties& settings, bool failed, const std::string& path,
150     const std::shared_ptr<DbusPassiveRedundancy>& redundancy) :
151     ReadInterface(), _signal(bus, getMatch(path), dbusHandleSignal, this),
152     _id(id), _helper(std::move(helper)), _failed(failed), path(path),
153     redundancy(redundancy)
154 
155 {
156     _scale = settings.scale;
157     _min = settings.min * std::pow(10.0, _scale);
158     _max = settings.max * std::pow(10.0, _scale);
159     _available = settings.available;
160     _unavailableAsFailed = settings.unavailableAsFailed;
161 
162     // Cache this type knowledge, to avoid repeated string comparison
163     _typeMargin = (type == "margin");
164     _typeFan = (type == "fan");
165 
166     // Force value to be stored, otherwise member would be uninitialized
167     updateValue(settings.value, true);
168 }
169 
read(void)170 ReadReturn DbusPassive::read(void)
171 {
172     std::lock_guard<std::mutex> guard(_lock);
173 
174     ReadReturn r = {_value, _updated, _unscaled};
175 
176     return r;
177 }
178 
setValue(double value,double unscaled)179 void DbusPassive::setValue(double value, double unscaled)
180 {
181     std::lock_guard<std::mutex> guard(_lock);
182 
183     _value = value;
184     _unscaled = unscaled;
185     _updated = std::chrono::high_resolution_clock::now();
186 }
187 
setValue(double value)188 void DbusPassive::setValue(double value)
189 {
190     // First param is scaled, second param is unscaled, assume same here
191     setValue(value, value);
192 }
193 
getFailed(void) const194 bool DbusPassive::getFailed(void) const
195 {
196     if (redundancy)
197     {
198         const std::set<std::string>& failures = redundancy->getFailed();
199         if (failures.find(path) != failures.end())
200         {
201             outputFailsafeLogWithSensor(_id, true, _id,
202                                         "The sensor path is marked redundant.");
203             return true;
204         }
205     }
206 
207     /*
208      * Unavailable thermal sensors, who are not present or
209      * power-state-not-matching, should not trigger the failSafe mode. For
210      * example, when a system stays at a powered-off state, its CPU Temp
211      * sensors will be unavailable, these unavailable sensors should not be
212      * treated as failed and trigger failSafe.
213      * This is important for systems whose Fans are always on.
214      */
215     if (!_typeFan && !_available && !_unavailableAsFailed)
216     {
217         return false;
218     }
219 
220     // If a reading has came in,
221     // but its value bad in some way (determined by sensor type),
222     // indicate this sensor has failed,
223     // until another value comes in that is no longer bad.
224     // This is different from the overall _failed flag,
225     // which is set and cleared by other causes.
226     if (_badReading)
227     {
228         outputFailsafeLogWithSensor(_id, true, _id,
229                                     "The sensor has bad readings.");
230         return true;
231     }
232 
233     // If a reading has came in, and it is not a bad reading,
234     // but it indicates there is no more thermal margin left,
235     // that is bad, something is wrong with the PID loops,
236     // they are not cooling the system, enable failsafe mode also.
237     if (_marginHot)
238     {
239         outputFailsafeLogWithSensor(_id, true, _id,
240                                     "The sensor has no thermal margin left.");
241         return true;
242     }
243 
244     if (_failed)
245     {
246         outputFailsafeLogWithSensor(
247             _id, true, _id, "The sensor has failed with a critical issue.");
248         return true;
249     }
250 
251     if (!_available)
252     {
253         outputFailsafeLogWithSensor(_id, true, _id,
254                                     "The sensor is unavailable.");
255         return true;
256     }
257 
258     if (!_functional)
259     {
260         outputFailsafeLogWithSensor(_id, true, _id,
261                                     "The sensor is not functional.");
262         return true;
263     }
264 
265     outputFailsafeLogWithSensor(_id, false, _id, "The sensor has recovered.");
266 
267     return false;
268 }
269 
getFailReason(void) const270 std::string DbusPassive::getFailReason(void) const
271 {
272     if (_badReading)
273     {
274         return "Sensor reading bad";
275     }
276     if (_marginHot)
277     {
278         return "Margin hot";
279     }
280     if (_failed)
281     {
282         return "Sensor threshold asserted";
283     }
284     if (!_available)
285     {
286         return "Sensor unavailable";
287     }
288     if (!_functional)
289     {
290         return "Sensor not functional";
291     }
292     return "Unknown";
293 }
294 
setFailed(bool value)295 void DbusPassive::setFailed(bool value)
296 {
297     _failed = value;
298 }
299 
setFunctional(bool value)300 void DbusPassive::setFunctional(bool value)
301 {
302     _functional = value;
303 }
304 
setAvailable(bool value)305 void DbusPassive::setAvailable(bool value)
306 {
307     _available = value;
308 }
309 
getScale(void)310 int64_t DbusPassive::getScale(void)
311 {
312     return _scale;
313 }
314 
getID(void)315 std::string DbusPassive::getID(void)
316 {
317     return _id;
318 }
319 
getMax(void)320 double DbusPassive::getMax(void)
321 {
322     return _max;
323 }
324 
getMin(void)325 double DbusPassive::getMin(void)
326 {
327     return _min;
328 }
329 
updateValue(double value,bool force)330 void DbusPassive::updateValue(double value, bool force)
331 {
332     _badReading = false;
333 
334     // Do not let a NAN, or other floating-point oddity, be used to update
335     // the value, as that indicates the sensor has no valid reading.
336     if (!(std::isfinite(value)))
337     {
338         _badReading = true;
339 
340         // Do not continue with a bad reading, unless caller forcing
341         if (!force)
342         {
343             return;
344         }
345     }
346 
347     value *= std::pow(10.0, _scale);
348 
349     auto unscaled = value;
350     scaleSensorReading(_min, _max, value);
351 
352     if (_typeMargin)
353     {
354         _marginHot = false;
355 
356         // Unlike an absolute temperature sensor,
357         // where 0 degrees C is a good reading,
358         // a value received of 0 (or negative) margin is worrisome,
359         // and should be flagged.
360         // Either it indicates margin not calculated properly,
361         // or somebody forgot to set the margin-zero setpoint,
362         // or the system is really overheating that much.
363         // This is a different condition from _failed
364         // and _badReading, so it merits its own flag.
365         // The sensor has not failed, the reading is good, but the zone
366         // still needs to know that it should go to failsafe mode.
367         if (unscaled <= 0.0)
368         {
369             _marginHot = true;
370         }
371     }
372 
373     setValue(value, unscaled);
374 }
375 
handleSensorValue(sdbusplus::message_t & msg,DbusPassive * owner)376 int handleSensorValue(sdbusplus::message_t& msg, DbusPassive* owner)
377 {
378     std::string msgSensor;
379     std::map<std::string, std::variant<int64_t, double, bool>> msgData;
380 
381     msg.read(msgSensor, msgData);
382 
383     if (msgSensor == "xyz.openbmc_project.Sensor.Value")
384     {
385         auto valPropMap = msgData.find("Value");
386         if (valPropMap != msgData.end())
387         {
388             double value =
389                 std::visit(VariantToDoubleVisitor(), valPropMap->second);
390 
391             owner->updateValue(value, false);
392         }
393     }
394     else if (msgSensor == "xyz.openbmc_project.Sensor.Threshold.Critical")
395     {
396         auto criticalAlarmLow = msgData.find("CriticalAlarmLow");
397         auto criticalAlarmHigh = msgData.find("CriticalAlarmHigh");
398         if (criticalAlarmHigh == msgData.end() &&
399             criticalAlarmLow == msgData.end())
400         {
401             return 0;
402         }
403 
404         bool asserted = false;
405         if (criticalAlarmLow != msgData.end())
406         {
407             asserted = std::get<bool>(criticalAlarmLow->second);
408         }
409 
410         // checking both as in theory you could de-assert one threshold and
411         // assert the other at the same moment
412         if (!asserted && criticalAlarmHigh != msgData.end())
413         {
414             asserted = std::get<bool>(criticalAlarmHigh->second);
415         }
416         owner->setFailed(asserted);
417     }
418 #ifdef UNC_FAILSAFE
419     else if (msgSensor == "xyz.openbmc_project.Sensor.Threshold.Warning")
420     {
421         auto warningAlarmHigh = msgData.find("WarningAlarmHigh");
422         if (warningAlarmHigh == msgData.end())
423         {
424             return 0;
425         }
426 
427         bool asserted = false;
428         if (warningAlarmHigh != msgData.end())
429         {
430             asserted = std::get<bool>(warningAlarmHigh->second);
431         }
432         owner->setFailed(asserted);
433     }
434 #endif
435     else if (msgSensor == "xyz.openbmc_project.State.Decorator.Availability")
436     {
437         auto available = msgData.find("Available");
438         if (available == msgData.end())
439         {
440             return 0;
441         }
442         bool asserted = std::get<bool>(available->second);
443         owner->setAvailable(asserted);
444         if (!asserted)
445         {
446             // A thermal controller will continue its PID calculation and not
447             // trigger a 'failsafe' when some inputs are unavailable.
448             // So, forced to clear the value here to prevent a historical
449             // value to participate in a latter PID calculation.
450             owner->updateValue(std::numeric_limits<double>::quiet_NaN(), true);
451         }
452     }
453     else if (msgSensor ==
454              "xyz.openbmc_project.State.Decorator.OperationalStatus")
455     {
456         auto functional = msgData.find("Functional");
457         if (functional == msgData.end())
458         {
459             return 0;
460         }
461         bool asserted = std::get<bool>(functional->second);
462         owner->setFunctional(asserted);
463     }
464 
465     return 0;
466 }
467 
dbusHandleSignal(sd_bus_message * msg,void * usrData,sd_bus_error * err)468 int dbusHandleSignal(sd_bus_message* msg, void* usrData,
469                      [[maybe_unused]] sd_bus_error* err)
470 {
471     auto sdbpMsg = sdbusplus::message_t(msg);
472     DbusPassive* obj = static_cast<DbusPassive*>(usrData);
473 
474     return handleSensorValue(sdbpMsg, obj);
475 }
476 
477 } // namespace pid_control
478