xref: /openbmc/phosphor-pid-control/dbus/dbuspassive.cpp (revision 46a755fce8dc0bdd9c0c5ea09d55d3e5494f335f)
1 // SPDX-License-Identifier: Apache-2.0
2 // SPDX-FileCopyrightText: Copyright 2017 Google Inc
3 
4 #include "config.h"
5 
6 #include "dbuspassive.hpp"
7 
8 #include "conf.hpp"
9 #include "dbushelper_interface.hpp"
10 #include "dbuspassiveredundancy.hpp"
11 #include "dbusutil.hpp"
12 #include "failsafeloggers/failsafe_logger_utility.hpp"
13 #include "interfaces.hpp"
14 #include "util.hpp"
15 
16 #include <systemd/sd-bus.h>
17 
18 #include <sdbusplus/bus.hpp>
19 #include <sdbusplus/message.hpp>
20 
21 #include <chrono>
22 #include <cmath>
23 #include <cstdint>
24 #include <exception>
25 #include <limits>
26 #include <map>
27 #include <memory>
28 #include <mutex>
29 #include <set>
30 #include <string>
31 #include <utility>
32 #include <variant>
33 
34 #include "failsafeloggers/failsafe_logger.cpp"
35 
36 namespace pid_control
37 {
38 
39 std::unique_ptr<ReadInterface> DbusPassive::createDbusPassive(
40     sdbusplus::bus_t& bus, const std::string& type, const std::string& id,
41     std::unique_ptr<DbusHelperInterface> helper, const conf::SensorConfig* info,
42     const std::shared_ptr<DbusPassiveRedundancy>& redundancy)
43 {
44     if (helper == nullptr)
45     {
46         return nullptr;
47     }
48     if (!validType(type))
49     {
50         return nullptr;
51     }
52 
53     /* Need to get the scale and initial value */
54     /* service == busname */
55     std::string path;
56     if (info->readPath.empty())
57     {
58         path = getSensorPath(type, id);
59     }
60     else
61     {
62         path = info->readPath;
63     }
64 
65     SensorProperties settings;
66     bool failed;
67     bool objectMissing = false;
68     std::string service;
69 
70     try
71     {
72         service = helper->getService(sensorintf, path);
73     }
74     catch (const std::exception& e)
75     {
76 #ifndef HANDLE_MISSING_OBJECT_PATHS
77         return nullptr;
78 #else
79         // CASE1: The sensor is not on DBus, but as it is not in the
80         // MissingIsAcceptable list, the sensor should be built with a failed
81         // state to send the zone to failsafe mode. Everything will recover if
82         // all important sensors are back to DBus. swampd will be informed
83         // through InterfacesAdded signals and the sensors will be built again.
84 
85         // CASE2: The sensor is on D-Bus (getService succeeds) but getProperties
86         // fails (e.g., D-Bus error or property fetch failure). In this case,
87         // handle-missing-object-paths does not apply. The sensor build fails,
88         // and the control loop will keep restarting until getProperties
89         // succeeds.
90 
91         // Only CASE1 may send the zone to failsafe mode if the sensor is not
92         // in MissingIsAcceptable. CASE2 results in continuous restart until
93         // recovery.
94 
95         failed = true;
96         objectMissing = true;
97         settings.value = std::numeric_limits<double>::quiet_NaN();
98         settings.unit = getSensorUnit(type);
99         settings.available = false;
100         settings.unavailableAsFailed = true;
101         if (info->ignoreDbusMinMax)
102         {
103             settings.min = 0;
104             settings.max = 0;
105         }
106         std::cerr << "DbusPassive: Sensor " << path
107                   << " is missing from D-Bus, build this sensor as failed\n";
108         return std::make_unique<DbusPassive>(
109             bus, type, id, std::move(helper), settings, failed, objectMissing,
110             path, redundancy);
111 #endif
112     }
113 
114     try
115     {
116         helper->getProperties(service, path, &settings);
117         failed = helper->thresholdsAsserted(service, path);
118     }
119     catch (const std::exception& e)
120     {
121         return nullptr;
122     }
123 
124     /* if these values are zero, they're ignored. */
125     if (info->ignoreDbusMinMax)
126     {
127         settings.min = 0;
128         settings.max = 0;
129     }
130 
131     settings.unavailableAsFailed = info->unavailableAsFailed;
132 
133     return std::make_unique<DbusPassive>(
134         bus, type, id, std::move(helper), settings, failed, objectMissing, path,
135         redundancy);
136 }
137 
138 DbusPassive::DbusPassive(
139     sdbusplus::bus_t& bus, const std::string& type, const std::string& id,
140     std::unique_ptr<DbusHelperInterface> helper,
141     const SensorProperties& settings, bool failed, bool objectMissing,
142     const std::string& path,
143     const std::shared_ptr<DbusPassiveRedundancy>& redundancy) :
144     ReadInterface(), _signal(bus, getMatch(path), dbusHandleSignal, this),
145     _id(id), _helper(std::move(helper)), _failed(failed),
146     _objectMissing(objectMissing), path(path), redundancy(redundancy)
147 
148 {
149     _scale = settings.scale;
150     _min = settings.min * std::pow(10.0, _scale);
151     _max = settings.max * std::pow(10.0, _scale);
152     _available = settings.available;
153     _unavailableAsFailed = settings.unavailableAsFailed;
154 
155     // Cache this type knowledge, to avoid repeated string comparison
156     _typeMargin = (type == "margin");
157     _typeFan = (type == "fan");
158 
159     // Force value to be stored, otherwise member would be uninitialized
160     updateValue(settings.value, true);
161 }
162 
163 ReadReturn DbusPassive::read(void)
164 {
165     std::lock_guard<std::mutex> guard(_lock);
166 
167     ReadReturn r = {_value, _updated, _unscaled};
168 
169     return r;
170 }
171 
172 void DbusPassive::setValue(double value, double unscaled)
173 {
174     std::lock_guard<std::mutex> guard(_lock);
175 
176     _value = value;
177     _unscaled = unscaled;
178     _updated = std::chrono::high_resolution_clock::now();
179 }
180 
181 void DbusPassive::setValue(double value)
182 {
183     // First param is scaled, second param is unscaled, assume same here
184     setValue(value, value);
185 }
186 
187 bool DbusPassive::getFailed(void) const
188 {
189     if (redundancy)
190     {
191         const std::set<std::string>& failures = redundancy->getFailed();
192         if (failures.find(path) != failures.end())
193         {
194             outputFailsafeLogWithSensor(_id, true, _id,
195                                         "The sensor path is marked redundant.");
196             return true;
197         }
198     }
199 
200     /*
201      * If handle-missing-object-paths is enabled, and the expected D-Bus object
202      * path is not exported, this sensor is created to represent that condition.
203      * Indicate this sensor has failed so the zone enters failSafe mode.
204      */
205     if (_objectMissing)
206     {
207         outputFailsafeLogWithSensor(_id, true, _id,
208                                     "The sensor D-Bus object is missing.");
209         return true;
210     }
211 
212     /*
213      * Unavailable thermal sensors, who are not present or
214      * power-state-not-matching, should not trigger the failSafe mode. For
215      * example, when a system stays at a powered-off state, its CPU Temp
216      * sensors will be unavailable, these unavailable sensors should not be
217      * treated as failed and trigger failSafe.
218      * This is important for systems whose Fans are always on.
219      */
220     if (!_typeFan && !_available && !_unavailableAsFailed)
221     {
222         return false;
223     }
224 
225     // If a reading has came in,
226     // but its value bad in some way (determined by sensor type),
227     // indicate this sensor has failed,
228     // until another value comes in that is no longer bad.
229     // This is different from the overall _failed flag,
230     // which is set and cleared by other causes.
231     if (_badReading)
232     {
233         outputFailsafeLogWithSensor(_id, true, _id,
234                                     "The sensor has bad readings.");
235         return true;
236     }
237 
238     // If a reading has came in, and it is not a bad reading,
239     // but it indicates there is no more thermal margin left,
240     // that is bad, something is wrong with the PID loops,
241     // they are not cooling the system, enable failsafe mode also.
242     if (_marginHot)
243     {
244         outputFailsafeLogWithSensor(_id, true, _id,
245                                     "The sensor has no thermal margin left.");
246         return true;
247     }
248 
249     if (_failed)
250     {
251         outputFailsafeLogWithSensor(
252             _id, true, _id, "The sensor has failed with a critical issue.");
253         return true;
254     }
255 
256     if (!_available)
257     {
258         outputFailsafeLogWithSensor(_id, true, _id,
259                                     "The sensor is unavailable.");
260         return true;
261     }
262 
263     if (!_functional)
264     {
265         outputFailsafeLogWithSensor(_id, true, _id,
266                                     "The sensor is not functional.");
267         return true;
268     }
269 
270     outputFailsafeLogWithSensor(_id, false, _id, "The sensor has recovered.");
271 
272     return false;
273 }
274 
275 std::string DbusPassive::getFailReason(void) const
276 {
277     if (_objectMissing)
278     {
279         return "Sensor D-Bus object missing";
280     }
281     if (_badReading)
282     {
283         return "Sensor reading bad";
284     }
285     if (_marginHot)
286     {
287         return "Margin hot";
288     }
289     if (_failed)
290     {
291         return "Sensor threshold asserted";
292     }
293     if (!_available)
294     {
295         return "Sensor unavailable";
296     }
297     if (!_functional)
298     {
299         return "Sensor not functional";
300     }
301     return "Unknown";
302 }
303 
304 void DbusPassive::setFailed(bool value)
305 {
306     _failed = value;
307 }
308 
309 void DbusPassive::setFunctional(bool value)
310 {
311     _functional = value;
312 }
313 
314 void DbusPassive::setAvailable(bool value)
315 {
316     _available = value;
317 }
318 
319 int64_t DbusPassive::getScale(void)
320 {
321     return _scale;
322 }
323 
324 std::string DbusPassive::getID(void)
325 {
326     return _id;
327 }
328 
329 double DbusPassive::getMax(void)
330 {
331     return _max;
332 }
333 
334 double DbusPassive::getMin(void)
335 {
336     return _min;
337 }
338 
339 void DbusPassive::updateValue(double value, bool force)
340 {
341     _badReading = false;
342 
343     // Do not let a NAN, or other floating-point oddity, be used to update
344     // the value, as that indicates the sensor has no valid reading.
345     if (!(std::isfinite(value)))
346     {
347         _badReading = true;
348 
349         // Do not continue with a bad reading, unless caller forcing
350         if (!force)
351         {
352             return;
353         }
354     }
355 
356     value *= std::pow(10.0, _scale);
357 
358     auto unscaled = value;
359     scaleSensorReading(_min, _max, value);
360 
361     if (_typeMargin)
362     {
363         _marginHot = false;
364 
365         // Unlike an absolute temperature sensor,
366         // where 0 degrees C is a good reading,
367         // a value received of 0 (or negative) margin is worrisome,
368         // and should be flagged.
369         // Either it indicates margin not calculated properly,
370         // or somebody forgot to set the margin-zero setpoint,
371         // or the system is really overheating that much.
372         // This is a different condition from _failed
373         // and _badReading, so it merits its own flag.
374         // The sensor has not failed, the reading is good, but the zone
375         // still needs to know that it should go to failsafe mode.
376         if (unscaled <= 0.0)
377         {
378             _marginHot = true;
379         }
380     }
381 
382     setValue(value, unscaled);
383 }
384 
385 int handleSensorValue(sdbusplus::message_t& msg, DbusPassive* owner)
386 {
387     std::string msgSensor;
388     std::map<std::string, std::variant<int64_t, double, bool>> msgData;
389 
390     msg.read(msgSensor, msgData);
391 
392     if (msgSensor == "xyz.openbmc_project.Sensor.Value")
393     {
394         auto valPropMap = msgData.find("Value");
395         if (valPropMap != msgData.end())
396         {
397             double value =
398                 std::visit(VariantToDoubleVisitor(), valPropMap->second);
399 
400             owner->updateValue(value, false);
401         }
402     }
403     else if (msgSensor == "xyz.openbmc_project.Sensor.Threshold.Critical")
404     {
405         auto criticalAlarmLow = msgData.find("CriticalAlarmLow");
406         auto criticalAlarmHigh = msgData.find("CriticalAlarmHigh");
407         if (criticalAlarmHigh == msgData.end() &&
408             criticalAlarmLow == msgData.end())
409         {
410             return 0;
411         }
412 
413         bool asserted = false;
414         if (criticalAlarmLow != msgData.end())
415         {
416             asserted = std::get<bool>(criticalAlarmLow->second);
417         }
418 
419         // checking both as in theory you could de-assert one threshold and
420         // assert the other at the same moment
421         if (!asserted && criticalAlarmHigh != msgData.end())
422         {
423             asserted = std::get<bool>(criticalAlarmHigh->second);
424         }
425         owner->setFailed(asserted);
426     }
427 #ifdef UNC_FAILSAFE
428     else if (msgSensor == "xyz.openbmc_project.Sensor.Threshold.Warning")
429     {
430         auto warningAlarmHigh = msgData.find("WarningAlarmHigh");
431         if (warningAlarmHigh == msgData.end())
432         {
433             return 0;
434         }
435 
436         bool asserted = false;
437         if (warningAlarmHigh != msgData.end())
438         {
439             asserted = std::get<bool>(warningAlarmHigh->second);
440         }
441         owner->setFailed(asserted);
442     }
443 #endif
444     else if (msgSensor == "xyz.openbmc_project.State.Decorator.Availability")
445     {
446         auto available = msgData.find("Available");
447         if (available == msgData.end())
448         {
449             return 0;
450         }
451         bool asserted = std::get<bool>(available->second);
452         owner->setAvailable(asserted);
453         if (!asserted)
454         {
455             // A thermal controller will continue its PID calculation and not
456             // trigger a 'failsafe' when some inputs are unavailable.
457             // So, forced to clear the value here to prevent a historical
458             // value to participate in a latter PID calculation.
459             owner->updateValue(std::numeric_limits<double>::quiet_NaN(), true);
460         }
461     }
462     else if (msgSensor ==
463              "xyz.openbmc_project.State.Decorator.OperationalStatus")
464     {
465         auto functional = msgData.find("Functional");
466         if (functional == msgData.end())
467         {
468             return 0;
469         }
470         bool asserted = std::get<bool>(functional->second);
471         owner->setFunctional(asserted);
472     }
473 
474     return 0;
475 }
476 
477 int dbusHandleSignal(sd_bus_message* msg, void* usrData,
478                      [[maybe_unused]] sd_bus_error* err)
479 {
480     auto sdbpMsg = sdbusplus::message_t(msg);
481     DbusPassive* obj = static_cast<DbusPassive*>(usrData);
482 
483     return handleSensorValue(sdbpMsg, obj);
484 }
485 
486 } // namespace pid_control
487