xref: /openbmc/phosphor-pid-control/dbus/dbuspassive.cpp (revision ccb9e727d9c87f1dc2c0784ed45e54e9168c4200)
1 // SPDX-License-Identifier: Apache-2.0
2 // SPDX-FileCopyrightText: Copyright 2017 Google Inc
3 
4 #include "config.h"
5 
6 #include "dbuspassive.hpp"
7 
8 #include "conf.hpp"
9 #include "dbushelper_interface.hpp"
10 #include "dbuspassiveredundancy.hpp"
11 #include "dbusutil.hpp"
12 #include "failsafeloggers/failsafe_logger_utility.hpp"
13 #include "interfaces.hpp"
14 #include "util.hpp"
15 
16 #include <systemd/sd-bus.h>
17 
18 #include <sdbusplus/bus.hpp>
19 #include <sdbusplus/message.hpp>
20 #include <xyz/openbmc_project/Sensor/Threshold/Critical/common.hpp>
21 #include <xyz/openbmc_project/Sensor/Threshold/Warning/common.hpp>
22 #include <xyz/openbmc_project/Sensor/Value/client.hpp>
23 #include <xyz/openbmc_project/State/Decorator/Availability/common.hpp>
24 #include <xyz/openbmc_project/State/Decorator/OperationalStatus/common.hpp>
25 
26 #include <chrono>
27 #include <cmath>
28 #include <cstdint>
29 #include <exception>
30 #include <limits>
31 #include <map>
32 #include <memory>
33 #include <mutex>
34 #include <set>
35 #include <string>
36 #include <utility>
37 #include <variant>
38 
39 #include "failsafeloggers/failsafe_logger.cpp"
40 
41 using SensorValue = sdbusplus::common::xyz::openbmc_project::sensor::Value;
42 using SensorThresholdWarning =
43     sdbusplus::common::xyz::openbmc_project::sensor::threshold::Warning;
44 using SensorThresholdCritical =
45     sdbusplus::common::xyz::openbmc_project::sensor::threshold::Critical;
46 using StateDecoratorAvailability =
47     sdbusplus::common::xyz::openbmc_project::state::decorator::Availability;
48 using StateDecoratorOperationalStatus = sdbusplus::common::xyz::
49     openbmc_project::state::decorator::OperationalStatus;
50 
51 namespace pid_control
52 {
53 
createDbusPassive(sdbusplus::bus_t & bus,const std::string & type,const std::string & id,std::unique_ptr<DbusHelperInterface> helper,const conf::SensorConfig * info,const std::shared_ptr<DbusPassiveRedundancy> & redundancy)54 std::unique_ptr<ReadInterface> DbusPassive::createDbusPassive(
55     sdbusplus::bus_t& bus, const std::string& type, const std::string& id,
56     std::unique_ptr<DbusHelperInterface> helper, const conf::SensorConfig* info,
57     const std::shared_ptr<DbusPassiveRedundancy>& redundancy)
58 {
59     if (helper == nullptr)
60     {
61         return nullptr;
62     }
63     if (!validType(type))
64     {
65         return nullptr;
66     }
67 
68     /* Need to get the scale and initial value */
69     /* service == busname */
70     std::string path;
71     if (info->readPath.empty())
72     {
73         path = getSensorPath(type, id);
74     }
75     else
76     {
77         path = info->readPath;
78     }
79 
80     SensorProperties settings;
81     bool failed;
82     bool objectMissing = false;
83     std::string service;
84 
85     try
86     {
87         service = helper->getService(SensorValue::interface, path);
88     }
89     catch (const std::exception& e)
90     {
91 #ifndef HANDLE_MISSING_OBJECT_PATHS
92         return nullptr;
93 #else
94         // CASE1: The sensor is not on DBus, but as it is not in the
95         // MissingIsAcceptable list, the sensor should be built with a failed
96         // state to send the zone to failsafe mode. Everything will recover if
97         // all important sensors are back to DBus. swampd will be informed
98         // through InterfacesAdded signals and the sensors will be built again.
99 
100         // CASE2: The sensor is on D-Bus (getService succeeds) but getProperties
101         // fails (e.g., D-Bus error or property fetch failure). In this case,
102         // handle-missing-object-paths does not apply. The sensor build fails,
103         // and the control loop will keep restarting until getProperties
104         // succeeds.
105 
106         // Only CASE1 may send the zone to failsafe mode if the sensor is not
107         // in MissingIsAcceptable. CASE2 results in continuous restart until
108         // recovery.
109         objectMissing = true;
110         auto sensor = std::make_unique<DbusPassive>(
111             bus, type, id, std::move(helper), objectMissing, path, redundancy);
112         failed = true;
113         settings.value = std::numeric_limits<double>::quiet_NaN();
114         settings.unit = getSensorUnit(type);
115         settings.available = false;
116         settings.unavailableAsFailed = true;
117         if (info->ignoreDbusMinMax)
118         {
119             settings.min = 0;
120             settings.max = 0;
121         }
122         sensor->initFromSettings(settings, true);
123         std::cerr << "DbusPassive: Sensor " << path
124                   << " is missing from D-Bus, build this sensor as failed\n";
125         return sensor;
126 #endif
127     }
128 
129     auto sensor = std::make_unique<DbusPassive>(
130         bus, type, id, std::move(helper), objectMissing, path, redundancy);
131 
132     try
133     {
134         sensor->_helper->getProperties(service, path, &settings);
135         failed = sensor->_helper->thresholdsAsserted(service, path);
136     }
137     catch (const std::exception& e)
138     {
139         return nullptr;
140     }
141 
142     /* if these values are zero, they're ignored. */
143     if (info->ignoreDbusMinMax)
144     {
145         settings.min = 0;
146         settings.max = 0;
147     }
148 
149     settings.unavailableAsFailed = info->unavailableAsFailed;
150     sensor->initFromSettings(settings, failed);
151 
152     return sensor;
153 }
154 
DbusPassive(sdbusplus::bus_t & bus,const std::string & type,const std::string & id,std::unique_ptr<DbusHelperInterface> helper,bool objectMissing,const std::string & path,const std::shared_ptr<DbusPassiveRedundancy> & redundancy)155 DbusPassive::DbusPassive(
156     sdbusplus::bus_t& bus, const std::string& type, const std::string& id,
157     std::unique_ptr<DbusHelperInterface> helper, bool objectMissing,
158     const std::string& path,
159     const std::shared_ptr<DbusPassiveRedundancy>& redundancy) :
160     ReadInterface(), _signal(bus, getMatch(path), dbusHandleSignal, this),
161     _id(id), _helper(std::move(helper)), _objectMissing(objectMissing),
162     path(path), redundancy(redundancy)
163 
164 {
165     // Cache this type knowledge, to avoid repeated string comparison
166     _typeMargin = (type == "margin");
167     _typeFan = (type == "fan");
168 }
169 
read(void)170 ReadReturn DbusPassive::read(void)
171 {
172     std::lock_guard<std::mutex> guard(_lock);
173 
174     ReadReturn r = {_value, _updated, _unscaled};
175 
176     return r;
177 }
178 
setValue(double value,double unscaled)179 void DbusPassive::setValue(double value, double unscaled)
180 {
181     std::lock_guard<std::mutex> guard(_lock);
182 
183     _value = value;
184     _unscaled = unscaled;
185     _updated = std::chrono::high_resolution_clock::now();
186 }
187 
setValue(double value)188 void DbusPassive::setValue(double value)
189 {
190     // First param is scaled, second param is unscaled, assume same here
191     setValue(value, value);
192 }
193 
getFailed(void) const194 bool DbusPassive::getFailed(void) const
195 {
196     if (redundancy)
197     {
198         const std::set<std::string>& failures = redundancy->getFailed();
199         if (failures.find(path) != failures.end())
200         {
201             outputFailsafeLogWithSensor(_id, true, _id,
202                                         "The sensor path is marked redundant.");
203             return true;
204         }
205     }
206 
207     /*
208      * If handle-missing-object-paths is enabled, and the expected D-Bus object
209      * path is not exported, this sensor is created to represent that condition.
210      * Indicate this sensor has failed so the zone enters failSafe mode.
211      */
212     if (_objectMissing)
213     {
214         outputFailsafeLogWithSensor(_id, true, _id,
215                                     "The sensor D-Bus object is missing.");
216         return true;
217     }
218 
219     /*
220      * Unavailable thermal sensors, who are not present or
221      * power-state-not-matching, should not trigger the failSafe mode. For
222      * example, when a system stays at a powered-off state, its CPU Temp
223      * sensors will be unavailable, these unavailable sensors should not be
224      * treated as failed and trigger failSafe.
225      * This is important for systems whose Fans are always on.
226      */
227     if (!_typeFan && !_available && !_unavailableAsFailed)
228     {
229         return false;
230     }
231 
232     // If a reading has came in,
233     // but its value bad in some way (determined by sensor type),
234     // indicate this sensor has failed,
235     // until another value comes in that is no longer bad.
236     // This is different from the overall _failed flag,
237     // which is set and cleared by other causes.
238     if (_badReading)
239     {
240         outputFailsafeLogWithSensor(_id, true, _id,
241                                     "The sensor has bad readings.");
242         return true;
243     }
244 
245     // If a reading has came in, and it is not a bad reading,
246     // but it indicates there is no more thermal margin left,
247     // that is bad, something is wrong with the PID loops,
248     // they are not cooling the system, enable failsafe mode also.
249     if (_marginHot)
250     {
251         outputFailsafeLogWithSensor(_id, true, _id,
252                                     "The sensor has no thermal margin left.");
253         return true;
254     }
255 
256     if (_failed)
257     {
258         outputFailsafeLogWithSensor(
259             _id, true, _id, "The sensor has failed with a critical issue.");
260         return true;
261     }
262 
263     if (!_available)
264     {
265         outputFailsafeLogWithSensor(_id, true, _id,
266                                     "The sensor is unavailable.");
267         return true;
268     }
269 
270     if (!_functional)
271     {
272         outputFailsafeLogWithSensor(_id, true, _id,
273                                     "The sensor is not functional.");
274         return true;
275     }
276 
277     outputFailsafeLogWithSensor(_id, false, _id, "The sensor has recovered.");
278 
279     return false;
280 }
281 
getFailReason(void) const282 std::string DbusPassive::getFailReason(void) const
283 {
284     if (_objectMissing)
285     {
286         return "Sensor D-Bus object missing";
287     }
288     if (_badReading)
289     {
290         return "Sensor reading bad";
291     }
292     if (_marginHot)
293     {
294         return "Margin hot";
295     }
296     if (_failed)
297     {
298         return "Sensor threshold asserted";
299     }
300     if (!_available)
301     {
302         return "Sensor unavailable";
303     }
304     if (!_functional)
305     {
306         return "Sensor not functional";
307     }
308     return "Unknown";
309 }
310 
setFailed(bool value)311 void DbusPassive::setFailed(bool value)
312 {
313     _failed = value;
314 }
315 
setFunctional(bool value)316 void DbusPassive::setFunctional(bool value)
317 {
318     _functional = value;
319 }
320 
setAvailable(bool value)321 void DbusPassive::setAvailable(bool value)
322 {
323     _available = value;
324     _availableOverridden = true;
325 }
326 
initFromSettings(const SensorProperties & settings,bool failed)327 void DbusPassive::initFromSettings(const SensorProperties& settings,
328                                    bool failed)
329 {
330     _failed = failed;
331     _scale = settings.scale;
332     _min = settings.min * std::pow(10.0, _scale);
333     _max = settings.max * std::pow(10.0, _scale);
334     _unavailableAsFailed = settings.unavailableAsFailed;
335     setAvailableFromProperty(settings.available);
336 
337     // Force value to be stored, otherwise member would be uninitialized
338     updateValue(settings.value, true);
339 }
340 
setAvailableFromProperty(bool value)341 void DbusPassive::setAvailableFromProperty(bool value)
342 {
343     if (!_availableOverridden)
344     {
345         _available = value;
346     }
347 }
348 
getScale(void)349 int64_t DbusPassive::getScale(void)
350 {
351     return _scale;
352 }
353 
getID(void)354 std::string DbusPassive::getID(void)
355 {
356     return _id;
357 }
358 
getMax(void)359 double DbusPassive::getMax(void)
360 {
361     return _max;
362 }
363 
getMin(void)364 double DbusPassive::getMin(void)
365 {
366     return _min;
367 }
368 
updateValue(double value,bool force)369 void DbusPassive::updateValue(double value, bool force)
370 {
371     _badReading = false;
372 
373     // Do not let a NAN, or other floating-point oddity, be used to update
374     // the value, as that indicates the sensor has no valid reading.
375     if (!(std::isfinite(value)))
376     {
377         _badReading = true;
378 
379         // Do not continue with a bad reading, unless caller forcing
380         if (!force)
381         {
382             return;
383         }
384     }
385 
386     value *= std::pow(10.0, _scale);
387 
388     auto unscaled = value;
389     scaleSensorReading(_min, _max, value);
390 
391     if (_typeMargin)
392     {
393         _marginHot = false;
394 
395         // Unlike an absolute temperature sensor,
396         // where 0 degrees C is a good reading,
397         // a value received of 0 (or negative) margin is worrisome,
398         // and should be flagged.
399         // Either it indicates margin not calculated properly,
400         // or somebody forgot to set the margin-zero setpoint,
401         // or the system is really overheating that much.
402         // This is a different condition from _failed
403         // and _badReading, so it merits its own flag.
404         // The sensor has not failed, the reading is good, but the zone
405         // still needs to know that it should go to failsafe mode.
406         if (unscaled <= 0.0)
407         {
408             _marginHot = true;
409         }
410     }
411 
412     setValue(value, unscaled);
413 }
414 
handleSensorValue(sdbusplus::message_t & msg,DbusPassive * owner)415 int handleSensorValue(sdbusplus::message_t& msg, DbusPassive* owner)
416 {
417     std::string msgSensor;
418     std::map<std::string, std::variant<int64_t, double, bool>> msgData;
419 
420     msg.read(msgSensor, msgData);
421 
422     if (msgSensor == SensorValue::interface)
423     {
424         auto valPropMap = msgData.find(SensorValue::property_names::value);
425         if (valPropMap != msgData.end())
426         {
427             double value =
428                 std::visit(VariantToDoubleVisitor(), valPropMap->second);
429 
430             owner->updateValue(value, false);
431         }
432     }
433     else if (msgSensor == SensorThresholdCritical::interface)
434     {
435         auto criticalAlarmLow = msgData.find(
436             SensorThresholdCritical::property_names::critical_alarm_low);
437         auto criticalAlarmHigh = msgData.find(
438             SensorThresholdCritical::property_names::critical_alarm_high);
439         if (criticalAlarmHigh == msgData.end() &&
440             criticalAlarmLow == msgData.end())
441         {
442             return 0;
443         }
444 
445         bool asserted = false;
446         if (criticalAlarmLow != msgData.end())
447         {
448             asserted = std::get<bool>(criticalAlarmLow->second);
449         }
450 
451         // checking both as in theory you could de-assert one threshold and
452         // assert the other at the same moment
453         if (!asserted && criticalAlarmHigh != msgData.end())
454         {
455             asserted = std::get<bool>(criticalAlarmHigh->second);
456         }
457         owner->setFailed(asserted);
458     }
459 #ifdef UNC_FAILSAFE
460     else if (msgSensor == SensorThresholdWarning::interface)
461     {
462         auto warningAlarmHigh = msgData.find(
463             SensorThresholdWarning::property_names::warning_alarm_high);
464         if (warningAlarmHigh == msgData.end())
465         {
466             return 0;
467         }
468 
469         bool asserted = false;
470         if (warningAlarmHigh != msgData.end())
471         {
472             asserted = std::get<bool>(warningAlarmHigh->second);
473         }
474         owner->setFailed(asserted);
475     }
476 #endif
477     else if (msgSensor == StateDecoratorAvailability::interface)
478     {
479         auto available =
480             msgData.find(StateDecoratorAvailability::property_names::available);
481         if (available == msgData.end())
482         {
483             return 0;
484         }
485         bool asserted = std::get<bool>(available->second);
486         owner->setAvailable(asserted);
487         if (!asserted)
488         {
489             // A thermal controller will continue its PID calculation and not
490             // trigger a 'failsafe' when some inputs are unavailable.
491             // So, forced to clear the value here to prevent a historical
492             // value to participate in a latter PID calculation.
493             owner->updateValue(std::numeric_limits<double>::quiet_NaN(), true);
494         }
495     }
496     else if (msgSensor == StateDecoratorOperationalStatus::interface)
497     {
498         auto functional = msgData.find(
499             StateDecoratorOperationalStatus::property_names::functional);
500         if (functional == msgData.end())
501         {
502             return 0;
503         }
504         bool asserted = std::get<bool>(functional->second);
505         owner->setFunctional(asserted);
506     }
507 
508     return 0;
509 }
510 
dbusHandleSignal(sd_bus_message * msg,void * usrData,sd_bus_error * err)511 int dbusHandleSignal(sd_bus_message* msg, void* usrData,
512                      [[maybe_unused]] sd_bus_error* err)
513 {
514     auto sdbpMsg = sdbusplus::message_t(msg);
515     DbusPassive* obj = static_cast<DbusPassive*>(usrData);
516 
517     return handleSensorValue(sdbpMsg, obj);
518 }
519 
520 } // namespace pid_control
521