xref: /openbmc/phosphor-pid-control/dbus/dbuspassive.cpp (revision e1fa85942c66533699a3b785990d95e9c89b6050)
1 /**
2  * Copyright 2017 Google Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "config.h"
17 
18 #include "dbuspassive.hpp"
19 
20 #include "conf.hpp"
21 #include "dbushelper_interface.hpp"
22 #include "dbuspassiveredundancy.hpp"
23 #include "dbusutil.hpp"
24 #include "failsafeloggers/failsafe_logger_utility.hpp"
25 #include "interfaces.hpp"
26 #include "util.hpp"
27 
28 #include <systemd/sd-bus.h>
29 
30 #include <sdbusplus/bus.hpp>
31 #include <sdbusplus/message.hpp>
32 
33 #include <chrono>
34 #include <cmath>
35 #include <cstdint>
36 #include <exception>
37 #include <limits>
38 #include <map>
39 #include <memory>
40 #include <mutex>
41 #include <set>
42 #include <string>
43 #include <utility>
44 #include <variant>
45 
46 #include "failsafeloggers/failsafe_logger.cpp"
47 
48 namespace pid_control
49 {
50 
51 std::unique_ptr<ReadInterface> DbusPassive::createDbusPassive(
52     sdbusplus::bus_t& bus, const std::string& type, const std::string& id,
53     std::unique_ptr<DbusHelperInterface> helper, const conf::SensorConfig* info,
54     const std::shared_ptr<DbusPassiveRedundancy>& redundancy)
55 {
56     if (helper == nullptr)
57     {
58         return nullptr;
59     }
60     if (!validType(type))
61     {
62         return nullptr;
63     }
64 
65     /* Need to get the scale and initial value */
66     /* service == busname */
67     std::string path;
68     if (info->readPath.empty())
69     {
70         path = getSensorPath(type, id);
71     }
72     else
73     {
74         path = info->readPath;
75     }
76 
77     SensorProperties settings;
78     bool failed;
79     std::string service;
80 
81     try
82     {
83         service = helper->getService(sensorintf, path);
84     }
85     catch (const std::exception& e)
86     {
87 #ifndef HANDLE_MISSING_OBJECT_PATHS
88         return nullptr;
89 #else
90         // CASE1: The sensor is not on DBus, but as it is not in the
91         // MissingIsAcceptable list, the sensor should be built with a failed
92         // state to send the zone to failsafe mode. Everything will recover if
93         // all important sensors are back to DBus. swampd will be informed
94         // through InterfacesAdded signals and the sensors will be built again.
95 
96         // CASE2: The sensor is on D-Bus (getService succeeds) but getProperties
97         // fails (e.g., D-Bus error or property fetch failure). In this case,
98         // handle-missing-object-paths does not apply. The sensor build fails,
99         // and the control loop will keep restarting until getProperties
100         // succeeds.
101 
102         // Only CASE1 may send the zone to failsafe mode if the sensor is not
103         // in MissingIsAcceptable. CASE2 results in continuous restart until
104         // recovery.
105 
106         failed = true;
107         settings.value = std::numeric_limits<double>::quiet_NaN();
108         settings.unit = getSensorUnit(type);
109         settings.available = false;
110         settings.unavailableAsFailed = true;
111         if (info->ignoreDbusMinMax)
112         {
113             settings.min = 0;
114             settings.max = 0;
115         }
116         std::cerr << "DbusPassive: Sensor " << path
117                   << " is missing from D-Bus, build this sensor as failed\n";
118         return std::make_unique<DbusPassive>(
119             bus, type, id, std::move(helper), settings, failed, path,
120             redundancy);
121 #endif
122     }
123 
124     try
125     {
126         helper->getProperties(service, path, &settings);
127         failed = helper->thresholdsAsserted(service, path);
128     }
129     catch (const std::exception& e)
130     {
131         return nullptr;
132     }
133 
134     /* if these values are zero, they're ignored. */
135     if (info->ignoreDbusMinMax)
136     {
137         settings.min = 0;
138         settings.max = 0;
139     }
140 
141     settings.unavailableAsFailed = info->unavailableAsFailed;
142 
143     return std::make_unique<DbusPassive>(bus, type, id, std::move(helper),
144                                          settings, failed, path, redundancy);
145 }
146 
147 DbusPassive::DbusPassive(
148     sdbusplus::bus_t& bus, const std::string& type, const std::string& id,
149     std::unique_ptr<DbusHelperInterface> helper,
150     const SensorProperties& settings, bool failed, const std::string& path,
151     const std::shared_ptr<DbusPassiveRedundancy>& redundancy) :
152     ReadInterface(), _signal(bus, getMatch(path), dbusHandleSignal, this),
153     _id(id), _helper(std::move(helper)), _failed(failed), path(path),
154     redundancy(redundancy)
155 
156 {
157     _scale = settings.scale;
158     _min = settings.min * std::pow(10.0, _scale);
159     _max = settings.max * std::pow(10.0, _scale);
160     _available = settings.available;
161     _unavailableAsFailed = settings.unavailableAsFailed;
162 
163     // Cache this type knowledge, to avoid repeated string comparison
164     _typeMargin = (type == "margin");
165     _typeFan = (type == "fan");
166 
167     // Force value to be stored, otherwise member would be uninitialized
168     updateValue(settings.value, true);
169 }
170 
171 ReadReturn DbusPassive::read(void)
172 {
173     std::lock_guard<std::mutex> guard(_lock);
174 
175     ReadReturn r = {_value, _updated, _unscaled};
176 
177     return r;
178 }
179 
180 void DbusPassive::setValue(double value, double unscaled)
181 {
182     std::lock_guard<std::mutex> guard(_lock);
183 
184     _value = value;
185     _unscaled = unscaled;
186     _updated = std::chrono::high_resolution_clock::now();
187 }
188 
189 void DbusPassive::setValue(double value)
190 {
191     // First param is scaled, second param is unscaled, assume same here
192     setValue(value, value);
193 }
194 
195 bool DbusPassive::getFailed(void) const
196 {
197     if (redundancy)
198     {
199         const std::set<std::string>& failures = redundancy->getFailed();
200         if (failures.find(path) != failures.end())
201         {
202             outputFailsafeLogWithSensor(_id, true, _id,
203                                         "The sensor path is marked redundant.");
204             return true;
205         }
206     }
207 
208     /*
209      * Unavailable thermal sensors, who are not present or
210      * power-state-not-matching, should not trigger the failSafe mode. For
211      * example, when a system stays at a powered-off state, its CPU Temp
212      * sensors will be unavailable, these unavailable sensors should not be
213      * treated as failed and trigger failSafe.
214      * This is important for systems whose Fans are always on.
215      */
216     if (!_typeFan && !_available && !_unavailableAsFailed)
217     {
218         return false;
219     }
220 
221     // If a reading has came in,
222     // but its value bad in some way (determined by sensor type),
223     // indicate this sensor has failed,
224     // until another value comes in that is no longer bad.
225     // This is different from the overall _failed flag,
226     // which is set and cleared by other causes.
227     if (_badReading)
228     {
229         outputFailsafeLogWithSensor(_id, true, _id,
230                                     "The sensor has bad readings.");
231         return true;
232     }
233 
234     // If a reading has came in, and it is not a bad reading,
235     // but it indicates there is no more thermal margin left,
236     // that is bad, something is wrong with the PID loops,
237     // they are not cooling the system, enable failsafe mode also.
238     if (_marginHot)
239     {
240         outputFailsafeLogWithSensor(_id, true, _id,
241                                     "The sensor has no thermal margin left.");
242         return true;
243     }
244 
245     if (_failed)
246     {
247         outputFailsafeLogWithSensor(
248             _id, true, _id, "The sensor has failed with a critical issue.");
249         return true;
250     }
251 
252     if (!_available)
253     {
254         outputFailsafeLogWithSensor(_id, true, _id,
255                                     "The sensor is unavailable.");
256         return true;
257     }
258 
259     if (!_functional)
260     {
261         outputFailsafeLogWithSensor(_id, true, _id,
262                                     "The sensor is not functional.");
263         return true;
264     }
265 
266     outputFailsafeLogWithSensor(_id, false, _id, "The sensor has recovered.");
267 
268     return false;
269 }
270 
271 std::string DbusPassive::getFailReason(void) const
272 {
273     if (_badReading)
274     {
275         return "Sensor reading bad";
276     }
277     if (_marginHot)
278     {
279         return "Margin hot";
280     }
281     if (_failed)
282     {
283         return "Sensor threshold asserted";
284     }
285     if (!_available)
286     {
287         return "Sensor unavailable";
288     }
289     if (!_functional)
290     {
291         return "Sensor not functional";
292     }
293     return "Unknown";
294 }
295 
296 void DbusPassive::setFailed(bool value)
297 {
298     _failed = value;
299 }
300 
301 void DbusPassive::setFunctional(bool value)
302 {
303     _functional = value;
304 }
305 
306 void DbusPassive::setAvailable(bool value)
307 {
308     _available = value;
309 }
310 
311 int64_t DbusPassive::getScale(void)
312 {
313     return _scale;
314 }
315 
316 std::string DbusPassive::getID(void)
317 {
318     return _id;
319 }
320 
321 double DbusPassive::getMax(void)
322 {
323     return _max;
324 }
325 
326 double DbusPassive::getMin(void)
327 {
328     return _min;
329 }
330 
331 void DbusPassive::updateValue(double value, bool force)
332 {
333     _badReading = false;
334 
335     // Do not let a NAN, or other floating-point oddity, be used to update
336     // the value, as that indicates the sensor has no valid reading.
337     if (!(std::isfinite(value)))
338     {
339         _badReading = true;
340 
341         // Do not continue with a bad reading, unless caller forcing
342         if (!force)
343         {
344             return;
345         }
346     }
347 
348     value *= std::pow(10.0, _scale);
349 
350     auto unscaled = value;
351     scaleSensorReading(_min, _max, value);
352 
353     if (_typeMargin)
354     {
355         _marginHot = false;
356 
357         // Unlike an absolute temperature sensor,
358         // where 0 degrees C is a good reading,
359         // a value received of 0 (or negative) margin is worrisome,
360         // and should be flagged.
361         // Either it indicates margin not calculated properly,
362         // or somebody forgot to set the margin-zero setpoint,
363         // or the system is really overheating that much.
364         // This is a different condition from _failed
365         // and _badReading, so it merits its own flag.
366         // The sensor has not failed, the reading is good, but the zone
367         // still needs to know that it should go to failsafe mode.
368         if (unscaled <= 0.0)
369         {
370             _marginHot = true;
371         }
372     }
373 
374     setValue(value, unscaled);
375 }
376 
377 int handleSensorValue(sdbusplus::message_t& msg, DbusPassive* owner)
378 {
379     std::string msgSensor;
380     std::map<std::string, std::variant<int64_t, double, bool>> msgData;
381 
382     msg.read(msgSensor, msgData);
383 
384     if (msgSensor == "xyz.openbmc_project.Sensor.Value")
385     {
386         auto valPropMap = msgData.find("Value");
387         if (valPropMap != msgData.end())
388         {
389             double value =
390                 std::visit(VariantToDoubleVisitor(), valPropMap->second);
391 
392             owner->updateValue(value, false);
393         }
394     }
395     else if (msgSensor == "xyz.openbmc_project.Sensor.Threshold.Critical")
396     {
397         auto criticalAlarmLow = msgData.find("CriticalAlarmLow");
398         auto criticalAlarmHigh = msgData.find("CriticalAlarmHigh");
399         if (criticalAlarmHigh == msgData.end() &&
400             criticalAlarmLow == msgData.end())
401         {
402             return 0;
403         }
404 
405         bool asserted = false;
406         if (criticalAlarmLow != msgData.end())
407         {
408             asserted = std::get<bool>(criticalAlarmLow->second);
409         }
410 
411         // checking both as in theory you could de-assert one threshold and
412         // assert the other at the same moment
413         if (!asserted && criticalAlarmHigh != msgData.end())
414         {
415             asserted = std::get<bool>(criticalAlarmHigh->second);
416         }
417         owner->setFailed(asserted);
418     }
419 #ifdef UNC_FAILSAFE
420     else if (msgSensor == "xyz.openbmc_project.Sensor.Threshold.Warning")
421     {
422         auto warningAlarmHigh = msgData.find("WarningAlarmHigh");
423         if (warningAlarmHigh == msgData.end())
424         {
425             return 0;
426         }
427 
428         bool asserted = false;
429         if (warningAlarmHigh != msgData.end())
430         {
431             asserted = std::get<bool>(warningAlarmHigh->second);
432         }
433         owner->setFailed(asserted);
434     }
435 #endif
436     else if (msgSensor == "xyz.openbmc_project.State.Decorator.Availability")
437     {
438         auto available = msgData.find("Available");
439         if (available == msgData.end())
440         {
441             return 0;
442         }
443         bool asserted = std::get<bool>(available->second);
444         owner->setAvailable(asserted);
445         if (!asserted)
446         {
447             // A thermal controller will continue its PID calculation and not
448             // trigger a 'failsafe' when some inputs are unavailable.
449             // So, forced to clear the value here to prevent a historical
450             // value to participate in a latter PID calculation.
451             owner->updateValue(std::numeric_limits<double>::quiet_NaN(), true);
452         }
453     }
454     else if (msgSensor ==
455              "xyz.openbmc_project.State.Decorator.OperationalStatus")
456     {
457         auto functional = msgData.find("Functional");
458         if (functional == msgData.end())
459         {
460             return 0;
461         }
462         bool asserted = std::get<bool>(functional->second);
463         owner->setFunctional(asserted);
464     }
465 
466     return 0;
467 }
468 
469 int dbusHandleSignal(sd_bus_message* msg, void* usrData,
470                      [[maybe_unused]] sd_bus_error* err)
471 {
472     auto sdbpMsg = sdbusplus::message_t(msg);
473     DbusPassive* obj = static_cast<DbusPassive*>(usrData);
474 
475     return handleSensorValue(sdbpMsg, obj);
476 }
477 
478 } // namespace pid_control
479