1 // SPDX-License-Identifier: Apache-2.0
2 // SPDX-FileCopyrightText: Copyright 2017 Google Inc
3
4 #include "config.h"
5
6 #include "dbuspassive.hpp"
7
8 #include "conf.hpp"
9 #include "dbushelper_interface.hpp"
10 #include "dbuspassiveredundancy.hpp"
11 #include "dbusutil.hpp"
12 #include "failsafeloggers/failsafe_logger_utility.hpp"
13 #include "interfaces.hpp"
14 #include "util.hpp"
15
16 #include <systemd/sd-bus.h>
17
18 #include <sdbusplus/bus.hpp>
19 #include <sdbusplus/message.hpp>
20 #include <xyz/openbmc_project/Sensor/Threshold/Critical/common.hpp>
21 #include <xyz/openbmc_project/Sensor/Threshold/Warning/common.hpp>
22 #include <xyz/openbmc_project/Sensor/Value/client.hpp>
23 #include <xyz/openbmc_project/State/Decorator/Availability/common.hpp>
24 #include <xyz/openbmc_project/State/Decorator/OperationalStatus/common.hpp>
25
26 #include <chrono>
27 #include <cmath>
28 #include <cstdint>
29 #include <exception>
30 #include <limits>
31 #include <map>
32 #include <memory>
33 #include <mutex>
34 #include <set>
35 #include <string>
36 #include <utility>
37 #include <variant>
38
39 #include "failsafeloggers/failsafe_logger.cpp"
40
41 using SensorValue = sdbusplus::common::xyz::openbmc_project::sensor::Value;
42 using SensorThresholdWarning =
43 sdbusplus::common::xyz::openbmc_project::sensor::threshold::Warning;
44 using SensorThresholdCritical =
45 sdbusplus::common::xyz::openbmc_project::sensor::threshold::Critical;
46 using StateDecoratorAvailability =
47 sdbusplus::common::xyz::openbmc_project::state::decorator::Availability;
48 using StateDecoratorOperationalStatus = sdbusplus::common::xyz::
49 openbmc_project::state::decorator::OperationalStatus;
50
51 namespace pid_control
52 {
53
createDbusPassive(sdbusplus::bus_t & bus,const std::string & type,const std::string & id,std::unique_ptr<DbusHelperInterface> helper,const conf::SensorConfig * info,const std::shared_ptr<DbusPassiveRedundancy> & redundancy)54 std::unique_ptr<ReadInterface> DbusPassive::createDbusPassive(
55 sdbusplus::bus_t& bus, const std::string& type, const std::string& id,
56 std::unique_ptr<DbusHelperInterface> helper, const conf::SensorConfig* info,
57 const std::shared_ptr<DbusPassiveRedundancy>& redundancy)
58 {
59 if (helper == nullptr)
60 {
61 return nullptr;
62 }
63 if (!validType(type))
64 {
65 return nullptr;
66 }
67
68 /* Need to get the scale and initial value */
69 /* service == busname */
70 std::string path;
71 if (info->readPath.empty())
72 {
73 path = getSensorPath(type, id);
74 }
75 else
76 {
77 path = info->readPath;
78 }
79
80 SensorProperties settings;
81 bool failed;
82 bool objectMissing = false;
83 std::string service;
84
85 try
86 {
87 service = helper->getService(SensorValue::interface, path);
88 }
89 catch (const std::exception& e)
90 {
91 #ifndef HANDLE_MISSING_OBJECT_PATHS
92 return nullptr;
93 #else
94 // CASE1: The sensor is not on DBus, but as it is not in the
95 // MissingIsAcceptable list, the sensor should be built with a failed
96 // state to send the zone to failsafe mode. Everything will recover if
97 // all important sensors are back to DBus. swampd will be informed
98 // through InterfacesAdded signals and the sensors will be built again.
99
100 // CASE2: The sensor is on D-Bus (getService succeeds) but getProperties
101 // fails (e.g., D-Bus error or property fetch failure). In this case,
102 // handle-missing-object-paths does not apply. The sensor build fails,
103 // and the control loop will keep restarting until getProperties
104 // succeeds.
105
106 // Only CASE1 may send the zone to failsafe mode if the sensor is not
107 // in MissingIsAcceptable. CASE2 results in continuous restart until
108 // recovery.
109 objectMissing = true;
110 auto sensor = std::make_unique<DbusPassive>(
111 bus, type, id, std::move(helper), objectMissing, path, redundancy);
112 failed = true;
113 settings.value = std::numeric_limits<double>::quiet_NaN();
114 settings.unit = getSensorUnit(type);
115 settings.available = false;
116 settings.unavailableAsFailed = true;
117 if (info->ignoreDbusMinMax)
118 {
119 settings.min = 0;
120 settings.max = 0;
121 }
122 sensor->initFromSettings(settings, true);
123 std::cerr << "DbusPassive: Sensor " << path
124 << " is missing from D-Bus, build this sensor as failed\n";
125 return sensor;
126 #endif
127 }
128
129 auto sensor = std::make_unique<DbusPassive>(
130 bus, type, id, std::move(helper), objectMissing, path, redundancy);
131
132 try
133 {
134 sensor->_helper->getProperties(service, path, &settings);
135 failed = sensor->_helper->thresholdsAsserted(service, path);
136 }
137 catch (const std::exception& e)
138 {
139 return nullptr;
140 }
141
142 /* if these values are zero, they're ignored. */
143 if (info->ignoreDbusMinMax)
144 {
145 settings.min = 0;
146 settings.max = 0;
147 }
148
149 settings.unavailableAsFailed = info->unavailableAsFailed;
150 sensor->initFromSettings(settings, failed);
151
152 return sensor;
153 }
154
DbusPassive(sdbusplus::bus_t & bus,const std::string & type,const std::string & id,std::unique_ptr<DbusHelperInterface> helper,bool objectMissing,const std::string & path,const std::shared_ptr<DbusPassiveRedundancy> & redundancy)155 DbusPassive::DbusPassive(
156 sdbusplus::bus_t& bus, const std::string& type, const std::string& id,
157 std::unique_ptr<DbusHelperInterface> helper, bool objectMissing,
158 const std::string& path,
159 const std::shared_ptr<DbusPassiveRedundancy>& redundancy) :
160 ReadInterface(), _signal(bus, getMatch(path), dbusHandleSignal, this),
161 _id(id), _helper(std::move(helper)), _objectMissing(objectMissing),
162 path(path), redundancy(redundancy)
163
164 {
165 // Cache this type knowledge, to avoid repeated string comparison
166 _typeMargin = (type == "margin");
167 _typeFan = (type == "fan");
168 }
169
read(void)170 ReadReturn DbusPassive::read(void)
171 {
172 std::lock_guard<std::mutex> guard(_lock);
173
174 ReadReturn r = {_value, _updated, _unscaled};
175
176 return r;
177 }
178
setValue(double value,double unscaled)179 void DbusPassive::setValue(double value, double unscaled)
180 {
181 std::lock_guard<std::mutex> guard(_lock);
182
183 _value = value;
184 _unscaled = unscaled;
185 _updated = std::chrono::high_resolution_clock::now();
186 }
187
setValue(double value)188 void DbusPassive::setValue(double value)
189 {
190 // First param is scaled, second param is unscaled, assume same here
191 setValue(value, value);
192 }
193
getFailed(void) const194 bool DbusPassive::getFailed(void) const
195 {
196 if (redundancy)
197 {
198 const std::set<std::string>& failures = redundancy->getFailed();
199 if (failures.find(path) != failures.end())
200 {
201 outputFailsafeLogWithSensor(_id, true, _id,
202 "The sensor path is marked redundant.");
203 return true;
204 }
205 }
206
207 /*
208 * If handle-missing-object-paths is enabled, and the expected D-Bus object
209 * path is not exported, this sensor is created to represent that condition.
210 * Indicate this sensor has failed so the zone enters failSafe mode.
211 */
212 if (_objectMissing)
213 {
214 outputFailsafeLogWithSensor(_id, true, _id,
215 "The sensor D-Bus object is missing.");
216 return true;
217 }
218
219 /*
220 * Unavailable thermal sensors, who are not present or
221 * power-state-not-matching, should not trigger the failSafe mode. For
222 * example, when a system stays at a powered-off state, its CPU Temp
223 * sensors will be unavailable, these unavailable sensors should not be
224 * treated as failed and trigger failSafe.
225 * This is important for systems whose Fans are always on.
226 */
227 if (!_typeFan && !_available && !_unavailableAsFailed)
228 {
229 return false;
230 }
231
232 // If a reading has came in,
233 // but its value bad in some way (determined by sensor type),
234 // indicate this sensor has failed,
235 // until another value comes in that is no longer bad.
236 // This is different from the overall _failed flag,
237 // which is set and cleared by other causes.
238 if (_badReading)
239 {
240 outputFailsafeLogWithSensor(_id, true, _id,
241 "The sensor has bad readings.");
242 return true;
243 }
244
245 // If a reading has came in, and it is not a bad reading,
246 // but it indicates there is no more thermal margin left,
247 // that is bad, something is wrong with the PID loops,
248 // they are not cooling the system, enable failsafe mode also.
249 if (_marginHot)
250 {
251 outputFailsafeLogWithSensor(_id, true, _id,
252 "The sensor has no thermal margin left.");
253 return true;
254 }
255
256 if (_failed)
257 {
258 outputFailsafeLogWithSensor(
259 _id, true, _id, "The sensor has failed with a critical issue.");
260 return true;
261 }
262
263 if (!_available)
264 {
265 outputFailsafeLogWithSensor(_id, true, _id,
266 "The sensor is unavailable.");
267 return true;
268 }
269
270 if (!_functional)
271 {
272 outputFailsafeLogWithSensor(_id, true, _id,
273 "The sensor is not functional.");
274 return true;
275 }
276
277 outputFailsafeLogWithSensor(_id, false, _id, "The sensor has recovered.");
278
279 return false;
280 }
281
getFailReason(void) const282 std::string DbusPassive::getFailReason(void) const
283 {
284 if (_objectMissing)
285 {
286 return "Sensor D-Bus object missing";
287 }
288 if (_badReading)
289 {
290 return "Sensor reading bad";
291 }
292 if (_marginHot)
293 {
294 return "Margin hot";
295 }
296 if (_failed)
297 {
298 return "Sensor threshold asserted";
299 }
300 if (!_available)
301 {
302 return "Sensor unavailable";
303 }
304 if (!_functional)
305 {
306 return "Sensor not functional";
307 }
308 return "Unknown";
309 }
310
setFailed(bool value)311 void DbusPassive::setFailed(bool value)
312 {
313 _failed = value;
314 }
315
setFunctional(bool value)316 void DbusPassive::setFunctional(bool value)
317 {
318 _functional = value;
319 }
320
setAvailable(bool value)321 void DbusPassive::setAvailable(bool value)
322 {
323 _available = value;
324 _availableOverridden = true;
325 }
326
initFromSettings(const SensorProperties & settings,bool failed)327 void DbusPassive::initFromSettings(const SensorProperties& settings,
328 bool failed)
329 {
330 _failed = failed;
331 _scale = settings.scale;
332 _min = settings.min * std::pow(10.0, _scale);
333 _max = settings.max * std::pow(10.0, _scale);
334 _unavailableAsFailed = settings.unavailableAsFailed;
335 setAvailableFromProperty(settings.available);
336
337 // Force value to be stored, otherwise member would be uninitialized
338 updateValue(settings.value, true);
339 }
340
setAvailableFromProperty(bool value)341 void DbusPassive::setAvailableFromProperty(bool value)
342 {
343 if (!_availableOverridden)
344 {
345 _available = value;
346 }
347 }
348
getScale(void)349 int64_t DbusPassive::getScale(void)
350 {
351 return _scale;
352 }
353
getID(void)354 std::string DbusPassive::getID(void)
355 {
356 return _id;
357 }
358
getMax(void)359 double DbusPassive::getMax(void)
360 {
361 return _max;
362 }
363
getMin(void)364 double DbusPassive::getMin(void)
365 {
366 return _min;
367 }
368
updateValue(double value,bool force)369 void DbusPassive::updateValue(double value, bool force)
370 {
371 _badReading = false;
372
373 // Do not let a NAN, or other floating-point oddity, be used to update
374 // the value, as that indicates the sensor has no valid reading.
375 if (!(std::isfinite(value)))
376 {
377 _badReading = true;
378
379 // Do not continue with a bad reading, unless caller forcing
380 if (!force)
381 {
382 return;
383 }
384 }
385
386 value *= std::pow(10.0, _scale);
387
388 auto unscaled = value;
389 scaleSensorReading(_min, _max, value);
390
391 if (_typeMargin)
392 {
393 _marginHot = false;
394
395 // Unlike an absolute temperature sensor,
396 // where 0 degrees C is a good reading,
397 // a value received of 0 (or negative) margin is worrisome,
398 // and should be flagged.
399 // Either it indicates margin not calculated properly,
400 // or somebody forgot to set the margin-zero setpoint,
401 // or the system is really overheating that much.
402 // This is a different condition from _failed
403 // and _badReading, so it merits its own flag.
404 // The sensor has not failed, the reading is good, but the zone
405 // still needs to know that it should go to failsafe mode.
406 if (unscaled <= 0.0)
407 {
408 _marginHot = true;
409 }
410 }
411
412 setValue(value, unscaled);
413 }
414
handleSensorValue(sdbusplus::message_t & msg,DbusPassive * owner)415 int handleSensorValue(sdbusplus::message_t& msg, DbusPassive* owner)
416 {
417 std::string msgSensor;
418 std::map<std::string, std::variant<int64_t, double, bool>> msgData;
419
420 msg.read(msgSensor, msgData);
421
422 if (msgSensor == SensorValue::interface)
423 {
424 auto valPropMap = msgData.find(SensorValue::property_names::value);
425 if (valPropMap != msgData.end())
426 {
427 double value =
428 std::visit(VariantToDoubleVisitor(), valPropMap->second);
429
430 owner->updateValue(value, false);
431 }
432 }
433 else if (msgSensor == SensorThresholdCritical::interface)
434 {
435 auto criticalAlarmLow = msgData.find(
436 SensorThresholdCritical::property_names::critical_alarm_low);
437 auto criticalAlarmHigh = msgData.find(
438 SensorThresholdCritical::property_names::critical_alarm_high);
439 if (criticalAlarmHigh == msgData.end() &&
440 criticalAlarmLow == msgData.end())
441 {
442 return 0;
443 }
444
445 bool asserted = false;
446 if (criticalAlarmLow != msgData.end())
447 {
448 asserted = std::get<bool>(criticalAlarmLow->second);
449 }
450
451 // checking both as in theory you could de-assert one threshold and
452 // assert the other at the same moment
453 if (!asserted && criticalAlarmHigh != msgData.end())
454 {
455 asserted = std::get<bool>(criticalAlarmHigh->second);
456 }
457 owner->setFailed(asserted);
458 }
459 #ifdef UNC_FAILSAFE
460 else if (msgSensor == SensorThresholdWarning::interface)
461 {
462 auto warningAlarmHigh = msgData.find(
463 SensorThresholdWarning::property_names::warning_alarm_high);
464 if (warningAlarmHigh == msgData.end())
465 {
466 return 0;
467 }
468
469 bool asserted = false;
470 if (warningAlarmHigh != msgData.end())
471 {
472 asserted = std::get<bool>(warningAlarmHigh->second);
473 }
474 owner->setFailed(asserted);
475 }
476 #endif
477 else if (msgSensor == StateDecoratorAvailability::interface)
478 {
479 auto available =
480 msgData.find(StateDecoratorAvailability::property_names::available);
481 if (available == msgData.end())
482 {
483 return 0;
484 }
485 bool asserted = std::get<bool>(available->second);
486 owner->setAvailable(asserted);
487 if (!asserted)
488 {
489 // A thermal controller will continue its PID calculation and not
490 // trigger a 'failsafe' when some inputs are unavailable.
491 // So, forced to clear the value here to prevent a historical
492 // value to participate in a latter PID calculation.
493 owner->updateValue(std::numeric_limits<double>::quiet_NaN(), true);
494 }
495 }
496 else if (msgSensor == StateDecoratorOperationalStatus::interface)
497 {
498 auto functional = msgData.find(
499 StateDecoratorOperationalStatus::property_names::functional);
500 if (functional == msgData.end())
501 {
502 return 0;
503 }
504 bool asserted = std::get<bool>(functional->second);
505 owner->setFunctional(asserted);
506 }
507
508 return 0;
509 }
510
dbusHandleSignal(sd_bus_message * msg,void * usrData,sd_bus_error * err)511 int dbusHandleSignal(sd_bus_message* msg, void* usrData,
512 [[maybe_unused]] sd_bus_error* err)
513 {
514 auto sdbpMsg = sdbusplus::message_t(msg);
515 DbusPassive* obj = static_cast<DbusPassive*>(usrData);
516
517 return handleSensorValue(sdbpMsg, obj);
518 }
519
520 } // namespace pid_control
521