1 /**
2 * Copyright 2017 Google Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "config.h"
17
18 #include "dbuspassive.hpp"
19
20 #include "conf.hpp"
21 #include "dbushelper_interface.hpp"
22 #include "dbuspassiveredundancy.hpp"
23 #include "dbusutil.hpp"
24 #include "failsafeloggers/failsafe_logger_utility.hpp"
25 #include "interfaces.hpp"
26 #include "util.hpp"
27
28 #include <systemd/sd-bus.h>
29
30 #include <sdbusplus/bus.hpp>
31 #include <sdbusplus/message.hpp>
32
33 #include <chrono>
34 #include <cmath>
35 #include <cstdint>
36 #include <exception>
37 #include <limits>
38 #include <map>
39 #include <memory>
40 #include <mutex>
41 #include <set>
42 #include <string>
43 #include <utility>
44 #include <variant>
45
46 #include "failsafeloggers/failsafe_logger.cpp"
47
48 namespace pid_control
49 {
50
createDbusPassive(sdbusplus::bus_t & bus,const std::string & type,const std::string & id,std::unique_ptr<DbusHelperInterface> helper,const conf::SensorConfig * info,const std::shared_ptr<DbusPassiveRedundancy> & redundancy)51 std::unique_ptr<ReadInterface> DbusPassive::createDbusPassive(
52 sdbusplus::bus_t& bus, const std::string& type, const std::string& id,
53 std::unique_ptr<DbusHelperInterface> helper, const conf::SensorConfig* info,
54 const std::shared_ptr<DbusPassiveRedundancy>& redundancy)
55 {
56 if (helper == nullptr)
57 {
58 return nullptr;
59 }
60 if (!validType(type))
61 {
62 return nullptr;
63 }
64
65 /* Need to get the scale and initial value */
66 /* service == busname */
67 std::string path;
68 if (info->readPath.empty())
69 {
70 path = getSensorPath(type, id);
71 }
72 else
73 {
74 path = info->readPath;
75 }
76
77 SensorProperties settings;
78 bool failed;
79 bool objectMissing = false;
80 std::string service;
81
82 try
83 {
84 service = helper->getService(sensorintf, path);
85 }
86 catch (const std::exception& e)
87 {
88 #ifndef HANDLE_MISSING_OBJECT_PATHS
89 return nullptr;
90 #else
91 // CASE1: The sensor is not on DBus, but as it is not in the
92 // MissingIsAcceptable list, the sensor should be built with a failed
93 // state to send the zone to failsafe mode. Everything will recover if
94 // all important sensors are back to DBus. swampd will be informed
95 // through InterfacesAdded signals and the sensors will be built again.
96
97 // CASE2: The sensor is on D-Bus (getService succeeds) but getProperties
98 // fails (e.g., D-Bus error or property fetch failure). In this case,
99 // handle-missing-object-paths does not apply. The sensor build fails,
100 // and the control loop will keep restarting until getProperties
101 // succeeds.
102
103 // Only CASE1 may send the zone to failsafe mode if the sensor is not
104 // in MissingIsAcceptable. CASE2 results in continuous restart until
105 // recovery.
106
107 failed = true;
108 objectMissing = true;
109 settings.value = std::numeric_limits<double>::quiet_NaN();
110 settings.unit = getSensorUnit(type);
111 settings.available = false;
112 settings.unavailableAsFailed = true;
113 if (info->ignoreDbusMinMax)
114 {
115 settings.min = 0;
116 settings.max = 0;
117 }
118 std::cerr << "DbusPassive: Sensor " << path
119 << " is missing from D-Bus, build this sensor as failed\n";
120 return std::make_unique<DbusPassive>(
121 bus, type, id, std::move(helper), settings, failed, objectMissing,
122 path, redundancy);
123 #endif
124 }
125
126 try
127 {
128 helper->getProperties(service, path, &settings);
129 failed = helper->thresholdsAsserted(service, path);
130 }
131 catch (const std::exception& e)
132 {
133 return nullptr;
134 }
135
136 /* if these values are zero, they're ignored. */
137 if (info->ignoreDbusMinMax)
138 {
139 settings.min = 0;
140 settings.max = 0;
141 }
142
143 settings.unavailableAsFailed = info->unavailableAsFailed;
144
145 return std::make_unique<DbusPassive>(
146 bus, type, id, std::move(helper), settings, failed, objectMissing, path,
147 redundancy);
148 }
149
DbusPassive(sdbusplus::bus_t & bus,const std::string & type,const std::string & id,std::unique_ptr<DbusHelperInterface> helper,const SensorProperties & settings,bool failed,bool objectMissing,const std::string & path,const std::shared_ptr<DbusPassiveRedundancy> & redundancy)150 DbusPassive::DbusPassive(
151 sdbusplus::bus_t& bus, const std::string& type, const std::string& id,
152 std::unique_ptr<DbusHelperInterface> helper,
153 const SensorProperties& settings, bool failed, bool objectMissing,
154 const std::string& path,
155 const std::shared_ptr<DbusPassiveRedundancy>& redundancy) :
156 ReadInterface(), _signal(bus, getMatch(path), dbusHandleSignal, this),
157 _id(id), _helper(std::move(helper)), _failed(failed),
158 _objectMissing(objectMissing), path(path), redundancy(redundancy)
159
160 {
161 _scale = settings.scale;
162 _min = settings.min * std::pow(10.0, _scale);
163 _max = settings.max * std::pow(10.0, _scale);
164 _available = settings.available;
165 _unavailableAsFailed = settings.unavailableAsFailed;
166
167 // Cache this type knowledge, to avoid repeated string comparison
168 _typeMargin = (type == "margin");
169 _typeFan = (type == "fan");
170
171 // Force value to be stored, otherwise member would be uninitialized
172 updateValue(settings.value, true);
173 }
174
read(void)175 ReadReturn DbusPassive::read(void)
176 {
177 std::lock_guard<std::mutex> guard(_lock);
178
179 ReadReturn r = {_value, _updated, _unscaled};
180
181 return r;
182 }
183
setValue(double value,double unscaled)184 void DbusPassive::setValue(double value, double unscaled)
185 {
186 std::lock_guard<std::mutex> guard(_lock);
187
188 _value = value;
189 _unscaled = unscaled;
190 _updated = std::chrono::high_resolution_clock::now();
191 }
192
setValue(double value)193 void DbusPassive::setValue(double value)
194 {
195 // First param is scaled, second param is unscaled, assume same here
196 setValue(value, value);
197 }
198
getFailed(void) const199 bool DbusPassive::getFailed(void) const
200 {
201 if (redundancy)
202 {
203 const std::set<std::string>& failures = redundancy->getFailed();
204 if (failures.find(path) != failures.end())
205 {
206 outputFailsafeLogWithSensor(_id, true, _id,
207 "The sensor path is marked redundant.");
208 return true;
209 }
210 }
211
212 /*
213 * If handle-missing-object-paths is enabled, and the expected D-Bus object
214 * path is not exported, this sensor is created to represent that condition.
215 * Indicate this sensor has failed so the zone enters failSafe mode.
216 */
217 if (_objectMissing)
218 {
219 outputFailsafeLogWithSensor(_id, true, _id,
220 "The sensor D-Bus object is missing.");
221 return true;
222 }
223
224 /*
225 * Unavailable thermal sensors, who are not present or
226 * power-state-not-matching, should not trigger the failSafe mode. For
227 * example, when a system stays at a powered-off state, its CPU Temp
228 * sensors will be unavailable, these unavailable sensors should not be
229 * treated as failed and trigger failSafe.
230 * This is important for systems whose Fans are always on.
231 */
232 if (!_typeFan && !_available && !_unavailableAsFailed)
233 {
234 return false;
235 }
236
237 // If a reading has came in,
238 // but its value bad in some way (determined by sensor type),
239 // indicate this sensor has failed,
240 // until another value comes in that is no longer bad.
241 // This is different from the overall _failed flag,
242 // which is set and cleared by other causes.
243 if (_badReading)
244 {
245 outputFailsafeLogWithSensor(_id, true, _id,
246 "The sensor has bad readings.");
247 return true;
248 }
249
250 // If a reading has came in, and it is not a bad reading,
251 // but it indicates there is no more thermal margin left,
252 // that is bad, something is wrong with the PID loops,
253 // they are not cooling the system, enable failsafe mode also.
254 if (_marginHot)
255 {
256 outputFailsafeLogWithSensor(_id, true, _id,
257 "The sensor has no thermal margin left.");
258 return true;
259 }
260
261 if (_failed)
262 {
263 outputFailsafeLogWithSensor(
264 _id, true, _id, "The sensor has failed with a critical issue.");
265 return true;
266 }
267
268 if (!_available)
269 {
270 outputFailsafeLogWithSensor(_id, true, _id,
271 "The sensor is unavailable.");
272 return true;
273 }
274
275 if (!_functional)
276 {
277 outputFailsafeLogWithSensor(_id, true, _id,
278 "The sensor is not functional.");
279 return true;
280 }
281
282 outputFailsafeLogWithSensor(_id, false, _id, "The sensor has recovered.");
283
284 return false;
285 }
286
getFailReason(void) const287 std::string DbusPassive::getFailReason(void) const
288 {
289 if (_objectMissing)
290 {
291 return "Sensor D-Bus object missing";
292 }
293 if (_badReading)
294 {
295 return "Sensor reading bad";
296 }
297 if (_marginHot)
298 {
299 return "Margin hot";
300 }
301 if (_failed)
302 {
303 return "Sensor threshold asserted";
304 }
305 if (!_available)
306 {
307 return "Sensor unavailable";
308 }
309 if (!_functional)
310 {
311 return "Sensor not functional";
312 }
313 return "Unknown";
314 }
315
setFailed(bool value)316 void DbusPassive::setFailed(bool value)
317 {
318 _failed = value;
319 }
320
setFunctional(bool value)321 void DbusPassive::setFunctional(bool value)
322 {
323 _functional = value;
324 }
325
setAvailable(bool value)326 void DbusPassive::setAvailable(bool value)
327 {
328 _available = value;
329 }
330
getScale(void)331 int64_t DbusPassive::getScale(void)
332 {
333 return _scale;
334 }
335
getID(void)336 std::string DbusPassive::getID(void)
337 {
338 return _id;
339 }
340
getMax(void)341 double DbusPassive::getMax(void)
342 {
343 return _max;
344 }
345
getMin(void)346 double DbusPassive::getMin(void)
347 {
348 return _min;
349 }
350
updateValue(double value,bool force)351 void DbusPassive::updateValue(double value, bool force)
352 {
353 _badReading = false;
354
355 // Do not let a NAN, or other floating-point oddity, be used to update
356 // the value, as that indicates the sensor has no valid reading.
357 if (!(std::isfinite(value)))
358 {
359 _badReading = true;
360
361 // Do not continue with a bad reading, unless caller forcing
362 if (!force)
363 {
364 return;
365 }
366 }
367
368 value *= std::pow(10.0, _scale);
369
370 auto unscaled = value;
371 scaleSensorReading(_min, _max, value);
372
373 if (_typeMargin)
374 {
375 _marginHot = false;
376
377 // Unlike an absolute temperature sensor,
378 // where 0 degrees C is a good reading,
379 // a value received of 0 (or negative) margin is worrisome,
380 // and should be flagged.
381 // Either it indicates margin not calculated properly,
382 // or somebody forgot to set the margin-zero setpoint,
383 // or the system is really overheating that much.
384 // This is a different condition from _failed
385 // and _badReading, so it merits its own flag.
386 // The sensor has not failed, the reading is good, but the zone
387 // still needs to know that it should go to failsafe mode.
388 if (unscaled <= 0.0)
389 {
390 _marginHot = true;
391 }
392 }
393
394 setValue(value, unscaled);
395 }
396
handleSensorValue(sdbusplus::message_t & msg,DbusPassive * owner)397 int handleSensorValue(sdbusplus::message_t& msg, DbusPassive* owner)
398 {
399 std::string msgSensor;
400 std::map<std::string, std::variant<int64_t, double, bool>> msgData;
401
402 msg.read(msgSensor, msgData);
403
404 if (msgSensor == "xyz.openbmc_project.Sensor.Value")
405 {
406 auto valPropMap = msgData.find("Value");
407 if (valPropMap != msgData.end())
408 {
409 double value =
410 std::visit(VariantToDoubleVisitor(), valPropMap->second);
411
412 owner->updateValue(value, false);
413 }
414 }
415 else if (msgSensor == "xyz.openbmc_project.Sensor.Threshold.Critical")
416 {
417 auto criticalAlarmLow = msgData.find("CriticalAlarmLow");
418 auto criticalAlarmHigh = msgData.find("CriticalAlarmHigh");
419 if (criticalAlarmHigh == msgData.end() &&
420 criticalAlarmLow == msgData.end())
421 {
422 return 0;
423 }
424
425 bool asserted = false;
426 if (criticalAlarmLow != msgData.end())
427 {
428 asserted = std::get<bool>(criticalAlarmLow->second);
429 }
430
431 // checking both as in theory you could de-assert one threshold and
432 // assert the other at the same moment
433 if (!asserted && criticalAlarmHigh != msgData.end())
434 {
435 asserted = std::get<bool>(criticalAlarmHigh->second);
436 }
437 owner->setFailed(asserted);
438 }
439 #ifdef UNC_FAILSAFE
440 else if (msgSensor == "xyz.openbmc_project.Sensor.Threshold.Warning")
441 {
442 auto warningAlarmHigh = msgData.find("WarningAlarmHigh");
443 if (warningAlarmHigh == msgData.end())
444 {
445 return 0;
446 }
447
448 bool asserted = false;
449 if (warningAlarmHigh != msgData.end())
450 {
451 asserted = std::get<bool>(warningAlarmHigh->second);
452 }
453 owner->setFailed(asserted);
454 }
455 #endif
456 else if (msgSensor == "xyz.openbmc_project.State.Decorator.Availability")
457 {
458 auto available = msgData.find("Available");
459 if (available == msgData.end())
460 {
461 return 0;
462 }
463 bool asserted = std::get<bool>(available->second);
464 owner->setAvailable(asserted);
465 if (!asserted)
466 {
467 // A thermal controller will continue its PID calculation and not
468 // trigger a 'failsafe' when some inputs are unavailable.
469 // So, forced to clear the value here to prevent a historical
470 // value to participate in a latter PID calculation.
471 owner->updateValue(std::numeric_limits<double>::quiet_NaN(), true);
472 }
473 }
474 else if (msgSensor ==
475 "xyz.openbmc_project.State.Decorator.OperationalStatus")
476 {
477 auto functional = msgData.find("Functional");
478 if (functional == msgData.end())
479 {
480 return 0;
481 }
482 bool asserted = std::get<bool>(functional->second);
483 owner->setFunctional(asserted);
484 }
485
486 return 0;
487 }
488
dbusHandleSignal(sd_bus_message * msg,void * usrData,sd_bus_error * err)489 int dbusHandleSignal(sd_bus_message* msg, void* usrData,
490 [[maybe_unused]] sd_bus_error* err)
491 {
492 auto sdbpMsg = sdbusplus::message_t(msg);
493 DbusPassive* obj = static_cast<DbusPassive*>(usrData);
494
495 return handleSensorValue(sdbpMsg, obj);
496 }
497
498 } // namespace pid_control
499