1 // SPDX-License-Identifier: Apache-2.0 2 // SPDX-FileCopyrightText: Copyright 2017 Google Inc 3 4 #include "config.h" 5 6 #include "dbuspassive.hpp" 7 8 #include "conf.hpp" 9 #include "dbushelper_interface.hpp" 10 #include "dbuspassiveredundancy.hpp" 11 #include "dbusutil.hpp" 12 #include "failsafeloggers/failsafe_logger_utility.hpp" 13 #include "interfaces.hpp" 14 #include "util.hpp" 15 16 #include <systemd/sd-bus.h> 17 18 #include <sdbusplus/bus.hpp> 19 #include <sdbusplus/message.hpp> 20 21 #include <chrono> 22 #include <cmath> 23 #include <cstdint> 24 #include <exception> 25 #include <limits> 26 #include <map> 27 #include <memory> 28 #include <mutex> 29 #include <set> 30 #include <string> 31 #include <utility> 32 #include <variant> 33 34 #include "failsafeloggers/failsafe_logger.cpp" 35 36 namespace pid_control 37 { 38 39 std::unique_ptr<ReadInterface> DbusPassive::createDbusPassive( 40 sdbusplus::bus_t& bus, const std::string& type, const std::string& id, 41 std::unique_ptr<DbusHelperInterface> helper, const conf::SensorConfig* info, 42 const std::shared_ptr<DbusPassiveRedundancy>& redundancy) 43 { 44 if (helper == nullptr) 45 { 46 return nullptr; 47 } 48 if (!validType(type)) 49 { 50 return nullptr; 51 } 52 53 /* Need to get the scale and initial value */ 54 /* service == busname */ 55 std::string path; 56 if (info->readPath.empty()) 57 { 58 path = getSensorPath(type, id); 59 } 60 else 61 { 62 path = info->readPath; 63 } 64 65 SensorProperties settings; 66 bool failed; 67 bool objectMissing = false; 68 std::string service; 69 70 try 71 { 72 service = helper->getService(sensorintf, path); 73 } 74 catch (const std::exception& e) 75 { 76 #ifndef HANDLE_MISSING_OBJECT_PATHS 77 return nullptr; 78 #else 79 // CASE1: The sensor is not on DBus, but as it is not in the 80 // MissingIsAcceptable list, the sensor should be built with a failed 81 // state to send the zone to failsafe mode. Everything will recover if 82 // all important sensors are back to DBus. swampd will be informed 83 // through InterfacesAdded signals and the sensors will be built again. 84 85 // CASE2: The sensor is on D-Bus (getService succeeds) but getProperties 86 // fails (e.g., D-Bus error or property fetch failure). In this case, 87 // handle-missing-object-paths does not apply. The sensor build fails, 88 // and the control loop will keep restarting until getProperties 89 // succeeds. 90 91 // Only CASE1 may send the zone to failsafe mode if the sensor is not 92 // in MissingIsAcceptable. CASE2 results in continuous restart until 93 // recovery. 94 95 failed = true; 96 objectMissing = true; 97 settings.value = std::numeric_limits<double>::quiet_NaN(); 98 settings.unit = getSensorUnit(type); 99 settings.available = false; 100 settings.unavailableAsFailed = true; 101 if (info->ignoreDbusMinMax) 102 { 103 settings.min = 0; 104 settings.max = 0; 105 } 106 std::cerr << "DbusPassive: Sensor " << path 107 << " is missing from D-Bus, build this sensor as failed\n"; 108 return std::make_unique<DbusPassive>( 109 bus, type, id, std::move(helper), settings, failed, objectMissing, 110 path, redundancy); 111 #endif 112 } 113 114 try 115 { 116 helper->getProperties(service, path, &settings); 117 failed = helper->thresholdsAsserted(service, path); 118 } 119 catch (const std::exception& e) 120 { 121 return nullptr; 122 } 123 124 /* if these values are zero, they're ignored. */ 125 if (info->ignoreDbusMinMax) 126 { 127 settings.min = 0; 128 settings.max = 0; 129 } 130 131 settings.unavailableAsFailed = info->unavailableAsFailed; 132 133 return std::make_unique<DbusPassive>( 134 bus, type, id, std::move(helper), settings, failed, objectMissing, path, 135 redundancy); 136 } 137 138 DbusPassive::DbusPassive( 139 sdbusplus::bus_t& bus, const std::string& type, const std::string& id, 140 std::unique_ptr<DbusHelperInterface> helper, 141 const SensorProperties& settings, bool failed, bool objectMissing, 142 const std::string& path, 143 const std::shared_ptr<DbusPassiveRedundancy>& redundancy) : 144 ReadInterface(), _signal(bus, getMatch(path), dbusHandleSignal, this), 145 _id(id), _helper(std::move(helper)), _failed(failed), 146 _objectMissing(objectMissing), path(path), redundancy(redundancy) 147 148 { 149 _scale = settings.scale; 150 _min = settings.min * std::pow(10.0, _scale); 151 _max = settings.max * std::pow(10.0, _scale); 152 _available = settings.available; 153 _unavailableAsFailed = settings.unavailableAsFailed; 154 155 // Cache this type knowledge, to avoid repeated string comparison 156 _typeMargin = (type == "margin"); 157 _typeFan = (type == "fan"); 158 159 // Force value to be stored, otherwise member would be uninitialized 160 updateValue(settings.value, true); 161 } 162 163 ReadReturn DbusPassive::read(void) 164 { 165 std::lock_guard<std::mutex> guard(_lock); 166 167 ReadReturn r = {_value, _updated, _unscaled}; 168 169 return r; 170 } 171 172 void DbusPassive::setValue(double value, double unscaled) 173 { 174 std::lock_guard<std::mutex> guard(_lock); 175 176 _value = value; 177 _unscaled = unscaled; 178 _updated = std::chrono::high_resolution_clock::now(); 179 } 180 181 void DbusPassive::setValue(double value) 182 { 183 // First param is scaled, second param is unscaled, assume same here 184 setValue(value, value); 185 } 186 187 bool DbusPassive::getFailed(void) const 188 { 189 if (redundancy) 190 { 191 const std::set<std::string>& failures = redundancy->getFailed(); 192 if (failures.find(path) != failures.end()) 193 { 194 outputFailsafeLogWithSensor(_id, true, _id, 195 "The sensor path is marked redundant."); 196 return true; 197 } 198 } 199 200 /* 201 * If handle-missing-object-paths is enabled, and the expected D-Bus object 202 * path is not exported, this sensor is created to represent that condition. 203 * Indicate this sensor has failed so the zone enters failSafe mode. 204 */ 205 if (_objectMissing) 206 { 207 outputFailsafeLogWithSensor(_id, true, _id, 208 "The sensor D-Bus object is missing."); 209 return true; 210 } 211 212 /* 213 * Unavailable thermal sensors, who are not present or 214 * power-state-not-matching, should not trigger the failSafe mode. For 215 * example, when a system stays at a powered-off state, its CPU Temp 216 * sensors will be unavailable, these unavailable sensors should not be 217 * treated as failed and trigger failSafe. 218 * This is important for systems whose Fans are always on. 219 */ 220 if (!_typeFan && !_available && !_unavailableAsFailed) 221 { 222 return false; 223 } 224 225 // If a reading has came in, 226 // but its value bad in some way (determined by sensor type), 227 // indicate this sensor has failed, 228 // until another value comes in that is no longer bad. 229 // This is different from the overall _failed flag, 230 // which is set and cleared by other causes. 231 if (_badReading) 232 { 233 outputFailsafeLogWithSensor(_id, true, _id, 234 "The sensor has bad readings."); 235 return true; 236 } 237 238 // If a reading has came in, and it is not a bad reading, 239 // but it indicates there is no more thermal margin left, 240 // that is bad, something is wrong with the PID loops, 241 // they are not cooling the system, enable failsafe mode also. 242 if (_marginHot) 243 { 244 outputFailsafeLogWithSensor(_id, true, _id, 245 "The sensor has no thermal margin left."); 246 return true; 247 } 248 249 if (_failed) 250 { 251 outputFailsafeLogWithSensor( 252 _id, true, _id, "The sensor has failed with a critical issue."); 253 return true; 254 } 255 256 if (!_available) 257 { 258 outputFailsafeLogWithSensor(_id, true, _id, 259 "The sensor is unavailable."); 260 return true; 261 } 262 263 if (!_functional) 264 { 265 outputFailsafeLogWithSensor(_id, true, _id, 266 "The sensor is not functional."); 267 return true; 268 } 269 270 outputFailsafeLogWithSensor(_id, false, _id, "The sensor has recovered."); 271 272 return false; 273 } 274 275 std::string DbusPassive::getFailReason(void) const 276 { 277 if (_objectMissing) 278 { 279 return "Sensor D-Bus object missing"; 280 } 281 if (_badReading) 282 { 283 return "Sensor reading bad"; 284 } 285 if (_marginHot) 286 { 287 return "Margin hot"; 288 } 289 if (_failed) 290 { 291 return "Sensor threshold asserted"; 292 } 293 if (!_available) 294 { 295 return "Sensor unavailable"; 296 } 297 if (!_functional) 298 { 299 return "Sensor not functional"; 300 } 301 return "Unknown"; 302 } 303 304 void DbusPassive::setFailed(bool value) 305 { 306 _failed = value; 307 } 308 309 void DbusPassive::setFunctional(bool value) 310 { 311 _functional = value; 312 } 313 314 void DbusPassive::setAvailable(bool value) 315 { 316 _available = value; 317 } 318 319 int64_t DbusPassive::getScale(void) 320 { 321 return _scale; 322 } 323 324 std::string DbusPassive::getID(void) 325 { 326 return _id; 327 } 328 329 double DbusPassive::getMax(void) 330 { 331 return _max; 332 } 333 334 double DbusPassive::getMin(void) 335 { 336 return _min; 337 } 338 339 void DbusPassive::updateValue(double value, bool force) 340 { 341 _badReading = false; 342 343 // Do not let a NAN, or other floating-point oddity, be used to update 344 // the value, as that indicates the sensor has no valid reading. 345 if (!(std::isfinite(value))) 346 { 347 _badReading = true; 348 349 // Do not continue with a bad reading, unless caller forcing 350 if (!force) 351 { 352 return; 353 } 354 } 355 356 value *= std::pow(10.0, _scale); 357 358 auto unscaled = value; 359 scaleSensorReading(_min, _max, value); 360 361 if (_typeMargin) 362 { 363 _marginHot = false; 364 365 // Unlike an absolute temperature sensor, 366 // where 0 degrees C is a good reading, 367 // a value received of 0 (or negative) margin is worrisome, 368 // and should be flagged. 369 // Either it indicates margin not calculated properly, 370 // or somebody forgot to set the margin-zero setpoint, 371 // or the system is really overheating that much. 372 // This is a different condition from _failed 373 // and _badReading, so it merits its own flag. 374 // The sensor has not failed, the reading is good, but the zone 375 // still needs to know that it should go to failsafe mode. 376 if (unscaled <= 0.0) 377 { 378 _marginHot = true; 379 } 380 } 381 382 setValue(value, unscaled); 383 } 384 385 int handleSensorValue(sdbusplus::message_t& msg, DbusPassive* owner) 386 { 387 std::string msgSensor; 388 std::map<std::string, std::variant<int64_t, double, bool>> msgData; 389 390 msg.read(msgSensor, msgData); 391 392 if (msgSensor == "xyz.openbmc_project.Sensor.Value") 393 { 394 auto valPropMap = msgData.find("Value"); 395 if (valPropMap != msgData.end()) 396 { 397 double value = 398 std::visit(VariantToDoubleVisitor(), valPropMap->second); 399 400 owner->updateValue(value, false); 401 } 402 } 403 else if (msgSensor == "xyz.openbmc_project.Sensor.Threshold.Critical") 404 { 405 auto criticalAlarmLow = msgData.find("CriticalAlarmLow"); 406 auto criticalAlarmHigh = msgData.find("CriticalAlarmHigh"); 407 if (criticalAlarmHigh == msgData.end() && 408 criticalAlarmLow == msgData.end()) 409 { 410 return 0; 411 } 412 413 bool asserted = false; 414 if (criticalAlarmLow != msgData.end()) 415 { 416 asserted = std::get<bool>(criticalAlarmLow->second); 417 } 418 419 // checking both as in theory you could de-assert one threshold and 420 // assert the other at the same moment 421 if (!asserted && criticalAlarmHigh != msgData.end()) 422 { 423 asserted = std::get<bool>(criticalAlarmHigh->second); 424 } 425 owner->setFailed(asserted); 426 } 427 #ifdef UNC_FAILSAFE 428 else if (msgSensor == "xyz.openbmc_project.Sensor.Threshold.Warning") 429 { 430 auto warningAlarmHigh = msgData.find("WarningAlarmHigh"); 431 if (warningAlarmHigh == msgData.end()) 432 { 433 return 0; 434 } 435 436 bool asserted = false; 437 if (warningAlarmHigh != msgData.end()) 438 { 439 asserted = std::get<bool>(warningAlarmHigh->second); 440 } 441 owner->setFailed(asserted); 442 } 443 #endif 444 else if (msgSensor == "xyz.openbmc_project.State.Decorator.Availability") 445 { 446 auto available = msgData.find("Available"); 447 if (available == msgData.end()) 448 { 449 return 0; 450 } 451 bool asserted = std::get<bool>(available->second); 452 owner->setAvailable(asserted); 453 if (!asserted) 454 { 455 // A thermal controller will continue its PID calculation and not 456 // trigger a 'failsafe' when some inputs are unavailable. 457 // So, forced to clear the value here to prevent a historical 458 // value to participate in a latter PID calculation. 459 owner->updateValue(std::numeric_limits<double>::quiet_NaN(), true); 460 } 461 } 462 else if (msgSensor == 463 "xyz.openbmc_project.State.Decorator.OperationalStatus") 464 { 465 auto functional = msgData.find("Functional"); 466 if (functional == msgData.end()) 467 { 468 return 0; 469 } 470 bool asserted = std::get<bool>(functional->second); 471 owner->setFunctional(asserted); 472 } 473 474 return 0; 475 } 476 477 int dbusHandleSignal(sd_bus_message* msg, void* usrData, 478 [[maybe_unused]] sd_bus_error* err) 479 { 480 auto sdbpMsg = sdbusplus::message_t(msg); 481 DbusPassive* obj = static_cast<DbusPassive*>(usrData); 482 483 return handleSensorValue(sdbpMsg, obj); 484 } 485 486 } // namespace pid_control 487