1 /** 2 * Copyright 2017 Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #include "config.h" 17 18 #include "dbuspassive.hpp" 19 20 #include "conf.hpp" 21 #include "dbushelper_interface.hpp" 22 #include "dbuspassiveredundancy.hpp" 23 #include "dbusutil.hpp" 24 #include "failsafeloggers/failsafe_logger_utility.hpp" 25 #include "interfaces.hpp" 26 #include "util.hpp" 27 28 #include <systemd/sd-bus.h> 29 30 #include <sdbusplus/bus.hpp> 31 #include <sdbusplus/message.hpp> 32 33 #include <chrono> 34 #include <cmath> 35 #include <cstdint> 36 #include <exception> 37 #include <limits> 38 #include <map> 39 #include <memory> 40 #include <mutex> 41 #include <set> 42 #include <string> 43 #include <utility> 44 #include <variant> 45 46 #include "failsafeloggers/failsafe_logger.cpp" 47 48 namespace pid_control 49 { 50 51 std::unique_ptr<ReadInterface> DbusPassive::createDbusPassive( 52 sdbusplus::bus_t& bus, const std::string& type, const std::string& id, 53 std::unique_ptr<DbusHelperInterface> helper, const conf::SensorConfig* info, 54 const std::shared_ptr<DbusPassiveRedundancy>& redundancy) 55 { 56 if (helper == nullptr) 57 { 58 return nullptr; 59 } 60 if (!validType(type)) 61 { 62 return nullptr; 63 } 64 65 /* Need to get the scale and initial value */ 66 /* service == busname */ 67 std::string path; 68 if (info->readPath.empty()) 69 { 70 path = getSensorPath(type, id); 71 } 72 else 73 { 74 path = info->readPath; 75 } 76 77 SensorProperties settings; 78 bool failed; 79 bool objectMissing = false; 80 std::string service; 81 82 try 83 { 84 service = helper->getService(sensorintf, path); 85 } 86 catch (const std::exception& e) 87 { 88 #ifndef HANDLE_MISSING_OBJECT_PATHS 89 return nullptr; 90 #else 91 // CASE1: The sensor is not on DBus, but as it is not in the 92 // MissingIsAcceptable list, the sensor should be built with a failed 93 // state to send the zone to failsafe mode. Everything will recover if 94 // all important sensors are back to DBus. swampd will be informed 95 // through InterfacesAdded signals and the sensors will be built again. 96 97 // CASE2: The sensor is on D-Bus (getService succeeds) but getProperties 98 // fails (e.g., D-Bus error or property fetch failure). In this case, 99 // handle-missing-object-paths does not apply. The sensor build fails, 100 // and the control loop will keep restarting until getProperties 101 // succeeds. 102 103 // Only CASE1 may send the zone to failsafe mode if the sensor is not 104 // in MissingIsAcceptable. CASE2 results in continuous restart until 105 // recovery. 106 107 failed = true; 108 objectMissing = true; 109 settings.value = std::numeric_limits<double>::quiet_NaN(); 110 settings.unit = getSensorUnit(type); 111 settings.available = false; 112 settings.unavailableAsFailed = true; 113 if (info->ignoreDbusMinMax) 114 { 115 settings.min = 0; 116 settings.max = 0; 117 } 118 std::cerr << "DbusPassive: Sensor " << path 119 << " is missing from D-Bus, build this sensor as failed\n"; 120 return std::make_unique<DbusPassive>( 121 bus, type, id, std::move(helper), settings, failed, objectMissing, 122 path, redundancy); 123 #endif 124 } 125 126 try 127 { 128 helper->getProperties(service, path, &settings); 129 failed = helper->thresholdsAsserted(service, path); 130 } 131 catch (const std::exception& e) 132 { 133 return nullptr; 134 } 135 136 /* if these values are zero, they're ignored. */ 137 if (info->ignoreDbusMinMax) 138 { 139 settings.min = 0; 140 settings.max = 0; 141 } 142 143 settings.unavailableAsFailed = info->unavailableAsFailed; 144 145 return std::make_unique<DbusPassive>( 146 bus, type, id, std::move(helper), settings, failed, objectMissing, path, 147 redundancy); 148 } 149 150 DbusPassive::DbusPassive( 151 sdbusplus::bus_t& bus, const std::string& type, const std::string& id, 152 std::unique_ptr<DbusHelperInterface> helper, 153 const SensorProperties& settings, bool failed, bool objectMissing, 154 const std::string& path, 155 const std::shared_ptr<DbusPassiveRedundancy>& redundancy) : 156 ReadInterface(), _signal(bus, getMatch(path), dbusHandleSignal, this), 157 _id(id), _helper(std::move(helper)), _failed(failed), 158 _objectMissing(objectMissing), path(path), redundancy(redundancy) 159 160 { 161 _scale = settings.scale; 162 _min = settings.min * std::pow(10.0, _scale); 163 _max = settings.max * std::pow(10.0, _scale); 164 _available = settings.available; 165 _unavailableAsFailed = settings.unavailableAsFailed; 166 167 // Cache this type knowledge, to avoid repeated string comparison 168 _typeMargin = (type == "margin"); 169 _typeFan = (type == "fan"); 170 171 // Force value to be stored, otherwise member would be uninitialized 172 updateValue(settings.value, true); 173 } 174 175 ReadReturn DbusPassive::read(void) 176 { 177 std::lock_guard<std::mutex> guard(_lock); 178 179 ReadReturn r = {_value, _updated, _unscaled}; 180 181 return r; 182 } 183 184 void DbusPassive::setValue(double value, double unscaled) 185 { 186 std::lock_guard<std::mutex> guard(_lock); 187 188 _value = value; 189 _unscaled = unscaled; 190 _updated = std::chrono::high_resolution_clock::now(); 191 } 192 193 void DbusPassive::setValue(double value) 194 { 195 // First param is scaled, second param is unscaled, assume same here 196 setValue(value, value); 197 } 198 199 bool DbusPassive::getFailed(void) const 200 { 201 if (redundancy) 202 { 203 const std::set<std::string>& failures = redundancy->getFailed(); 204 if (failures.find(path) != failures.end()) 205 { 206 outputFailsafeLogWithSensor(_id, true, _id, 207 "The sensor path is marked redundant."); 208 return true; 209 } 210 } 211 212 /* 213 * If handle-missing-object-paths is enabled, and the expected D-Bus object 214 * path is not exported, this sensor is created to represent that condition. 215 * Indicate this sensor has failed so the zone enters failSafe mode. 216 */ 217 if (_objectMissing) 218 { 219 outputFailsafeLogWithSensor(_id, true, _id, 220 "The sensor D-Bus object is missing."); 221 return true; 222 } 223 224 /* 225 * Unavailable thermal sensors, who are not present or 226 * power-state-not-matching, should not trigger the failSafe mode. For 227 * example, when a system stays at a powered-off state, its CPU Temp 228 * sensors will be unavailable, these unavailable sensors should not be 229 * treated as failed and trigger failSafe. 230 * This is important for systems whose Fans are always on. 231 */ 232 if (!_typeFan && !_available && !_unavailableAsFailed) 233 { 234 return false; 235 } 236 237 // If a reading has came in, 238 // but its value bad in some way (determined by sensor type), 239 // indicate this sensor has failed, 240 // until another value comes in that is no longer bad. 241 // This is different from the overall _failed flag, 242 // which is set and cleared by other causes. 243 if (_badReading) 244 { 245 outputFailsafeLogWithSensor(_id, true, _id, 246 "The sensor has bad readings."); 247 return true; 248 } 249 250 // If a reading has came in, and it is not a bad reading, 251 // but it indicates there is no more thermal margin left, 252 // that is bad, something is wrong with the PID loops, 253 // they are not cooling the system, enable failsafe mode also. 254 if (_marginHot) 255 { 256 outputFailsafeLogWithSensor(_id, true, _id, 257 "The sensor has no thermal margin left."); 258 return true; 259 } 260 261 if (_failed) 262 { 263 outputFailsafeLogWithSensor( 264 _id, true, _id, "The sensor has failed with a critical issue."); 265 return true; 266 } 267 268 if (!_available) 269 { 270 outputFailsafeLogWithSensor(_id, true, _id, 271 "The sensor is unavailable."); 272 return true; 273 } 274 275 if (!_functional) 276 { 277 outputFailsafeLogWithSensor(_id, true, _id, 278 "The sensor is not functional."); 279 return true; 280 } 281 282 outputFailsafeLogWithSensor(_id, false, _id, "The sensor has recovered."); 283 284 return false; 285 } 286 287 std::string DbusPassive::getFailReason(void) const 288 { 289 if (_objectMissing) 290 { 291 return "Sensor D-Bus object missing"; 292 } 293 if (_badReading) 294 { 295 return "Sensor reading bad"; 296 } 297 if (_marginHot) 298 { 299 return "Margin hot"; 300 } 301 if (_failed) 302 { 303 return "Sensor threshold asserted"; 304 } 305 if (!_available) 306 { 307 return "Sensor unavailable"; 308 } 309 if (!_functional) 310 { 311 return "Sensor not functional"; 312 } 313 return "Unknown"; 314 } 315 316 void DbusPassive::setFailed(bool value) 317 { 318 _failed = value; 319 } 320 321 void DbusPassive::setFunctional(bool value) 322 { 323 _functional = value; 324 } 325 326 void DbusPassive::setAvailable(bool value) 327 { 328 _available = value; 329 } 330 331 int64_t DbusPassive::getScale(void) 332 { 333 return _scale; 334 } 335 336 std::string DbusPassive::getID(void) 337 { 338 return _id; 339 } 340 341 double DbusPassive::getMax(void) 342 { 343 return _max; 344 } 345 346 double DbusPassive::getMin(void) 347 { 348 return _min; 349 } 350 351 void DbusPassive::updateValue(double value, bool force) 352 { 353 _badReading = false; 354 355 // Do not let a NAN, or other floating-point oddity, be used to update 356 // the value, as that indicates the sensor has no valid reading. 357 if (!(std::isfinite(value))) 358 { 359 _badReading = true; 360 361 // Do not continue with a bad reading, unless caller forcing 362 if (!force) 363 { 364 return; 365 } 366 } 367 368 value *= std::pow(10.0, _scale); 369 370 auto unscaled = value; 371 scaleSensorReading(_min, _max, value); 372 373 if (_typeMargin) 374 { 375 _marginHot = false; 376 377 // Unlike an absolute temperature sensor, 378 // where 0 degrees C is a good reading, 379 // a value received of 0 (or negative) margin is worrisome, 380 // and should be flagged. 381 // Either it indicates margin not calculated properly, 382 // or somebody forgot to set the margin-zero setpoint, 383 // or the system is really overheating that much. 384 // This is a different condition from _failed 385 // and _badReading, so it merits its own flag. 386 // The sensor has not failed, the reading is good, but the zone 387 // still needs to know that it should go to failsafe mode. 388 if (unscaled <= 0.0) 389 { 390 _marginHot = true; 391 } 392 } 393 394 setValue(value, unscaled); 395 } 396 397 int handleSensorValue(sdbusplus::message_t& msg, DbusPassive* owner) 398 { 399 std::string msgSensor; 400 std::map<std::string, std::variant<int64_t, double, bool>> msgData; 401 402 msg.read(msgSensor, msgData); 403 404 if (msgSensor == "xyz.openbmc_project.Sensor.Value") 405 { 406 auto valPropMap = msgData.find("Value"); 407 if (valPropMap != msgData.end()) 408 { 409 double value = 410 std::visit(VariantToDoubleVisitor(), valPropMap->second); 411 412 owner->updateValue(value, false); 413 } 414 } 415 else if (msgSensor == "xyz.openbmc_project.Sensor.Threshold.Critical") 416 { 417 auto criticalAlarmLow = msgData.find("CriticalAlarmLow"); 418 auto criticalAlarmHigh = msgData.find("CriticalAlarmHigh"); 419 if (criticalAlarmHigh == msgData.end() && 420 criticalAlarmLow == msgData.end()) 421 { 422 return 0; 423 } 424 425 bool asserted = false; 426 if (criticalAlarmLow != msgData.end()) 427 { 428 asserted = std::get<bool>(criticalAlarmLow->second); 429 } 430 431 // checking both as in theory you could de-assert one threshold and 432 // assert the other at the same moment 433 if (!asserted && criticalAlarmHigh != msgData.end()) 434 { 435 asserted = std::get<bool>(criticalAlarmHigh->second); 436 } 437 owner->setFailed(asserted); 438 } 439 #ifdef UNC_FAILSAFE 440 else if (msgSensor == "xyz.openbmc_project.Sensor.Threshold.Warning") 441 { 442 auto warningAlarmHigh = msgData.find("WarningAlarmHigh"); 443 if (warningAlarmHigh == msgData.end()) 444 { 445 return 0; 446 } 447 448 bool asserted = false; 449 if (warningAlarmHigh != msgData.end()) 450 { 451 asserted = std::get<bool>(warningAlarmHigh->second); 452 } 453 owner->setFailed(asserted); 454 } 455 #endif 456 else if (msgSensor == "xyz.openbmc_project.State.Decorator.Availability") 457 { 458 auto available = msgData.find("Available"); 459 if (available == msgData.end()) 460 { 461 return 0; 462 } 463 bool asserted = std::get<bool>(available->second); 464 owner->setAvailable(asserted); 465 if (!asserted) 466 { 467 // A thermal controller will continue its PID calculation and not 468 // trigger a 'failsafe' when some inputs are unavailable. 469 // So, forced to clear the value here to prevent a historical 470 // value to participate in a latter PID calculation. 471 owner->updateValue(std::numeric_limits<double>::quiet_NaN(), true); 472 } 473 } 474 else if (msgSensor == 475 "xyz.openbmc_project.State.Decorator.OperationalStatus") 476 { 477 auto functional = msgData.find("Functional"); 478 if (functional == msgData.end()) 479 { 480 return 0; 481 } 482 bool asserted = std::get<bool>(functional->second); 483 owner->setFunctional(asserted); 484 } 485 486 return 0; 487 } 488 489 int dbusHandleSignal(sd_bus_message* msg, void* usrData, 490 [[maybe_unused]] sd_bus_error* err) 491 { 492 auto sdbpMsg = sdbusplus::message_t(msg); 493 DbusPassive* obj = static_cast<DbusPassive*>(usrData); 494 495 return handleSensorValue(sdbpMsg, obj); 496 } 497 498 } // namespace pid_control 499