1 /** 2 * Copyright 2017 Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #include "config.h" 17 18 #include "dbuspassive.hpp" 19 20 #include "conf.hpp" 21 #include "dbushelper_interface.hpp" 22 #include "dbuspassiveredundancy.hpp" 23 #include "dbusutil.hpp" 24 #include "failsafeloggers/failsafe_logger_utility.hpp" 25 #include "interfaces.hpp" 26 #include "util.hpp" 27 28 #include <systemd/sd-bus.h> 29 30 #include <sdbusplus/bus.hpp> 31 #include <sdbusplus/message.hpp> 32 33 #include <chrono> 34 #include <cmath> 35 #include <cstdint> 36 #include <exception> 37 #include <limits> 38 #include <map> 39 #include <memory> 40 #include <mutex> 41 #include <set> 42 #include <string> 43 #include <utility> 44 #include <variant> 45 46 #include "failsafeloggers/failsafe_logger.cpp" 47 48 namespace pid_control 49 { 50 51 std::unique_ptr<ReadInterface> DbusPassive::createDbusPassive( 52 sdbusplus::bus_t& bus, const std::string& type, const std::string& id, 53 std::unique_ptr<DbusHelperInterface> helper, const conf::SensorConfig* info, 54 const std::shared_ptr<DbusPassiveRedundancy>& redundancy) 55 { 56 if (helper == nullptr) 57 { 58 return nullptr; 59 } 60 if (!validType(type)) 61 { 62 return nullptr; 63 } 64 65 /* Need to get the scale and initial value */ 66 /* service == busname */ 67 std::string path; 68 if (info->readPath.empty()) 69 { 70 path = getSensorPath(type, id); 71 } 72 else 73 { 74 path = info->readPath; 75 } 76 77 SensorProperties settings; 78 bool failed; 79 std::string service; 80 81 try 82 { 83 service = helper->getService(sensorintf, path); 84 } 85 catch (const std::exception& e) 86 { 87 #ifndef HANDLE_MISSING_OBJECT_PATHS 88 return nullptr; 89 #else 90 // CASE1: The sensor is not on DBus, but as it is not in the 91 // MissingIsAcceptable list, the sensor should be built with a failed 92 // state to send the zone to failsafe mode. Everything will recover if 93 // all important sensors are back to DBus. swampd will be informed 94 // through InterfacesAdded signals and the sensors will be built again. 95 96 // CASE2: The sensor is on D-Bus (getService succeeds) but getProperties 97 // fails (e.g., D-Bus error or property fetch failure). In this case, 98 // handle-missing-object-paths does not apply. The sensor build fails, 99 // and the control loop will keep restarting until getProperties 100 // succeeds. 101 102 // Only CASE1 may send the zone to failsafe mode if the sensor is not 103 // in MissingIsAcceptable. CASE2 results in continuous restart until 104 // recovery. 105 106 failed = true; 107 settings.value = std::numeric_limits<double>::quiet_NaN(); 108 settings.unit = getSensorUnit(type); 109 settings.available = false; 110 settings.unavailableAsFailed = true; 111 if (info->ignoreDbusMinMax) 112 { 113 settings.min = 0; 114 settings.max = 0; 115 } 116 std::cerr << "DbusPassive: Sensor " << path 117 << " is missing from D-Bus, build this sensor as failed\n"; 118 return std::make_unique<DbusPassive>( 119 bus, type, id, std::move(helper), settings, failed, path, 120 redundancy); 121 #endif 122 } 123 124 try 125 { 126 helper->getProperties(service, path, &settings); 127 failed = helper->thresholdsAsserted(service, path); 128 } 129 catch (const std::exception& e) 130 { 131 return nullptr; 132 } 133 134 /* if these values are zero, they're ignored. */ 135 if (info->ignoreDbusMinMax) 136 { 137 settings.min = 0; 138 settings.max = 0; 139 } 140 141 settings.unavailableAsFailed = info->unavailableAsFailed; 142 143 return std::make_unique<DbusPassive>(bus, type, id, std::move(helper), 144 settings, failed, path, redundancy); 145 } 146 147 DbusPassive::DbusPassive( 148 sdbusplus::bus_t& bus, const std::string& type, const std::string& id, 149 std::unique_ptr<DbusHelperInterface> helper, 150 const SensorProperties& settings, bool failed, const std::string& path, 151 const std::shared_ptr<DbusPassiveRedundancy>& redundancy) : 152 ReadInterface(), _signal(bus, getMatch(path), dbusHandleSignal, this), 153 _id(id), _helper(std::move(helper)), _failed(failed), path(path), 154 redundancy(redundancy) 155 156 { 157 _scale = settings.scale; 158 _min = settings.min * std::pow(10.0, _scale); 159 _max = settings.max * std::pow(10.0, _scale); 160 _available = settings.available; 161 _unavailableAsFailed = settings.unavailableAsFailed; 162 163 // Cache this type knowledge, to avoid repeated string comparison 164 _typeMargin = (type == "margin"); 165 _typeFan = (type == "fan"); 166 167 // Force value to be stored, otherwise member would be uninitialized 168 updateValue(settings.value, true); 169 } 170 171 ReadReturn DbusPassive::read(void) 172 { 173 std::lock_guard<std::mutex> guard(_lock); 174 175 ReadReturn r = {_value, _updated, _unscaled}; 176 177 return r; 178 } 179 180 void DbusPassive::setValue(double value, double unscaled) 181 { 182 std::lock_guard<std::mutex> guard(_lock); 183 184 _value = value; 185 _unscaled = unscaled; 186 _updated = std::chrono::high_resolution_clock::now(); 187 } 188 189 void DbusPassive::setValue(double value) 190 { 191 // First param is scaled, second param is unscaled, assume same here 192 setValue(value, value); 193 } 194 195 bool DbusPassive::getFailed(void) const 196 { 197 if (redundancy) 198 { 199 const std::set<std::string>& failures = redundancy->getFailed(); 200 if (failures.find(path) != failures.end()) 201 { 202 outputFailsafeLogWithSensor(_id, true, _id, 203 "The sensor path is marked redundant."); 204 return true; 205 } 206 } 207 208 /* 209 * Unavailable thermal sensors, who are not present or 210 * power-state-not-matching, should not trigger the failSafe mode. For 211 * example, when a system stays at a powered-off state, its CPU Temp 212 * sensors will be unavailable, these unavailable sensors should not be 213 * treated as failed and trigger failSafe. 214 * This is important for systems whose Fans are always on. 215 */ 216 if (!_typeFan && !_available && !_unavailableAsFailed) 217 { 218 return false; 219 } 220 221 // If a reading has came in, 222 // but its value bad in some way (determined by sensor type), 223 // indicate this sensor has failed, 224 // until another value comes in that is no longer bad. 225 // This is different from the overall _failed flag, 226 // which is set and cleared by other causes. 227 if (_badReading) 228 { 229 outputFailsafeLogWithSensor(_id, true, _id, 230 "The sensor has bad readings."); 231 return true; 232 } 233 234 // If a reading has came in, and it is not a bad reading, 235 // but it indicates there is no more thermal margin left, 236 // that is bad, something is wrong with the PID loops, 237 // they are not cooling the system, enable failsafe mode also. 238 if (_marginHot) 239 { 240 outputFailsafeLogWithSensor(_id, true, _id, 241 "The sensor has no thermal margin left."); 242 return true; 243 } 244 245 if (_failed) 246 { 247 outputFailsafeLogWithSensor( 248 _id, true, _id, "The sensor has failed with a critical issue."); 249 return true; 250 } 251 252 if (!_available) 253 { 254 outputFailsafeLogWithSensor(_id, true, _id, 255 "The sensor is unavailable."); 256 return true; 257 } 258 259 if (!_functional) 260 { 261 outputFailsafeLogWithSensor(_id, true, _id, 262 "The sensor is not functional."); 263 return true; 264 } 265 266 outputFailsafeLogWithSensor(_id, false, _id, "The sensor has recovered."); 267 268 return false; 269 } 270 271 std::string DbusPassive::getFailReason(void) const 272 { 273 if (_badReading) 274 { 275 return "Sensor reading bad"; 276 } 277 if (_marginHot) 278 { 279 return "Margin hot"; 280 } 281 if (_failed) 282 { 283 return "Sensor threshold asserted"; 284 } 285 if (!_available) 286 { 287 return "Sensor unavailable"; 288 } 289 if (!_functional) 290 { 291 return "Sensor not functional"; 292 } 293 return "Unknown"; 294 } 295 296 void DbusPassive::setFailed(bool value) 297 { 298 _failed = value; 299 } 300 301 void DbusPassive::setFunctional(bool value) 302 { 303 _functional = value; 304 } 305 306 void DbusPassive::setAvailable(bool value) 307 { 308 _available = value; 309 } 310 311 int64_t DbusPassive::getScale(void) 312 { 313 return _scale; 314 } 315 316 std::string DbusPassive::getID(void) 317 { 318 return _id; 319 } 320 321 double DbusPassive::getMax(void) 322 { 323 return _max; 324 } 325 326 double DbusPassive::getMin(void) 327 { 328 return _min; 329 } 330 331 void DbusPassive::updateValue(double value, bool force) 332 { 333 _badReading = false; 334 335 // Do not let a NAN, or other floating-point oddity, be used to update 336 // the value, as that indicates the sensor has no valid reading. 337 if (!(std::isfinite(value))) 338 { 339 _badReading = true; 340 341 // Do not continue with a bad reading, unless caller forcing 342 if (!force) 343 { 344 return; 345 } 346 } 347 348 value *= std::pow(10.0, _scale); 349 350 auto unscaled = value; 351 scaleSensorReading(_min, _max, value); 352 353 if (_typeMargin) 354 { 355 _marginHot = false; 356 357 // Unlike an absolute temperature sensor, 358 // where 0 degrees C is a good reading, 359 // a value received of 0 (or negative) margin is worrisome, 360 // and should be flagged. 361 // Either it indicates margin not calculated properly, 362 // or somebody forgot to set the margin-zero setpoint, 363 // or the system is really overheating that much. 364 // This is a different condition from _failed 365 // and _badReading, so it merits its own flag. 366 // The sensor has not failed, the reading is good, but the zone 367 // still needs to know that it should go to failsafe mode. 368 if (unscaled <= 0.0) 369 { 370 _marginHot = true; 371 } 372 } 373 374 setValue(value, unscaled); 375 } 376 377 int handleSensorValue(sdbusplus::message_t& msg, DbusPassive* owner) 378 { 379 std::string msgSensor; 380 std::map<std::string, std::variant<int64_t, double, bool>> msgData; 381 382 msg.read(msgSensor, msgData); 383 384 if (msgSensor == "xyz.openbmc_project.Sensor.Value") 385 { 386 auto valPropMap = msgData.find("Value"); 387 if (valPropMap != msgData.end()) 388 { 389 double value = 390 std::visit(VariantToDoubleVisitor(), valPropMap->second); 391 392 owner->updateValue(value, false); 393 } 394 } 395 else if (msgSensor == "xyz.openbmc_project.Sensor.Threshold.Critical") 396 { 397 auto criticalAlarmLow = msgData.find("CriticalAlarmLow"); 398 auto criticalAlarmHigh = msgData.find("CriticalAlarmHigh"); 399 if (criticalAlarmHigh == msgData.end() && 400 criticalAlarmLow == msgData.end()) 401 { 402 return 0; 403 } 404 405 bool asserted = false; 406 if (criticalAlarmLow != msgData.end()) 407 { 408 asserted = std::get<bool>(criticalAlarmLow->second); 409 } 410 411 // checking both as in theory you could de-assert one threshold and 412 // assert the other at the same moment 413 if (!asserted && criticalAlarmHigh != msgData.end()) 414 { 415 asserted = std::get<bool>(criticalAlarmHigh->second); 416 } 417 owner->setFailed(asserted); 418 } 419 #ifdef UNC_FAILSAFE 420 else if (msgSensor == "xyz.openbmc_project.Sensor.Threshold.Warning") 421 { 422 auto warningAlarmHigh = msgData.find("WarningAlarmHigh"); 423 if (warningAlarmHigh == msgData.end()) 424 { 425 return 0; 426 } 427 428 bool asserted = false; 429 if (warningAlarmHigh != msgData.end()) 430 { 431 asserted = std::get<bool>(warningAlarmHigh->second); 432 } 433 owner->setFailed(asserted); 434 } 435 #endif 436 else if (msgSensor == "xyz.openbmc_project.State.Decorator.Availability") 437 { 438 auto available = msgData.find("Available"); 439 if (available == msgData.end()) 440 { 441 return 0; 442 } 443 bool asserted = std::get<bool>(available->second); 444 owner->setAvailable(asserted); 445 if (!asserted) 446 { 447 // A thermal controller will continue its PID calculation and not 448 // trigger a 'failsafe' when some inputs are unavailable. 449 // So, forced to clear the value here to prevent a historical 450 // value to participate in a latter PID calculation. 451 owner->updateValue(std::numeric_limits<double>::quiet_NaN(), true); 452 } 453 } 454 else if (msgSensor == 455 "xyz.openbmc_project.State.Decorator.OperationalStatus") 456 { 457 auto functional = msgData.find("Functional"); 458 if (functional == msgData.end()) 459 { 460 return 0; 461 } 462 bool asserted = std::get<bool>(functional->second); 463 owner->setFunctional(asserted); 464 } 465 466 return 0; 467 } 468 469 int dbusHandleSignal(sd_bus_message* msg, void* usrData, 470 [[maybe_unused]] sd_bus_error* err) 471 { 472 auto sdbpMsg = sdbusplus::message_t(msg); 473 DbusPassive* obj = static_cast<DbusPassive*>(usrData); 474 475 return handleSensorValue(sdbpMsg, obj); 476 } 477 478 } // namespace pid_control 479