1 /** 2 * Copyright 2017 Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "dbuspassive.hpp" 18 19 #include "conf.hpp" 20 #include "dbushelper_interface.hpp" 21 #include "dbuspassiveredundancy.hpp" 22 #include "dbusutil.hpp" 23 #include "failsafeloggers/failsafe_logger_utility.hpp" 24 #include "interfaces.hpp" 25 #include "util.hpp" 26 27 #include <systemd/sd-bus.h> 28 29 #include <sdbusplus/bus.hpp> 30 #include <sdbusplus/message.hpp> 31 32 #include <chrono> 33 #include <cmath> 34 #include <cstdint> 35 #include <exception> 36 #include <limits> 37 #include <map> 38 #include <memory> 39 #include <mutex> 40 #include <set> 41 #include <string> 42 #include <utility> 43 #include <variant> 44 45 #include "failsafeloggers/failsafe_logger.cpp" 46 47 namespace pid_control 48 { 49 50 std::unique_ptr<ReadInterface> DbusPassive::createDbusPassive( 51 sdbusplus::bus_t& bus, const std::string& type, const std::string& id, 52 std::unique_ptr<DbusHelperInterface> helper, const conf::SensorConfig* info, 53 const std::shared_ptr<DbusPassiveRedundancy>& redundancy) 54 { 55 if (helper == nullptr) 56 { 57 return nullptr; 58 } 59 if (!validType(type)) 60 { 61 return nullptr; 62 } 63 64 /* Need to get the scale and initial value */ 65 /* service == busname */ 66 std::string path; 67 if (info->readPath.empty()) 68 { 69 path = getSensorPath(type, id); 70 } 71 else 72 { 73 path = info->readPath; 74 } 75 76 SensorProperties settings; 77 bool failed; 78 std::string service; 79 80 try 81 { 82 service = helper->getService(sensorintf, path); 83 } 84 catch (const std::exception& e) 85 { 86 #ifndef HANDLE_MISSING_OBJECT_PATHS 87 return nullptr; 88 #else 89 // CASE1: The sensor is not on DBus, but as it is not in the 90 // MissingIsAcceptable list, the sensor should be built with a failed 91 // state to send the zone to failsafe mode. Everything will recover if 92 // all important sensors are back to DBus. swampd will be informed 93 // through InterfacesAdded signals and the sensors will be built again. 94 95 // CASE2: The sensor is on D-Bus (getService succeeds) but getProperties 96 // fails (e.g., D-Bus error or property fetch failure). In this case, 97 // handle-missing-object-paths does not apply. The sensor build fails, 98 // and the control loop will keep restarting until getProperties 99 // succeeds. 100 101 // Only CASE1 may send the zone to failsafe mode if the sensor is not 102 // in MissingIsAcceptable. CASE2 results in continuous restart until 103 // recovery. 104 105 failed = true; 106 settings.value = std::numeric_limits<double>::quiet_NaN(); 107 settings.unit = getSensorUnit(type); 108 settings.available = false; 109 settings.unavailableAsFailed = true; 110 if (info->ignoreDbusMinMax) 111 { 112 settings.min = 0; 113 settings.max = 0; 114 } 115 std::cerr << "DbusPassive: Sensor " << path 116 << " is missing from D-Bus, build this sensor as failed\n"; 117 return std::make_unique<DbusPassive>( 118 bus, type, id, std::move(helper), settings, failed, path, 119 redundancy); 120 #endif 121 } 122 123 try 124 { 125 helper->getProperties(service, path, &settings); 126 failed = helper->thresholdsAsserted(service, path); 127 } 128 catch (const std::exception& e) 129 { 130 return nullptr; 131 } 132 133 /* if these values are zero, they're ignored. */ 134 if (info->ignoreDbusMinMax) 135 { 136 settings.min = 0; 137 settings.max = 0; 138 } 139 140 settings.unavailableAsFailed = info->unavailableAsFailed; 141 142 return std::make_unique<DbusPassive>(bus, type, id, std::move(helper), 143 settings, failed, path, redundancy); 144 } 145 146 DbusPassive::DbusPassive( 147 sdbusplus::bus_t& bus, const std::string& type, const std::string& id, 148 std::unique_ptr<DbusHelperInterface> helper, 149 const SensorProperties& settings, bool failed, const std::string& path, 150 const std::shared_ptr<DbusPassiveRedundancy>& redundancy) : 151 ReadInterface(), _signal(bus, getMatch(path), dbusHandleSignal, this), 152 _id(id), _helper(std::move(helper)), _failed(failed), path(path), 153 redundancy(redundancy) 154 155 { 156 _scale = settings.scale; 157 _min = settings.min * std::pow(10.0, _scale); 158 _max = settings.max * std::pow(10.0, _scale); 159 _available = settings.available; 160 _unavailableAsFailed = settings.unavailableAsFailed; 161 162 // Cache this type knowledge, to avoid repeated string comparison 163 _typeMargin = (type == "margin"); 164 _typeFan = (type == "fan"); 165 166 // Force value to be stored, otherwise member would be uninitialized 167 updateValue(settings.value, true); 168 } 169 170 ReadReturn DbusPassive::read(void) 171 { 172 std::lock_guard<std::mutex> guard(_lock); 173 174 ReadReturn r = {_value, _updated, _unscaled}; 175 176 return r; 177 } 178 179 void DbusPassive::setValue(double value, double unscaled) 180 { 181 std::lock_guard<std::mutex> guard(_lock); 182 183 _value = value; 184 _unscaled = unscaled; 185 _updated = std::chrono::high_resolution_clock::now(); 186 } 187 188 void DbusPassive::setValue(double value) 189 { 190 // First param is scaled, second param is unscaled, assume same here 191 setValue(value, value); 192 } 193 194 bool DbusPassive::getFailed(void) const 195 { 196 if (redundancy) 197 { 198 const std::set<std::string>& failures = redundancy->getFailed(); 199 if (failures.find(path) != failures.end()) 200 { 201 outputFailsafeLogWithSensor(_id, true, _id, 202 "The sensor path is marked redundant."); 203 return true; 204 } 205 } 206 207 /* 208 * Unavailable thermal sensors, who are not present or 209 * power-state-not-matching, should not trigger the failSafe mode. For 210 * example, when a system stays at a powered-off state, its CPU Temp 211 * sensors will be unavailable, these unavailable sensors should not be 212 * treated as failed and trigger failSafe. 213 * This is important for systems whose Fans are always on. 214 */ 215 if (!_typeFan && !_available && !_unavailableAsFailed) 216 { 217 return false; 218 } 219 220 // If a reading has came in, 221 // but its value bad in some way (determined by sensor type), 222 // indicate this sensor has failed, 223 // until another value comes in that is no longer bad. 224 // This is different from the overall _failed flag, 225 // which is set and cleared by other causes. 226 if (_badReading) 227 { 228 outputFailsafeLogWithSensor(_id, true, _id, 229 "The sensor has bad readings."); 230 return true; 231 } 232 233 // If a reading has came in, and it is not a bad reading, 234 // but it indicates there is no more thermal margin left, 235 // that is bad, something is wrong with the PID loops, 236 // they are not cooling the system, enable failsafe mode also. 237 if (_marginHot) 238 { 239 outputFailsafeLogWithSensor(_id, true, _id, 240 "The sensor has no thermal margin left."); 241 return true; 242 } 243 244 if (_failed) 245 { 246 outputFailsafeLogWithSensor( 247 _id, true, _id, "The sensor has failed with a critical issue."); 248 return true; 249 } 250 251 if (!_available) 252 { 253 outputFailsafeLogWithSensor(_id, true, _id, 254 "The sensor is unavailable."); 255 return true; 256 } 257 258 if (!_functional) 259 { 260 outputFailsafeLogWithSensor(_id, true, _id, 261 "The sensor is not functional."); 262 return true; 263 } 264 265 outputFailsafeLogWithSensor(_id, false, _id, "The sensor has recovered."); 266 267 return false; 268 } 269 270 std::string DbusPassive::getFailReason(void) const 271 { 272 if (_badReading) 273 { 274 return "Sensor reading bad"; 275 } 276 if (_marginHot) 277 { 278 return "Margin hot"; 279 } 280 if (_failed) 281 { 282 return "Sensor threshold asserted"; 283 } 284 if (!_available) 285 { 286 return "Sensor unavailable"; 287 } 288 if (!_functional) 289 { 290 return "Sensor not functional"; 291 } 292 return "Unknown"; 293 } 294 295 void DbusPassive::setFailed(bool value) 296 { 297 _failed = value; 298 } 299 300 void DbusPassive::setFunctional(bool value) 301 { 302 _functional = value; 303 } 304 305 void DbusPassive::setAvailable(bool value) 306 { 307 _available = value; 308 } 309 310 int64_t DbusPassive::getScale(void) 311 { 312 return _scale; 313 } 314 315 std::string DbusPassive::getID(void) 316 { 317 return _id; 318 } 319 320 double DbusPassive::getMax(void) 321 { 322 return _max; 323 } 324 325 double DbusPassive::getMin(void) 326 { 327 return _min; 328 } 329 330 void DbusPassive::updateValue(double value, bool force) 331 { 332 _badReading = false; 333 334 // Do not let a NAN, or other floating-point oddity, be used to update 335 // the value, as that indicates the sensor has no valid reading. 336 if (!(std::isfinite(value))) 337 { 338 _badReading = true; 339 340 // Do not continue with a bad reading, unless caller forcing 341 if (!force) 342 { 343 return; 344 } 345 } 346 347 value *= std::pow(10.0, _scale); 348 349 auto unscaled = value; 350 scaleSensorReading(_min, _max, value); 351 352 if (_typeMargin) 353 { 354 _marginHot = false; 355 356 // Unlike an absolute temperature sensor, 357 // where 0 degrees C is a good reading, 358 // a value received of 0 (or negative) margin is worrisome, 359 // and should be flagged. 360 // Either it indicates margin not calculated properly, 361 // or somebody forgot to set the margin-zero setpoint, 362 // or the system is really overheating that much. 363 // This is a different condition from _failed 364 // and _badReading, so it merits its own flag. 365 // The sensor has not failed, the reading is good, but the zone 366 // still needs to know that it should go to failsafe mode. 367 if (unscaled <= 0.0) 368 { 369 _marginHot = true; 370 } 371 } 372 373 setValue(value, unscaled); 374 } 375 376 int handleSensorValue(sdbusplus::message_t& msg, DbusPassive* owner) 377 { 378 std::string msgSensor; 379 std::map<std::string, std::variant<int64_t, double, bool>> msgData; 380 381 msg.read(msgSensor, msgData); 382 383 if (msgSensor == "xyz.openbmc_project.Sensor.Value") 384 { 385 auto valPropMap = msgData.find("Value"); 386 if (valPropMap != msgData.end()) 387 { 388 double value = 389 std::visit(VariantToDoubleVisitor(), valPropMap->second); 390 391 owner->updateValue(value, false); 392 } 393 } 394 else if (msgSensor == "xyz.openbmc_project.Sensor.Threshold.Critical") 395 { 396 auto criticalAlarmLow = msgData.find("CriticalAlarmLow"); 397 auto criticalAlarmHigh = msgData.find("CriticalAlarmHigh"); 398 if (criticalAlarmHigh == msgData.end() && 399 criticalAlarmLow == msgData.end()) 400 { 401 return 0; 402 } 403 404 bool asserted = false; 405 if (criticalAlarmLow != msgData.end()) 406 { 407 asserted = std::get<bool>(criticalAlarmLow->second); 408 } 409 410 // checking both as in theory you could de-assert one threshold and 411 // assert the other at the same moment 412 if (!asserted && criticalAlarmHigh != msgData.end()) 413 { 414 asserted = std::get<bool>(criticalAlarmHigh->second); 415 } 416 owner->setFailed(asserted); 417 } 418 #ifdef UNC_FAILSAFE 419 else if (msgSensor == "xyz.openbmc_project.Sensor.Threshold.Warning") 420 { 421 auto warningAlarmHigh = msgData.find("WarningAlarmHigh"); 422 if (warningAlarmHigh == msgData.end()) 423 { 424 return 0; 425 } 426 427 bool asserted = false; 428 if (warningAlarmHigh != msgData.end()) 429 { 430 asserted = std::get<bool>(warningAlarmHigh->second); 431 } 432 owner->setFailed(asserted); 433 } 434 #endif 435 else if (msgSensor == "xyz.openbmc_project.State.Decorator.Availability") 436 { 437 auto available = msgData.find("Available"); 438 if (available == msgData.end()) 439 { 440 return 0; 441 } 442 bool asserted = std::get<bool>(available->second); 443 owner->setAvailable(asserted); 444 if (!asserted) 445 { 446 // A thermal controller will continue its PID calculation and not 447 // trigger a 'failsafe' when some inputs are unavailable. 448 // So, forced to clear the value here to prevent a historical 449 // value to participate in a latter PID calculation. 450 owner->updateValue(std::numeric_limits<double>::quiet_NaN(), true); 451 } 452 } 453 else if (msgSensor == 454 "xyz.openbmc_project.State.Decorator.OperationalStatus") 455 { 456 auto functional = msgData.find("Functional"); 457 if (functional == msgData.end()) 458 { 459 return 0; 460 } 461 bool asserted = std::get<bool>(functional->second); 462 owner->setFunctional(asserted); 463 } 464 465 return 0; 466 } 467 468 int dbusHandleSignal(sd_bus_message* msg, void* usrData, 469 [[maybe_unused]] sd_bus_error* err) 470 { 471 auto sdbpMsg = sdbusplus::message_t(msg); 472 DbusPassive* obj = static_cast<DbusPassive*>(usrData); 473 474 return handleSensorValue(sdbpMsg, obj); 475 } 476 477 } // namespace pid_control 478