1 #include "config.h" 2 3 #include "log_manager.hpp" 4 5 #include "elog_entry.hpp" 6 #include "elog_meta.hpp" 7 #include "elog_serialize.hpp" 8 #include "extensions.hpp" 9 #include "util.hpp" 10 11 #include <poll.h> 12 #include <sys/inotify.h> 13 #include <systemd/sd-bus.h> 14 #include <systemd/sd-journal.h> 15 #include <unistd.h> 16 17 #include <cassert> 18 #include <chrono> 19 #include <cstdio> 20 #include <cstring> 21 #include <fstream> 22 #include <functional> 23 #include <future> 24 #include <iostream> 25 #include <map> 26 #include <phosphor-logging/log.hpp> 27 #include <sdbusplus/vtable.hpp> 28 #include <set> 29 #include <string> 30 #include <vector> 31 #include <xyz/openbmc_project/State/Host/server.hpp> 32 33 using namespace phosphor::logging; 34 using namespace std::chrono; 35 using sdbusplus::exception::SdBusError; 36 extern const std::map<metadata::Metadata, 37 std::function<metadata::associations::Type>> 38 meta; 39 40 namespace phosphor 41 { 42 namespace logging 43 { 44 namespace internal 45 { 46 inline auto getLevel(const std::string& errMsg) 47 { 48 auto reqLevel = Entry::Level::Error; // Default to Error 49 50 auto levelmap = g_errLevelMap.find(errMsg); 51 if (levelmap != g_errLevelMap.end()) 52 { 53 reqLevel = static_cast<Entry::Level>(levelmap->second); 54 } 55 56 return reqLevel; 57 } 58 59 int Manager::getRealErrSize() 60 { 61 return realErrors.size(); 62 } 63 64 int Manager::getInfoErrSize() 65 { 66 return infoErrors.size(); 67 } 68 69 void Manager::commit(uint64_t transactionId, std::string errMsg) 70 { 71 auto level = getLevel(errMsg); 72 _commit(transactionId, std::move(errMsg), level); 73 } 74 75 void Manager::commitWithLvl(uint64_t transactionId, std::string errMsg, 76 uint32_t errLvl) 77 { 78 _commit(transactionId, std::move(errMsg), 79 static_cast<Entry::Level>(errLvl)); 80 } 81 82 void Manager::_commit(uint64_t transactionId, std::string&& errMsg, 83 Entry::Level errLvl) 84 { 85 constexpr const auto transactionIdVar = "TRANSACTION_ID"; 86 // Length of 'TRANSACTION_ID' string. 87 constexpr const auto transactionIdVarSize = std::strlen(transactionIdVar); 88 // Length of 'TRANSACTION_ID=' string. 89 constexpr const auto transactionIdVarOffset = transactionIdVarSize + 1; 90 91 // Flush all the pending log messages into the journal 92 journalSync(); 93 94 sd_journal* j = nullptr; 95 int rc = sd_journal_open(&j, SD_JOURNAL_LOCAL_ONLY); 96 if (rc < 0) 97 { 98 logging::log<logging::level::ERR>( 99 "Failed to open journal", 100 logging::entry("DESCRIPTION=%s", strerror(-rc))); 101 return; 102 } 103 104 std::string transactionIdStr = std::to_string(transactionId); 105 std::set<std::string> metalist; 106 auto metamap = g_errMetaMap.find(errMsg); 107 if (metamap != g_errMetaMap.end()) 108 { 109 metalist.insert(metamap->second.begin(), metamap->second.end()); 110 } 111 112 // Add _PID field information in AdditionalData. 113 metalist.insert("_PID"); 114 115 std::vector<std::string> additionalData; 116 117 // Read the journal from the end to get the most recent entry first. 118 // The result from the sd_journal_get_data() is of the form VARIABLE=value. 119 SD_JOURNAL_FOREACH_BACKWARDS(j) 120 { 121 const char* data = nullptr; 122 size_t length = 0; 123 124 // Look for the transaction id metadata variable 125 rc = sd_journal_get_data(j, transactionIdVar, (const void**)&data, 126 &length); 127 if (rc < 0) 128 { 129 // This journal entry does not have the TRANSACTION_ID 130 // metadata variable. 131 continue; 132 } 133 134 // journald does not guarantee that sd_journal_get_data() returns NULL 135 // terminated strings, so need to specify the size to use to compare, 136 // use the returned length instead of anything that relies on NULL 137 // terminators like strlen(). 138 // The data variable is in the form of 'TRANSACTION_ID=1234'. Remove 139 // the TRANSACTION_ID characters plus the (=) sign to do the comparison. 140 // 'data + transactionIdVarOffset' will be in the form of '1234'. 141 // 'length - transactionIdVarOffset' will be the length of '1234'. 142 if ((length <= (transactionIdVarOffset)) || 143 (transactionIdStr.compare(0, transactionIdStr.size(), 144 data + transactionIdVarOffset, 145 length - transactionIdVarOffset) != 0)) 146 { 147 // The value of the TRANSACTION_ID metadata is not the requested 148 // transaction id number. 149 continue; 150 } 151 152 // Search for all metadata variables in the current journal entry. 153 for (auto i = metalist.cbegin(); i != metalist.cend();) 154 { 155 rc = sd_journal_get_data(j, (*i).c_str(), (const void**)&data, 156 &length); 157 if (rc < 0) 158 { 159 // Metadata variable not found, check next metadata variable. 160 i++; 161 continue; 162 } 163 164 // Metadata variable found, save it and remove it from the set. 165 additionalData.emplace_back(data, length); 166 i = metalist.erase(i); 167 } 168 if (metalist.empty()) 169 { 170 // All metadata variables found, break out of journal loop. 171 break; 172 } 173 } 174 if (!metalist.empty()) 175 { 176 // Not all the metadata variables were found in the journal. 177 for (auto& metaVarStr : metalist) 178 { 179 logging::log<logging::level::INFO>( 180 "Failed to find metadata", 181 logging::entry("META_FIELD=%s", metaVarStr.c_str())); 182 } 183 } 184 185 sd_journal_close(j); 186 187 createEntry(errMsg, errLvl, additionalData); 188 } 189 190 void Manager::createEntry(std::string errMsg, Entry::Level errLvl, 191 std::vector<std::string> additionalData, 192 const FFDCEntries& ffdc) 193 { 194 if (!Extensions::disableDefaultLogCaps()) 195 { 196 if (errLvl < Entry::sevLowerLimit) 197 { 198 if (realErrors.size() >= ERROR_CAP) 199 { 200 erase(realErrors.front()); 201 } 202 } 203 else 204 { 205 if (infoErrors.size() >= ERROR_INFO_CAP) 206 { 207 erase(infoErrors.front()); 208 } 209 } 210 } 211 212 entryId++; 213 if (errLvl >= Entry::sevLowerLimit) 214 { 215 infoErrors.push_back(entryId); 216 } 217 else 218 { 219 realErrors.push_back(entryId); 220 } 221 auto ms = std::chrono::duration_cast<std::chrono::milliseconds>( 222 std::chrono::system_clock::now().time_since_epoch()) 223 .count(); 224 auto objPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId); 225 226 AssociationList objects{}; 227 processMetadata(errMsg, additionalData, objects); 228 229 auto e = std::make_unique<Entry>(busLog, objPath, entryId, 230 ms, // Milliseconds since 1970 231 errLvl, std::move(errMsg), 232 std::move(additionalData), 233 std::move(objects), fwVersion, *this); 234 auto path = serialize(*e); 235 e->path(path); 236 237 if (isQuiesceOnErrorEnabled() && isCalloutPresent(*e)) 238 { 239 quiesceOnError(entryId); 240 } 241 242 // Add entry before calling the extensions so that they have access to it 243 entries.insert(std::make_pair(entryId, std::move(e))); 244 245 doExtensionLogCreate(*entries.find(entryId)->second, ffdc); 246 247 // Note: No need to close the file descriptors in the FFDC. 248 } 249 250 bool Manager::isQuiesceOnErrorEnabled() 251 { 252 std::variant<bool> property; 253 254 auto method = this->busLog.new_method_call( 255 "xyz.openbmc_project.Settings", "/xyz/openbmc_project/logging/settings", 256 "org.freedesktop.DBus.Properties", "Get"); 257 258 method.append("xyz.openbmc_project.Logging.Settings", "QuiesceOnHwError"); 259 260 try 261 { 262 auto reply = this->busLog.call(method); 263 reply.read(property); 264 } 265 catch (const SdBusError& e) 266 { 267 log<level::ERR>("Error reading QuiesceOnHwError property", 268 entry("ERROR=%s", e.what())); 269 throw; 270 } 271 272 return std::get<bool>(property); 273 } 274 275 bool Manager::isCalloutPresent(const Entry& entry) 276 { 277 for (const auto& c : entry.additionalData()) 278 { 279 if (c.find("CALLOUT_") != std::string::npos) 280 { 281 return true; 282 } 283 } 284 285 return false; 286 } 287 288 void Manager::findAndRemoveResolvedBlocks() 289 { 290 for (auto& entry : entries) 291 { 292 if (entry.second->resolved()) 293 { 294 checkAndRemoveBlockingError(entry.first); 295 } 296 } 297 } 298 299 void Manager::onEntryResolve(sdbusplus::message::message& msg) 300 { 301 using Interface = std::string; 302 using Property = std::string; 303 using Value = std::string; 304 using Properties = std::map<Property, std::variant<Value>>; 305 306 Interface interface; 307 Properties properties; 308 309 msg.read(interface, properties); 310 311 for (const auto& p : properties) 312 { 313 if (p.first == "Resolved") 314 { 315 findAndRemoveResolvedBlocks(); 316 return; 317 } 318 } 319 } 320 321 void Manager::checkAndQuiesceHost() 322 { 323 // First check host state 324 std::variant<std::string> property; 325 326 auto method = this->busLog.new_method_call( 327 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0", 328 "org.freedesktop.DBus.Properties", "Get"); 329 330 method.append("xyz.openbmc_project.State.Host", "CurrentHostState"); 331 332 try 333 { 334 auto reply = this->busLog.call(method); 335 reply.read(property); 336 } 337 catch (const SdBusError& e) 338 { 339 // Quiescing the host is a "best effort" type function. If unable to 340 // read the host state or it comes back empty, just return. 341 // The boot block object will still be created and the associations to 342 // find the log will be present. Don't want a dependency with 343 // phosphor-state-manager service 344 log<level::INFO>("Error reading QuiesceOnHwError property", 345 entry("ERROR=%s", e.what())); 346 return; 347 } 348 349 std::string hostState = std::get<std::string>(property); 350 351 // If host state is empty, do nothing 352 if (hostState.empty()) 353 { 354 return; 355 } 356 357 using Host = sdbusplus::xyz::openbmc_project::State::server::Host; 358 auto state = Host::convertHostStateFromString(hostState); 359 if (state != Host::HostState::Running) 360 { 361 return; 362 } 363 364 auto quiesce = this->busLog.new_method_call( 365 "org.freedesktop.systemd1", "/org/freedesktop/systemd1", 366 "org.freedesktop.systemd1.Manager", "StartUnit"); 367 368 quiesce.append("obmc-host-quiesce@0.target"); 369 quiesce.append("replace"); 370 371 this->busLog.call_noreply(quiesce); 372 } 373 374 void Manager::quiesceOnError(const uint32_t entryId) 375 { 376 // Verify we don't already have this entry blocking 377 auto it = find_if( 378 this->blockingErrors.begin(), this->blockingErrors.end(), 379 [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; }); 380 if (it != this->blockingErrors.end()) 381 { 382 // Already recorded so just return 383 logging::log<logging::level::DEBUG>( 384 "QuiesceOnError set and callout present but entry already logged"); 385 return; 386 } 387 388 logging::log<logging::level::INFO>( 389 "QuiesceOnError set and callout present"); 390 391 auto blockPath = 392 std::string(OBJ_LOGGING) + "/block" + std::to_string(entryId); 393 auto blockObj = std::make_unique<Block>(this->busLog, blockPath, entryId); 394 this->blockingErrors.push_back(std::move(blockObj)); 395 396 // Register call back if log is resolved 397 using namespace sdbusplus::bus::match::rules; 398 auto entryPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId); 399 auto callback = std::make_unique<sdbusplus::bus::match::match>( 400 this->busLog, 401 propertiesChanged(entryPath, "xyz.openbmc_project.Logging.Entry"), 402 std::bind(std::mem_fn(&Manager::onEntryResolve), this, 403 std::placeholders::_1)); 404 405 propChangedEntryCallback.insert( 406 std::make_pair(entryId, std::move(callback))); 407 408 checkAndQuiesceHost(); 409 } 410 411 void Manager::doExtensionLogCreate(const Entry& entry, const FFDCEntries& ffdc) 412 { 413 // Make the association <endpointpath>/<endpointtype> paths 414 std::vector<std::string> assocs; 415 for (const auto& [forwardType, reverseType, endpoint] : 416 entry.associations()) 417 { 418 std::string e{endpoint}; 419 e += '/' + reverseType; 420 assocs.push_back(e); 421 } 422 423 for (auto& create : Extensions::getCreateFunctions()) 424 { 425 try 426 { 427 create(entry.message(), entry.id(), entry.timestamp(), 428 entry.severity(), entry.additionalData(), assocs, ffdc); 429 } 430 catch (std::exception& e) 431 { 432 log<level::ERR>("An extension's create function threw an exception", 433 phosphor::logging::entry("ERROR=%s", e.what())); 434 } 435 } 436 } 437 438 void Manager::processMetadata(const std::string& errorName, 439 const std::vector<std::string>& additionalData, 440 AssociationList& objects) const 441 { 442 // additionalData is a list of "metadata=value" 443 constexpr auto separator = '='; 444 for (const auto& entryItem : additionalData) 445 { 446 auto found = entryItem.find(separator); 447 if (std::string::npos != found) 448 { 449 auto metadata = entryItem.substr(0, found); 450 auto iter = meta.find(metadata); 451 if (meta.end() != iter) 452 { 453 (iter->second)(metadata, additionalData, objects); 454 } 455 } 456 } 457 } 458 459 void Manager::checkAndRemoveBlockingError(uint32_t entryId) 460 { 461 // First look for blocking object and remove 462 auto it = find_if( 463 blockingErrors.begin(), blockingErrors.end(), 464 [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; }); 465 if (it != blockingErrors.end()) 466 { 467 blockingErrors.erase(it); 468 } 469 470 // Now remove the callback looking for the error to be resolved 471 auto resolveFind = propChangedEntryCallback.find(entryId); 472 if (resolveFind != propChangedEntryCallback.end()) 473 { 474 propChangedEntryCallback.erase(resolveFind); 475 } 476 477 return; 478 } 479 480 void Manager::erase(uint32_t entryId) 481 { 482 auto entryFound = entries.find(entryId); 483 if (entries.end() != entryFound) 484 { 485 for (auto& func : Extensions::getDeleteProhibitedFunctions()) 486 { 487 try 488 { 489 bool prohibited = false; 490 func(entryId, prohibited); 491 if (prohibited) 492 { 493 // Future work remains to throw an error here. 494 return; 495 } 496 } 497 catch (std::exception& e) 498 { 499 log<level::ERR>( 500 "An extension's deleteProhibited function threw " 501 "an exception", 502 entry("ERROR=%s", e.what())); 503 } 504 } 505 506 // Delete the persistent representation of this error. 507 fs::path errorPath(ERRLOG_PERSIST_PATH); 508 errorPath /= std::to_string(entryId); 509 fs::remove(errorPath); 510 511 auto removeId = [](std::list<uint32_t>& ids, uint32_t id) { 512 auto it = std::find(ids.begin(), ids.end(), id); 513 if (it != ids.end()) 514 { 515 ids.erase(it); 516 } 517 }; 518 if (entryFound->second->severity() >= Entry::sevLowerLimit) 519 { 520 removeId(infoErrors, entryId); 521 } 522 else 523 { 524 removeId(realErrors, entryId); 525 } 526 entries.erase(entryFound); 527 528 checkAndRemoveBlockingError(entryId); 529 530 for (auto& remove : Extensions::getDeleteFunctions()) 531 { 532 try 533 { 534 remove(entryId); 535 } 536 catch (std::exception& e) 537 { 538 log<level::ERR>("An extension's delete function threw an " 539 "exception", 540 entry("ERROR=%s", e.what())); 541 } 542 } 543 } 544 else 545 { 546 logging::log<level::ERR>("Invalid entry ID to delete", 547 logging::entry("ID=%d", entryId)); 548 } 549 } 550 551 void Manager::restore() 552 { 553 auto sanity = [](const auto& id, const auto& restoredId) { 554 return id == restoredId; 555 }; 556 std::vector<uint32_t> errorIds; 557 558 fs::path dir(ERRLOG_PERSIST_PATH); 559 if (!fs::exists(dir) || fs::is_empty(dir)) 560 { 561 return; 562 } 563 564 for (auto& file : fs::directory_iterator(dir)) 565 { 566 auto id = file.path().filename().c_str(); 567 auto idNum = std::stol(id); 568 auto e = std::make_unique<Entry>( 569 busLog, std::string(OBJ_ENTRY) + '/' + id, idNum, *this); 570 if (deserialize(file.path(), *e)) 571 { 572 // validate the restored error entry id 573 if (sanity(static_cast<uint32_t>(idNum), e->id())) 574 { 575 e->path(file.path()); 576 e->emit_object_added(); 577 if (e->severity() >= Entry::sevLowerLimit) 578 { 579 infoErrors.push_back(idNum); 580 } 581 else 582 { 583 realErrors.push_back(idNum); 584 } 585 586 entries.insert(std::make_pair(idNum, std::move(e))); 587 errorIds.push_back(idNum); 588 } 589 else 590 { 591 logging::log<logging::level::ERR>( 592 "Failed in sanity check while restoring error entry. " 593 "Ignoring error entry", 594 logging::entry("ID_NUM=%d", idNum), 595 logging::entry("ENTRY_ID=%d", e->id())); 596 } 597 } 598 } 599 600 if (!errorIds.empty()) 601 { 602 entryId = *(std::max_element(errorIds.begin(), errorIds.end())); 603 } 604 } 605 606 void Manager::journalSync() 607 { 608 bool syncRequested = false; 609 auto fd = -1; 610 auto rc = -1; 611 auto wd = -1; 612 auto bus = sdbusplus::bus::new_default(); 613 614 auto start = 615 duration_cast<microseconds>(steady_clock::now().time_since_epoch()) 616 .count(); 617 618 // Each time an error log is committed, a request to sync the journal 619 // must occur and block that error log commit until it completes. A 5sec 620 // block is done to allow sufficient time for the journal to be synced. 621 // 622 // Number of loop iterations = 3 for the following reasons: 623 // Iteration #1: Requests a journal sync by killing the journald service. 624 // Iteration #2: Setup an inotify watch to monitor the synced file that 625 // journald updates with the timestamp the last time the 626 // journal was flushed. 627 // Iteration #3: Poll to wait until inotify reports an event which blocks 628 // the error log from being commited until the sync completes. 629 constexpr auto maxRetry = 3; 630 for (int i = 0; i < maxRetry; i++) 631 { 632 // Read timestamp from synced file 633 constexpr auto syncedPath = "/run/systemd/journal/synced"; 634 std::ifstream syncedFile(syncedPath); 635 if (syncedFile.fail()) 636 { 637 // If the synced file doesn't exist, a sync request will create it. 638 if (errno != ENOENT) 639 { 640 log<level::ERR>("Failed to open journal synced file", 641 entry("FILENAME=%s", syncedPath), 642 entry("ERRNO=%d", errno)); 643 return; 644 } 645 } 646 else 647 { 648 // Only read the synced file if it exists. 649 // See if a sync happened by now 650 std::string timestampStr; 651 std::getline(syncedFile, timestampStr); 652 auto timestamp = std::stoll(timestampStr); 653 if (timestamp >= start) 654 { 655 break; 656 } 657 } 658 659 // Let's ask for a sync, but only once 660 if (!syncRequested) 661 { 662 syncRequested = true; 663 664 constexpr auto JOURNAL_UNIT = "systemd-journald.service"; 665 auto signal = SIGRTMIN + 1; 666 667 auto method = bus.new_method_call(SYSTEMD_BUSNAME, SYSTEMD_PATH, 668 SYSTEMD_INTERFACE, "KillUnit"); 669 method.append(JOURNAL_UNIT, "main", signal); 670 bus.call(method); 671 if (method.is_method_error()) 672 { 673 log<level::ERR>("Failed to kill journal service"); 674 break; 675 } 676 continue; 677 } 678 679 // Let's install the inotify watch, if we didn't do that yet. This watch 680 // monitors the syncedFile for when journald updates it with a newer 681 // timestamp. This means the journal has been flushed. 682 if (fd < 0) 683 { 684 fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC); 685 if (fd < 0) 686 { 687 log<level::ERR>("Failed to create inotify watch", 688 entry("ERRNO=%d", errno)); 689 return; 690 } 691 692 constexpr auto JOURNAL_RUN_PATH = "/run/systemd/journal"; 693 wd = inotify_add_watch(fd, JOURNAL_RUN_PATH, 694 IN_MOVED_TO | IN_DONT_FOLLOW | IN_ONLYDIR); 695 if (wd < 0) 696 { 697 log<level::ERR>("Failed to watch journal directory", 698 entry("PATH=%s", JOURNAL_RUN_PATH), 699 entry("ERRNO=%d", errno)); 700 close(fd); 701 return; 702 } 703 continue; 704 } 705 706 // Let's wait until inotify reports an event 707 struct pollfd fds = { 708 .fd = fd, 709 .events = POLLIN, 710 }; 711 constexpr auto pollTimeout = 5; // 5 seconds 712 rc = poll(&fds, 1, pollTimeout * 1000); 713 if (rc < 0) 714 { 715 log<level::ERR>("Failed to add event", entry("ERRNO=%d", errno), 716 entry("ERR=%s", strerror(-rc))); 717 inotify_rm_watch(fd, wd); 718 close(fd); 719 return; 720 } 721 else if (rc == 0) 722 { 723 log<level::INFO>("Poll timeout, no new journal synced data", 724 entry("TIMEOUT=%d", pollTimeout)); 725 break; 726 } 727 728 // Read from the specified file descriptor until there is no new data, 729 // throwing away everything read since the timestamp will be read at the 730 // beginning of the loop. 731 constexpr auto maxBytes = 64; 732 uint8_t buffer[maxBytes]; 733 while (read(fd, buffer, maxBytes) > 0) 734 ; 735 } 736 737 if (fd != -1) 738 { 739 if (wd != -1) 740 { 741 inotify_rm_watch(fd, wd); 742 } 743 close(fd); 744 } 745 746 return; 747 } 748 749 std::string Manager::readFWVersion() 750 { 751 auto version = util::getOSReleaseValue("VERSION_ID"); 752 753 if (!version) 754 { 755 log<level::ERR>("Unable to read BMC firmware version"); 756 } 757 758 return version.value_or(""); 759 } 760 761 void Manager::create(const std::string& message, Entry::Level severity, 762 const std::map<std::string, std::string>& additionalData) 763 { 764 // Convert the map into a vector of "key=value" strings 765 std::vector<std::string> ad; 766 metadata::associations::combine(additionalData, ad); 767 768 createEntry(message, severity, ad); 769 } 770 771 void Manager::createWithFFDC( 772 const std::string& message, Entry::Level severity, 773 const std::map<std::string, std::string>& additionalData, 774 const FFDCEntries& ffdc) 775 { 776 // Convert the map into a vector of "key=value" strings 777 std::vector<std::string> ad; 778 metadata::associations::combine(additionalData, ad); 779 780 createEntry(message, severity, ad, ffdc); 781 } 782 783 } // namespace internal 784 } // namespace logging 785 } // namespace phosphor 786