1 #include "config.h" 2 3 #include "log_manager.hpp" 4 5 #include "elog_entry.hpp" 6 #include "elog_meta.hpp" 7 #include "elog_serialize.hpp" 8 #include "extensions.hpp" 9 #include "util.hpp" 10 11 #include <poll.h> 12 #include <sys/inotify.h> 13 #include <systemd/sd-bus.h> 14 #include <systemd/sd-journal.h> 15 #include <unistd.h> 16 17 #include <cassert> 18 #include <chrono> 19 #include <cstdio> 20 #include <cstring> 21 #include <fstream> 22 #include <functional> 23 #include <future> 24 #include <iostream> 25 #include <map> 26 #include <phosphor-logging/log.hpp> 27 #include <sdbusplus/vtable.hpp> 28 #include <set> 29 #include <string> 30 #include <vector> 31 #include <xyz/openbmc_project/State/Host/server.hpp> 32 33 using namespace phosphor::logging; 34 using namespace std::chrono; 35 using sdbusplus::exception::SdBusError; 36 extern const std::map<metadata::Metadata, 37 std::function<metadata::associations::Type>> 38 meta; 39 40 namespace phosphor 41 { 42 namespace logging 43 { 44 namespace internal 45 { 46 inline auto getLevel(const std::string& errMsg) 47 { 48 auto reqLevel = Entry::Level::Error; // Default to Error 49 50 auto levelmap = g_errLevelMap.find(errMsg); 51 if (levelmap != g_errLevelMap.end()) 52 { 53 reqLevel = static_cast<Entry::Level>(levelmap->second); 54 } 55 56 return reqLevel; 57 } 58 59 int Manager::getRealErrSize() 60 { 61 return realErrors.size(); 62 } 63 64 int Manager::getInfoErrSize() 65 { 66 return infoErrors.size(); 67 } 68 69 uint32_t Manager::commit(uint64_t transactionId, std::string errMsg) 70 { 71 auto level = getLevel(errMsg); 72 _commit(transactionId, std::move(errMsg), level); 73 return entryId; 74 } 75 76 uint32_t Manager::commitWithLvl(uint64_t transactionId, std::string errMsg, 77 uint32_t errLvl) 78 { 79 _commit(transactionId, std::move(errMsg), 80 static_cast<Entry::Level>(errLvl)); 81 return entryId; 82 } 83 84 void Manager::_commit(uint64_t transactionId [[maybe_unused]], 85 std::string&& errMsg, Entry::Level errLvl) 86 { 87 std::vector<std::string> additionalData{}; 88 89 // When running as a test-case, the system may have a LOT of journal 90 // data and we may not have permissions to do some of the journal sync 91 // operations. Just skip over them. 92 #ifndef TESTCASE 93 94 constexpr const auto transactionIdVar = "TRANSACTION_ID"; 95 // Length of 'TRANSACTION_ID' string. 96 constexpr const auto transactionIdVarSize = std::strlen(transactionIdVar); 97 // Length of 'TRANSACTION_ID=' string. 98 constexpr const auto transactionIdVarOffset = transactionIdVarSize + 1; 99 100 // Flush all the pending log messages into the journal 101 journalSync(); 102 103 sd_journal* j = nullptr; 104 int rc = sd_journal_open(&j, SD_JOURNAL_LOCAL_ONLY); 105 if (rc < 0) 106 { 107 logging::log<logging::level::ERR>( 108 "Failed to open journal", 109 logging::entry("DESCRIPTION=%s", strerror(-rc))); 110 return; 111 } 112 113 std::string transactionIdStr = std::to_string(transactionId); 114 std::set<std::string> metalist; 115 auto metamap = g_errMetaMap.find(errMsg); 116 if (metamap != g_errMetaMap.end()) 117 { 118 metalist.insert(metamap->second.begin(), metamap->second.end()); 119 } 120 121 // Add _PID field information in AdditionalData. 122 metalist.insert("_PID"); 123 124 // Read the journal from the end to get the most recent entry first. 125 // The result from the sd_journal_get_data() is of the form VARIABLE=value. 126 SD_JOURNAL_FOREACH_BACKWARDS(j) 127 { 128 const char* data = nullptr; 129 size_t length = 0; 130 131 // Look for the transaction id metadata variable 132 rc = sd_journal_get_data(j, transactionIdVar, (const void**)&data, 133 &length); 134 if (rc < 0) 135 { 136 // This journal entry does not have the TRANSACTION_ID 137 // metadata variable. 138 continue; 139 } 140 141 // journald does not guarantee that sd_journal_get_data() returns NULL 142 // terminated strings, so need to specify the size to use to compare, 143 // use the returned length instead of anything that relies on NULL 144 // terminators like strlen(). 145 // The data variable is in the form of 'TRANSACTION_ID=1234'. Remove 146 // the TRANSACTION_ID characters plus the (=) sign to do the comparison. 147 // 'data + transactionIdVarOffset' will be in the form of '1234'. 148 // 'length - transactionIdVarOffset' will be the length of '1234'. 149 if ((length <= (transactionIdVarOffset)) || 150 (transactionIdStr.compare(0, transactionIdStr.size(), 151 data + transactionIdVarOffset, 152 length - transactionIdVarOffset) != 0)) 153 { 154 // The value of the TRANSACTION_ID metadata is not the requested 155 // transaction id number. 156 continue; 157 } 158 159 // Search for all metadata variables in the current journal entry. 160 for (auto i = metalist.cbegin(); i != metalist.cend();) 161 { 162 rc = sd_journal_get_data(j, (*i).c_str(), (const void**)&data, 163 &length); 164 if (rc < 0) 165 { 166 // Metadata variable not found, check next metadata variable. 167 i++; 168 continue; 169 } 170 171 // Metadata variable found, save it and remove it from the set. 172 additionalData.emplace_back(data, length); 173 i = metalist.erase(i); 174 } 175 if (metalist.empty()) 176 { 177 // All metadata variables found, break out of journal loop. 178 break; 179 } 180 } 181 if (!metalist.empty()) 182 { 183 // Not all the metadata variables were found in the journal. 184 for (auto& metaVarStr : metalist) 185 { 186 logging::log<logging::level::INFO>( 187 "Failed to find metadata", 188 logging::entry("META_FIELD=%s", metaVarStr.c_str())); 189 } 190 } 191 192 sd_journal_close(j); 193 194 #endif 195 createEntry(errMsg, errLvl, additionalData); 196 } 197 198 void Manager::createEntry(std::string errMsg, Entry::Level errLvl, 199 std::vector<std::string> additionalData, 200 const FFDCEntries& ffdc) 201 { 202 if (!Extensions::disableDefaultLogCaps()) 203 { 204 if (errLvl < Entry::sevLowerLimit) 205 { 206 if (realErrors.size() >= ERROR_CAP) 207 { 208 erase(realErrors.front()); 209 } 210 } 211 else 212 { 213 if (infoErrors.size() >= ERROR_INFO_CAP) 214 { 215 erase(infoErrors.front()); 216 } 217 } 218 } 219 220 entryId++; 221 if (errLvl >= Entry::sevLowerLimit) 222 { 223 infoErrors.push_back(entryId); 224 } 225 else 226 { 227 realErrors.push_back(entryId); 228 } 229 auto ms = std::chrono::duration_cast<std::chrono::milliseconds>( 230 std::chrono::system_clock::now().time_since_epoch()) 231 .count(); 232 auto objPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId); 233 234 AssociationList objects{}; 235 processMetadata(errMsg, additionalData, objects); 236 237 auto e = std::make_unique<Entry>(busLog, objPath, entryId, 238 ms, // Milliseconds since 1970 239 errLvl, std::move(errMsg), 240 std::move(additionalData), 241 std::move(objects), fwVersion, *this); 242 auto path = serialize(*e); 243 e->path(path); 244 245 if (isQuiesceOnErrorEnabled() && isCalloutPresent(*e)) 246 { 247 quiesceOnError(entryId); 248 } 249 250 // Add entry before calling the extensions so that they have access to it 251 entries.insert(std::make_pair(entryId, std::move(e))); 252 253 doExtensionLogCreate(*entries.find(entryId)->second, ffdc); 254 255 // Note: No need to close the file descriptors in the FFDC. 256 } 257 258 bool Manager::isQuiesceOnErrorEnabled() 259 { 260 // When running under tests, the Logging.Settings service will not be 261 // present. Assume false. 262 #ifdef TESTCASE 263 return false; 264 #endif 265 266 std::variant<bool> property; 267 268 auto method = this->busLog.new_method_call( 269 "xyz.openbmc_project.Settings", "/xyz/openbmc_project/logging/settings", 270 "org.freedesktop.DBus.Properties", "Get"); 271 272 method.append("xyz.openbmc_project.Logging.Settings", "QuiesceOnHwError"); 273 274 try 275 { 276 auto reply = this->busLog.call(method); 277 reply.read(property); 278 } 279 catch (const SdBusError& e) 280 { 281 log<level::ERR>("Error reading QuiesceOnHwError property", 282 entry("ERROR=%s", e.what())); 283 throw; 284 } 285 286 return std::get<bool>(property); 287 } 288 289 bool Manager::isCalloutPresent(const Entry& entry) 290 { 291 for (const auto& c : entry.additionalData()) 292 { 293 if (c.find("CALLOUT_") != std::string::npos) 294 { 295 return true; 296 } 297 } 298 299 return false; 300 } 301 302 void Manager::findAndRemoveResolvedBlocks() 303 { 304 for (auto& entry : entries) 305 { 306 if (entry.second->resolved()) 307 { 308 checkAndRemoveBlockingError(entry.first); 309 } 310 } 311 } 312 313 void Manager::onEntryResolve(sdbusplus::message::message& msg) 314 { 315 using Interface = std::string; 316 using Property = std::string; 317 using Value = std::string; 318 using Properties = std::map<Property, std::variant<Value>>; 319 320 Interface interface; 321 Properties properties; 322 323 msg.read(interface, properties); 324 325 for (const auto& p : properties) 326 { 327 if (p.first == "Resolved") 328 { 329 findAndRemoveResolvedBlocks(); 330 return; 331 } 332 } 333 } 334 335 void Manager::checkAndQuiesceHost() 336 { 337 // First check host state 338 std::variant<std::string> property; 339 340 auto method = this->busLog.new_method_call( 341 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0", 342 "org.freedesktop.DBus.Properties", "Get"); 343 344 method.append("xyz.openbmc_project.State.Host", "CurrentHostState"); 345 346 try 347 { 348 auto reply = this->busLog.call(method); 349 reply.read(property); 350 } 351 catch (const SdBusError& e) 352 { 353 // Quiescing the host is a "best effort" type function. If unable to 354 // read the host state or it comes back empty, just return. 355 // The boot block object will still be created and the associations to 356 // find the log will be present. Don't want a dependency with 357 // phosphor-state-manager service 358 log<level::INFO>("Error reading QuiesceOnHwError property", 359 entry("ERROR=%s", e.what())); 360 return; 361 } 362 363 std::string hostState = std::get<std::string>(property); 364 365 // If host state is empty, do nothing 366 if (hostState.empty()) 367 { 368 return; 369 } 370 371 using Host = sdbusplus::xyz::openbmc_project::State::server::Host; 372 auto state = Host::convertHostStateFromString(hostState); 373 if (state != Host::HostState::Running) 374 { 375 return; 376 } 377 378 auto quiesce = this->busLog.new_method_call( 379 "org.freedesktop.systemd1", "/org/freedesktop/systemd1", 380 "org.freedesktop.systemd1.Manager", "StartUnit"); 381 382 quiesce.append("obmc-host-quiesce@0.target"); 383 quiesce.append("replace"); 384 385 this->busLog.call_noreply(quiesce); 386 } 387 388 void Manager::quiesceOnError(const uint32_t entryId) 389 { 390 // Verify we don't already have this entry blocking 391 auto it = find_if( 392 this->blockingErrors.begin(), this->blockingErrors.end(), 393 [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; }); 394 if (it != this->blockingErrors.end()) 395 { 396 // Already recorded so just return 397 logging::log<logging::level::DEBUG>( 398 "QuiesceOnError set and callout present but entry already logged"); 399 return; 400 } 401 402 logging::log<logging::level::INFO>( 403 "QuiesceOnError set and callout present"); 404 405 auto blockPath = 406 std::string(OBJ_LOGGING) + "/block" + std::to_string(entryId); 407 auto blockObj = std::make_unique<Block>(this->busLog, blockPath, entryId); 408 this->blockingErrors.push_back(std::move(blockObj)); 409 410 // Register call back if log is resolved 411 using namespace sdbusplus::bus::match::rules; 412 auto entryPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId); 413 auto callback = std::make_unique<sdbusplus::bus::match::match>( 414 this->busLog, 415 propertiesChanged(entryPath, "xyz.openbmc_project.Logging.Entry"), 416 std::bind(std::mem_fn(&Manager::onEntryResolve), this, 417 std::placeholders::_1)); 418 419 propChangedEntryCallback.insert( 420 std::make_pair(entryId, std::move(callback))); 421 422 checkAndQuiesceHost(); 423 } 424 425 void Manager::doExtensionLogCreate(const Entry& entry, const FFDCEntries& ffdc) 426 { 427 // Make the association <endpointpath>/<endpointtype> paths 428 std::vector<std::string> assocs; 429 for (const auto& [forwardType, reverseType, endpoint] : 430 entry.associations()) 431 { 432 std::string e{endpoint}; 433 e += '/' + reverseType; 434 assocs.push_back(e); 435 } 436 437 for (auto& create : Extensions::getCreateFunctions()) 438 { 439 try 440 { 441 create(entry.message(), entry.id(), entry.timestamp(), 442 entry.severity(), entry.additionalData(), assocs, ffdc); 443 } 444 catch (std::exception& e) 445 { 446 log<level::ERR>("An extension's create function threw an exception", 447 phosphor::logging::entry("ERROR=%s", e.what())); 448 } 449 } 450 } 451 452 void Manager::processMetadata(const std::string& /*errorName*/, 453 const std::vector<std::string>& additionalData, 454 AssociationList& objects) const 455 { 456 // additionalData is a list of "metadata=value" 457 constexpr auto separator = '='; 458 for (const auto& entryItem : additionalData) 459 { 460 auto found = entryItem.find(separator); 461 if (std::string::npos != found) 462 { 463 auto metadata = entryItem.substr(0, found); 464 auto iter = meta.find(metadata); 465 if (meta.end() != iter) 466 { 467 (iter->second)(metadata, additionalData, objects); 468 } 469 } 470 } 471 } 472 473 void Manager::checkAndRemoveBlockingError(uint32_t entryId) 474 { 475 // First look for blocking object and remove 476 auto it = find_if( 477 blockingErrors.begin(), blockingErrors.end(), 478 [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; }); 479 if (it != blockingErrors.end()) 480 { 481 blockingErrors.erase(it); 482 } 483 484 // Now remove the callback looking for the error to be resolved 485 auto resolveFind = propChangedEntryCallback.find(entryId); 486 if (resolveFind != propChangedEntryCallback.end()) 487 { 488 propChangedEntryCallback.erase(resolveFind); 489 } 490 491 return; 492 } 493 494 void Manager::erase(uint32_t entryId) 495 { 496 auto entryFound = entries.find(entryId); 497 if (entries.end() != entryFound) 498 { 499 for (auto& func : Extensions::getDeleteProhibitedFunctions()) 500 { 501 try 502 { 503 bool prohibited = false; 504 func(entryId, prohibited); 505 if (prohibited) 506 { 507 // Future work remains to throw an error here. 508 return; 509 } 510 } 511 catch (std::exception& e) 512 { 513 log<level::ERR>( 514 "An extension's deleteProhibited function threw " 515 "an exception", 516 entry("ERROR=%s", e.what())); 517 } 518 } 519 520 // Delete the persistent representation of this error. 521 fs::path errorPath(ERRLOG_PERSIST_PATH); 522 errorPath /= std::to_string(entryId); 523 fs::remove(errorPath); 524 525 auto removeId = [](std::list<uint32_t>& ids, uint32_t id) { 526 auto it = std::find(ids.begin(), ids.end(), id); 527 if (it != ids.end()) 528 { 529 ids.erase(it); 530 } 531 }; 532 if (entryFound->second->severity() >= Entry::sevLowerLimit) 533 { 534 removeId(infoErrors, entryId); 535 } 536 else 537 { 538 removeId(realErrors, entryId); 539 } 540 entries.erase(entryFound); 541 542 checkAndRemoveBlockingError(entryId); 543 544 for (auto& remove : Extensions::getDeleteFunctions()) 545 { 546 try 547 { 548 remove(entryId); 549 } 550 catch (std::exception& e) 551 { 552 log<level::ERR>("An extension's delete function threw an " 553 "exception", 554 entry("ERROR=%s", e.what())); 555 } 556 } 557 } 558 else 559 { 560 logging::log<level::ERR>("Invalid entry ID to delete", 561 logging::entry("ID=%d", entryId)); 562 } 563 } 564 565 void Manager::restore() 566 { 567 auto sanity = [](const auto& id, const auto& restoredId) { 568 return id == restoredId; 569 }; 570 std::vector<uint32_t> errorIds; 571 572 fs::path dir(ERRLOG_PERSIST_PATH); 573 if (!fs::exists(dir) || fs::is_empty(dir)) 574 { 575 return; 576 } 577 578 for (auto& file : fs::directory_iterator(dir)) 579 { 580 auto id = file.path().filename().c_str(); 581 auto idNum = std::stol(id); 582 auto e = std::make_unique<Entry>( 583 busLog, std::string(OBJ_ENTRY) + '/' + id, idNum, *this); 584 if (deserialize(file.path(), *e)) 585 { 586 // validate the restored error entry id 587 if (sanity(static_cast<uint32_t>(idNum), e->id())) 588 { 589 e->path(file.path()); 590 e->emit_object_added(); 591 if (e->severity() >= Entry::sevLowerLimit) 592 { 593 infoErrors.push_back(idNum); 594 } 595 else 596 { 597 realErrors.push_back(idNum); 598 } 599 600 entries.insert(std::make_pair(idNum, std::move(e))); 601 errorIds.push_back(idNum); 602 } 603 else 604 { 605 logging::log<logging::level::ERR>( 606 "Failed in sanity check while restoring error entry. " 607 "Ignoring error entry", 608 logging::entry("ID_NUM=%d", idNum), 609 logging::entry("ENTRY_ID=%d", e->id())); 610 } 611 } 612 } 613 614 if (!errorIds.empty()) 615 { 616 entryId = *(std::max_element(errorIds.begin(), errorIds.end())); 617 } 618 } 619 620 void Manager::journalSync() 621 { 622 bool syncRequested = false; 623 auto fd = -1; 624 auto rc = -1; 625 auto wd = -1; 626 auto bus = sdbusplus::bus::new_default(); 627 628 auto start = 629 duration_cast<microseconds>(steady_clock::now().time_since_epoch()) 630 .count(); 631 632 // Each time an error log is committed, a request to sync the journal 633 // must occur and block that error log commit until it completes. A 5sec 634 // block is done to allow sufficient time for the journal to be synced. 635 // 636 // Number of loop iterations = 3 for the following reasons: 637 // Iteration #1: Requests a journal sync by killing the journald service. 638 // Iteration #2: Setup an inotify watch to monitor the synced file that 639 // journald updates with the timestamp the last time the 640 // journal was flushed. 641 // Iteration #3: Poll to wait until inotify reports an event which blocks 642 // the error log from being commited until the sync completes. 643 constexpr auto maxRetry = 3; 644 for (int i = 0; i < maxRetry; i++) 645 { 646 // Read timestamp from synced file 647 constexpr auto syncedPath = "/run/systemd/journal/synced"; 648 std::ifstream syncedFile(syncedPath); 649 if (syncedFile.fail()) 650 { 651 // If the synced file doesn't exist, a sync request will create it. 652 if (errno != ENOENT) 653 { 654 log<level::ERR>("Failed to open journal synced file", 655 entry("FILENAME=%s", syncedPath), 656 entry("ERRNO=%d", errno)); 657 return; 658 } 659 } 660 else 661 { 662 // Only read the synced file if it exists. 663 // See if a sync happened by now 664 std::string timestampStr; 665 std::getline(syncedFile, timestampStr); 666 auto timestamp = std::stoll(timestampStr); 667 if (timestamp >= start) 668 { 669 break; 670 } 671 } 672 673 // Let's ask for a sync, but only once 674 if (!syncRequested) 675 { 676 syncRequested = true; 677 678 constexpr auto JOURNAL_UNIT = "systemd-journald.service"; 679 auto signal = SIGRTMIN + 1; 680 681 auto method = bus.new_method_call(SYSTEMD_BUSNAME, SYSTEMD_PATH, 682 SYSTEMD_INTERFACE, "KillUnit"); 683 method.append(JOURNAL_UNIT, "main", signal); 684 bus.call(method); 685 if (method.is_method_error()) 686 { 687 log<level::ERR>("Failed to kill journal service"); 688 break; 689 } 690 691 continue; 692 } 693 694 // Let's install the inotify watch, if we didn't do that yet. This watch 695 // monitors the syncedFile for when journald updates it with a newer 696 // timestamp. This means the journal has been flushed. 697 if (fd < 0) 698 { 699 fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC); 700 if (fd < 0) 701 { 702 log<level::ERR>("Failed to create inotify watch", 703 entry("ERRNO=%d", errno)); 704 return; 705 } 706 707 constexpr auto JOURNAL_RUN_PATH = "/run/systemd/journal"; 708 wd = inotify_add_watch(fd, JOURNAL_RUN_PATH, 709 IN_MOVED_TO | IN_DONT_FOLLOW | IN_ONLYDIR); 710 if (wd < 0) 711 { 712 log<level::ERR>("Failed to watch journal directory", 713 entry("PATH=%s", JOURNAL_RUN_PATH), 714 entry("ERRNO=%d", errno)); 715 close(fd); 716 return; 717 } 718 continue; 719 } 720 721 // Let's wait until inotify reports an event 722 struct pollfd fds = { 723 .fd = fd, 724 .events = POLLIN, 725 .revents = 0, 726 }; 727 constexpr auto pollTimeout = 5; // 5 seconds 728 rc = poll(&fds, 1, pollTimeout * 1000); 729 if (rc < 0) 730 { 731 log<level::ERR>("Failed to add event", entry("ERRNO=%d", errno), 732 entry("ERR=%s", strerror(-rc))); 733 inotify_rm_watch(fd, wd); 734 close(fd); 735 return; 736 } 737 else if (rc == 0) 738 { 739 log<level::INFO>("Poll timeout, no new journal synced data", 740 entry("TIMEOUT=%d", pollTimeout)); 741 break; 742 } 743 744 // Read from the specified file descriptor until there is no new data, 745 // throwing away everything read since the timestamp will be read at the 746 // beginning of the loop. 747 constexpr auto maxBytes = 64; 748 uint8_t buffer[maxBytes]; 749 while (read(fd, buffer, maxBytes) > 0) 750 ; 751 } 752 753 if (fd != -1) 754 { 755 if (wd != -1) 756 { 757 inotify_rm_watch(fd, wd); 758 } 759 close(fd); 760 } 761 762 return; 763 } 764 765 std::string Manager::readFWVersion() 766 { 767 auto version = util::getOSReleaseValue("VERSION_ID"); 768 769 if (!version) 770 { 771 log<level::ERR>("Unable to read BMC firmware version"); 772 } 773 774 return version.value_or(""); 775 } 776 777 void Manager::create(const std::string& message, Entry::Level severity, 778 const std::map<std::string, std::string>& additionalData) 779 { 780 // Convert the map into a vector of "key=value" strings 781 std::vector<std::string> ad; 782 metadata::associations::combine(additionalData, ad); 783 784 createEntry(message, severity, ad); 785 } 786 787 void Manager::createWithFFDC( 788 const std::string& message, Entry::Level severity, 789 const std::map<std::string, std::string>& additionalData, 790 const FFDCEntries& ffdc) 791 { 792 // Convert the map into a vector of "key=value" strings 793 std::vector<std::string> ad; 794 metadata::associations::combine(additionalData, ad); 795 796 createEntry(message, severity, ad, ffdc); 797 } 798 799 } // namespace internal 800 } // namespace logging 801 } // namespace phosphor 802