1 #include "config.h" 2 3 #include "log_manager.hpp" 4 5 #include "elog_entry.hpp" 6 #include "elog_meta.hpp" 7 #include "elog_serialize.hpp" 8 #include "extensions.hpp" 9 #include "util.hpp" 10 11 #include <poll.h> 12 #include <sys/inotify.h> 13 #include <systemd/sd-bus.h> 14 #include <systemd/sd-journal.h> 15 #include <unistd.h> 16 17 #include <cassert> 18 #include <chrono> 19 #include <cstdio> 20 #include <cstring> 21 #include <fstream> 22 #include <functional> 23 #include <future> 24 #include <iostream> 25 #include <map> 26 #include <phosphor-logging/log.hpp> 27 #include <sdbusplus/vtable.hpp> 28 #include <set> 29 #include <string> 30 #include <vector> 31 #include <xyz/openbmc_project/State/Host/server.hpp> 32 33 using namespace phosphor::logging; 34 using namespace std::chrono; 35 using sdbusplus::exception::SdBusError; 36 extern const std::map<metadata::Metadata, 37 std::function<metadata::associations::Type>> 38 meta; 39 40 namespace phosphor 41 { 42 namespace logging 43 { 44 namespace internal 45 { 46 inline auto getLevel(const std::string& errMsg) 47 { 48 auto reqLevel = Entry::Level::Error; // Default to Error 49 50 auto levelmap = g_errLevelMap.find(errMsg); 51 if (levelmap != g_errLevelMap.end()) 52 { 53 reqLevel = static_cast<Entry::Level>(levelmap->second); 54 } 55 56 return reqLevel; 57 } 58 59 int Manager::getRealErrSize() 60 { 61 return realErrors.size(); 62 } 63 64 int Manager::getInfoErrSize() 65 { 66 return infoErrors.size(); 67 } 68 69 uint32_t Manager::commit(uint64_t transactionId, std::string errMsg) 70 { 71 auto level = getLevel(errMsg); 72 _commit(transactionId, std::move(errMsg), level); 73 return entryId; 74 } 75 76 uint32_t Manager::commitWithLvl(uint64_t transactionId, std::string errMsg, 77 uint32_t errLvl) 78 { 79 _commit(transactionId, std::move(errMsg), 80 static_cast<Entry::Level>(errLvl)); 81 return entryId; 82 } 83 84 void Manager::_commit(uint64_t transactionId [[maybe_unused]], 85 std::string&& errMsg, Entry::Level errLvl) 86 { 87 std::vector<std::string> additionalData{}; 88 89 // When running as a test-case, the system may have a LOT of journal 90 // data and we may not have permissions to do some of the journal sync 91 // operations. Just skip over them. 92 if (!IS_UNIT_TEST) 93 { 94 constexpr const auto transactionIdVar = "TRANSACTION_ID"; 95 // Length of 'TRANSACTION_ID' string. 96 constexpr const auto transactionIdVarSize = 97 std::strlen(transactionIdVar); 98 // Length of 'TRANSACTION_ID=' string. 99 constexpr const auto transactionIdVarOffset = transactionIdVarSize + 1; 100 101 // Flush all the pending log messages into the journal 102 journalSync(); 103 104 sd_journal* j = nullptr; 105 int rc = sd_journal_open(&j, SD_JOURNAL_LOCAL_ONLY); 106 if (rc < 0) 107 { 108 logging::log<logging::level::ERR>( 109 "Failed to open journal", 110 logging::entry("DESCRIPTION=%s", strerror(-rc))); 111 return; 112 } 113 114 std::string transactionIdStr = std::to_string(transactionId); 115 std::set<std::string> metalist; 116 auto metamap = g_errMetaMap.find(errMsg); 117 if (metamap != g_errMetaMap.end()) 118 { 119 metalist.insert(metamap->second.begin(), metamap->second.end()); 120 } 121 122 // Add _PID field information in AdditionalData. 123 metalist.insert("_PID"); 124 125 // Read the journal from the end to get the most recent entry first. 126 // The result from the sd_journal_get_data() is of the form 127 // VARIABLE=value. 128 SD_JOURNAL_FOREACH_BACKWARDS(j) 129 { 130 const char* data = nullptr; 131 size_t length = 0; 132 133 // Look for the transaction id metadata variable 134 rc = sd_journal_get_data(j, transactionIdVar, (const void**)&data, 135 &length); 136 if (rc < 0) 137 { 138 // This journal entry does not have the TRANSACTION_ID 139 // metadata variable. 140 continue; 141 } 142 143 // journald does not guarantee that sd_journal_get_data() returns 144 // NULL terminated strings, so need to specify the size to use to 145 // compare, use the returned length instead of anything that relies 146 // on NULL terminators like strlen(). The data variable is in the 147 // form of 'TRANSACTION_ID=1234'. Remove the TRANSACTION_ID 148 // characters plus the (=) sign to do the comparison. 'data + 149 // transactionIdVarOffset' will be in the form of '1234'. 'length - 150 // transactionIdVarOffset' will be the length of '1234'. 151 if ((length <= (transactionIdVarOffset)) || 152 (transactionIdStr.compare( 153 0, transactionIdStr.size(), data + transactionIdVarOffset, 154 length - transactionIdVarOffset) != 0)) 155 { 156 // The value of the TRANSACTION_ID metadata is not the requested 157 // transaction id number. 158 continue; 159 } 160 161 // Search for all metadata variables in the current journal entry. 162 for (auto i = metalist.cbegin(); i != metalist.cend();) 163 { 164 rc = sd_journal_get_data(j, (*i).c_str(), (const void**)&data, 165 &length); 166 if (rc < 0) 167 { 168 // Metadata variable not found, check next metadata 169 // variable. 170 i++; 171 continue; 172 } 173 174 // Metadata variable found, save it and remove it from the set. 175 additionalData.emplace_back(data, length); 176 i = metalist.erase(i); 177 } 178 if (metalist.empty()) 179 { 180 // All metadata variables found, break out of journal loop. 181 break; 182 } 183 } 184 if (!metalist.empty()) 185 { 186 // Not all the metadata variables were found in the journal. 187 for (auto& metaVarStr : metalist) 188 { 189 logging::log<logging::level::INFO>( 190 "Failed to find metadata", 191 logging::entry("META_FIELD=%s", metaVarStr.c_str())); 192 } 193 } 194 195 sd_journal_close(j); 196 } 197 createEntry(errMsg, errLvl, additionalData); 198 } 199 200 void Manager::createEntry(std::string errMsg, Entry::Level errLvl, 201 std::vector<std::string> additionalData, 202 const FFDCEntries& ffdc) 203 { 204 if (!Extensions::disableDefaultLogCaps()) 205 { 206 if (errLvl < Entry::sevLowerLimit) 207 { 208 if (realErrors.size() >= ERROR_CAP) 209 { 210 erase(realErrors.front()); 211 } 212 } 213 else 214 { 215 if (infoErrors.size() >= ERROR_INFO_CAP) 216 { 217 erase(infoErrors.front()); 218 } 219 } 220 } 221 222 entryId++; 223 if (errLvl >= Entry::sevLowerLimit) 224 { 225 infoErrors.push_back(entryId); 226 } 227 else 228 { 229 realErrors.push_back(entryId); 230 } 231 auto ms = std::chrono::duration_cast<std::chrono::milliseconds>( 232 std::chrono::system_clock::now().time_since_epoch()) 233 .count(); 234 auto objPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId); 235 236 AssociationList objects{}; 237 processMetadata(errMsg, additionalData, objects); 238 239 auto e = std::make_unique<Entry>(busLog, objPath, entryId, 240 ms, // Milliseconds since 1970 241 errLvl, std::move(errMsg), 242 std::move(additionalData), 243 std::move(objects), fwVersion, *this); 244 auto path = serialize(*e); 245 e->path(path); 246 247 if (isQuiesceOnErrorEnabled() && isCalloutPresent(*e)) 248 { 249 quiesceOnError(entryId); 250 } 251 252 // Add entry before calling the extensions so that they have access to it 253 entries.insert(std::make_pair(entryId, std::move(e))); 254 255 doExtensionLogCreate(*entries.find(entryId)->second, ffdc); 256 257 // Note: No need to close the file descriptors in the FFDC. 258 } 259 260 bool Manager::isQuiesceOnErrorEnabled() 261 { 262 // When running under tests, the Logging.Settings service will not be 263 // present. Assume false. 264 if (IS_UNIT_TEST) 265 { 266 return false; 267 } 268 269 std::variant<bool> property; 270 271 auto method = this->busLog.new_method_call( 272 "xyz.openbmc_project.Settings", "/xyz/openbmc_project/logging/settings", 273 "org.freedesktop.DBus.Properties", "Get"); 274 275 method.append("xyz.openbmc_project.Logging.Settings", "QuiesceOnHwError"); 276 277 try 278 { 279 auto reply = this->busLog.call(method); 280 reply.read(property); 281 } 282 catch (const SdBusError& e) 283 { 284 log<level::ERR>("Error reading QuiesceOnHwError property", 285 entry("ERROR=%s", e.what())); 286 throw; 287 } 288 289 return std::get<bool>(property); 290 } 291 292 bool Manager::isCalloutPresent(const Entry& entry) 293 { 294 for (const auto& c : entry.additionalData()) 295 { 296 if (c.find("CALLOUT_") != std::string::npos) 297 { 298 return true; 299 } 300 } 301 302 return false; 303 } 304 305 void Manager::findAndRemoveResolvedBlocks() 306 { 307 for (auto& entry : entries) 308 { 309 if (entry.second->resolved()) 310 { 311 checkAndRemoveBlockingError(entry.first); 312 } 313 } 314 } 315 316 void Manager::onEntryResolve(sdbusplus::message::message& msg) 317 { 318 using Interface = std::string; 319 using Property = std::string; 320 using Value = std::string; 321 using Properties = std::map<Property, std::variant<Value>>; 322 323 Interface interface; 324 Properties properties; 325 326 msg.read(interface, properties); 327 328 for (const auto& p : properties) 329 { 330 if (p.first == "Resolved") 331 { 332 findAndRemoveResolvedBlocks(); 333 return; 334 } 335 } 336 } 337 338 void Manager::checkAndQuiesceHost() 339 { 340 // First check host state 341 std::variant<std::string> property; 342 343 auto method = this->busLog.new_method_call( 344 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0", 345 "org.freedesktop.DBus.Properties", "Get"); 346 347 method.append("xyz.openbmc_project.State.Host", "CurrentHostState"); 348 349 try 350 { 351 auto reply = this->busLog.call(method); 352 reply.read(property); 353 } 354 catch (const SdBusError& e) 355 { 356 // Quiescing the host is a "best effort" type function. If unable to 357 // read the host state or it comes back empty, just return. 358 // The boot block object will still be created and the associations to 359 // find the log will be present. Don't want a dependency with 360 // phosphor-state-manager service 361 log<level::INFO>("Error reading QuiesceOnHwError property", 362 entry("ERROR=%s", e.what())); 363 return; 364 } 365 366 std::string hostState = std::get<std::string>(property); 367 368 // If host state is empty, do nothing 369 if (hostState.empty()) 370 { 371 return; 372 } 373 374 using Host = sdbusplus::xyz::openbmc_project::State::server::Host; 375 auto state = Host::convertHostStateFromString(hostState); 376 if (state != Host::HostState::Running) 377 { 378 return; 379 } 380 381 auto quiesce = this->busLog.new_method_call( 382 "org.freedesktop.systemd1", "/org/freedesktop/systemd1", 383 "org.freedesktop.systemd1.Manager", "StartUnit"); 384 385 quiesce.append("obmc-host-quiesce@0.target"); 386 quiesce.append("replace"); 387 388 this->busLog.call_noreply(quiesce); 389 } 390 391 void Manager::quiesceOnError(const uint32_t entryId) 392 { 393 // Verify we don't already have this entry blocking 394 auto it = find_if( 395 this->blockingErrors.begin(), this->blockingErrors.end(), 396 [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; }); 397 if (it != this->blockingErrors.end()) 398 { 399 // Already recorded so just return 400 logging::log<logging::level::DEBUG>( 401 "QuiesceOnError set and callout present but entry already logged"); 402 return; 403 } 404 405 logging::log<logging::level::INFO>( 406 "QuiesceOnError set and callout present"); 407 408 auto blockPath = 409 std::string(OBJ_LOGGING) + "/block" + std::to_string(entryId); 410 auto blockObj = std::make_unique<Block>(this->busLog, blockPath, entryId); 411 this->blockingErrors.push_back(std::move(blockObj)); 412 413 // Register call back if log is resolved 414 using namespace sdbusplus::bus::match::rules; 415 auto entryPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId); 416 auto callback = std::make_unique<sdbusplus::bus::match::match>( 417 this->busLog, 418 propertiesChanged(entryPath, "xyz.openbmc_project.Logging.Entry"), 419 std::bind(std::mem_fn(&Manager::onEntryResolve), this, 420 std::placeholders::_1)); 421 422 propChangedEntryCallback.insert( 423 std::make_pair(entryId, std::move(callback))); 424 425 checkAndQuiesceHost(); 426 } 427 428 void Manager::doExtensionLogCreate(const Entry& entry, const FFDCEntries& ffdc) 429 { 430 // Make the association <endpointpath>/<endpointtype> paths 431 std::vector<std::string> assocs; 432 for (const auto& [forwardType, reverseType, endpoint] : 433 entry.associations()) 434 { 435 std::string e{endpoint}; 436 e += '/' + reverseType; 437 assocs.push_back(e); 438 } 439 440 for (auto& create : Extensions::getCreateFunctions()) 441 { 442 try 443 { 444 create(entry.message(), entry.id(), entry.timestamp(), 445 entry.severity(), entry.additionalData(), assocs, ffdc); 446 } 447 catch (std::exception& e) 448 { 449 log<level::ERR>("An extension's create function threw an exception", 450 phosphor::logging::entry("ERROR=%s", e.what())); 451 } 452 } 453 } 454 455 void Manager::processMetadata(const std::string& /*errorName*/, 456 const std::vector<std::string>& additionalData, 457 AssociationList& objects) const 458 { 459 // additionalData is a list of "metadata=value" 460 constexpr auto separator = '='; 461 for (const auto& entryItem : additionalData) 462 { 463 auto found = entryItem.find(separator); 464 if (std::string::npos != found) 465 { 466 auto metadata = entryItem.substr(0, found); 467 auto iter = meta.find(metadata); 468 if (meta.end() != iter) 469 { 470 (iter->second)(metadata, additionalData, objects); 471 } 472 } 473 } 474 } 475 476 void Manager::checkAndRemoveBlockingError(uint32_t entryId) 477 { 478 // First look for blocking object and remove 479 auto it = find_if( 480 blockingErrors.begin(), blockingErrors.end(), 481 [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; }); 482 if (it != blockingErrors.end()) 483 { 484 blockingErrors.erase(it); 485 } 486 487 // Now remove the callback looking for the error to be resolved 488 auto resolveFind = propChangedEntryCallback.find(entryId); 489 if (resolveFind != propChangedEntryCallback.end()) 490 { 491 propChangedEntryCallback.erase(resolveFind); 492 } 493 494 return; 495 } 496 497 void Manager::erase(uint32_t entryId) 498 { 499 auto entryFound = entries.find(entryId); 500 if (entries.end() != entryFound) 501 { 502 for (auto& func : Extensions::getDeleteProhibitedFunctions()) 503 { 504 try 505 { 506 bool prohibited = false; 507 func(entryId, prohibited); 508 if (prohibited) 509 { 510 // Future work remains to throw an error here. 511 return; 512 } 513 } 514 catch (std::exception& e) 515 { 516 log<level::ERR>( 517 "An extension's deleteProhibited function threw " 518 "an exception", 519 entry("ERROR=%s", e.what())); 520 } 521 } 522 523 // Delete the persistent representation of this error. 524 fs::path errorPath(ERRLOG_PERSIST_PATH); 525 errorPath /= std::to_string(entryId); 526 fs::remove(errorPath); 527 528 auto removeId = [](std::list<uint32_t>& ids, uint32_t id) { 529 auto it = std::find(ids.begin(), ids.end(), id); 530 if (it != ids.end()) 531 { 532 ids.erase(it); 533 } 534 }; 535 if (entryFound->second->severity() >= Entry::sevLowerLimit) 536 { 537 removeId(infoErrors, entryId); 538 } 539 else 540 { 541 removeId(realErrors, entryId); 542 } 543 entries.erase(entryFound); 544 545 checkAndRemoveBlockingError(entryId); 546 547 for (auto& remove : Extensions::getDeleteFunctions()) 548 { 549 try 550 { 551 remove(entryId); 552 } 553 catch (std::exception& e) 554 { 555 log<level::ERR>("An extension's delete function threw an " 556 "exception", 557 entry("ERROR=%s", e.what())); 558 } 559 } 560 } 561 else 562 { 563 logging::log<level::ERR>("Invalid entry ID to delete", 564 logging::entry("ID=%d", entryId)); 565 } 566 } 567 568 void Manager::restore() 569 { 570 auto sanity = [](const auto& id, const auto& restoredId) { 571 return id == restoredId; 572 }; 573 std::vector<uint32_t> errorIds; 574 575 fs::path dir(ERRLOG_PERSIST_PATH); 576 if (!fs::exists(dir) || fs::is_empty(dir)) 577 { 578 return; 579 } 580 581 for (auto& file : fs::directory_iterator(dir)) 582 { 583 auto id = file.path().filename().c_str(); 584 auto idNum = std::stol(id); 585 auto e = std::make_unique<Entry>( 586 busLog, std::string(OBJ_ENTRY) + '/' + id, idNum, *this); 587 if (deserialize(file.path(), *e)) 588 { 589 // validate the restored error entry id 590 if (sanity(static_cast<uint32_t>(idNum), e->id())) 591 { 592 e->path(file.path()); 593 e->emit_object_added(); 594 if (e->severity() >= Entry::sevLowerLimit) 595 { 596 infoErrors.push_back(idNum); 597 } 598 else 599 { 600 realErrors.push_back(idNum); 601 } 602 603 entries.insert(std::make_pair(idNum, std::move(e))); 604 errorIds.push_back(idNum); 605 } 606 else 607 { 608 logging::log<logging::level::ERR>( 609 "Failed in sanity check while restoring error entry. " 610 "Ignoring error entry", 611 logging::entry("ID_NUM=%d", idNum), 612 logging::entry("ENTRY_ID=%d", e->id())); 613 } 614 } 615 } 616 617 if (!errorIds.empty()) 618 { 619 entryId = *(std::max_element(errorIds.begin(), errorIds.end())); 620 } 621 } 622 623 void Manager::journalSync() 624 { 625 bool syncRequested = false; 626 auto fd = -1; 627 auto rc = -1; 628 auto wd = -1; 629 auto bus = sdbusplus::bus::new_default(); 630 631 auto start = 632 duration_cast<microseconds>(steady_clock::now().time_since_epoch()) 633 .count(); 634 635 // Each time an error log is committed, a request to sync the journal 636 // must occur and block that error log commit until it completes. A 5sec 637 // block is done to allow sufficient time for the journal to be synced. 638 // 639 // Number of loop iterations = 3 for the following reasons: 640 // Iteration #1: Requests a journal sync by killing the journald service. 641 // Iteration #2: Setup an inotify watch to monitor the synced file that 642 // journald updates with the timestamp the last time the 643 // journal was flushed. 644 // Iteration #3: Poll to wait until inotify reports an event which blocks 645 // the error log from being commited until the sync completes. 646 constexpr auto maxRetry = 3; 647 for (int i = 0; i < maxRetry; i++) 648 { 649 // Read timestamp from synced file 650 constexpr auto syncedPath = "/run/systemd/journal/synced"; 651 std::ifstream syncedFile(syncedPath); 652 if (syncedFile.fail()) 653 { 654 // If the synced file doesn't exist, a sync request will create it. 655 if (errno != ENOENT) 656 { 657 log<level::ERR>("Failed to open journal synced file", 658 entry("FILENAME=%s", syncedPath), 659 entry("ERRNO=%d", errno)); 660 return; 661 } 662 } 663 else 664 { 665 // Only read the synced file if it exists. 666 // See if a sync happened by now 667 std::string timestampStr; 668 std::getline(syncedFile, timestampStr); 669 auto timestamp = std::stoll(timestampStr); 670 if (timestamp >= start) 671 { 672 break; 673 } 674 } 675 676 // Let's ask for a sync, but only once 677 if (!syncRequested) 678 { 679 syncRequested = true; 680 681 constexpr auto JOURNAL_UNIT = "systemd-journald.service"; 682 auto signal = SIGRTMIN + 1; 683 684 auto method = bus.new_method_call(SYSTEMD_BUSNAME, SYSTEMD_PATH, 685 SYSTEMD_INTERFACE, "KillUnit"); 686 method.append(JOURNAL_UNIT, "main", signal); 687 bus.call(method); 688 if (method.is_method_error()) 689 { 690 log<level::ERR>("Failed to kill journal service"); 691 break; 692 } 693 694 continue; 695 } 696 697 // Let's install the inotify watch, if we didn't do that yet. This watch 698 // monitors the syncedFile for when journald updates it with a newer 699 // timestamp. This means the journal has been flushed. 700 if (fd < 0) 701 { 702 fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC); 703 if (fd < 0) 704 { 705 log<level::ERR>("Failed to create inotify watch", 706 entry("ERRNO=%d", errno)); 707 return; 708 } 709 710 constexpr auto JOURNAL_RUN_PATH = "/run/systemd/journal"; 711 wd = inotify_add_watch(fd, JOURNAL_RUN_PATH, 712 IN_MOVED_TO | IN_DONT_FOLLOW | IN_ONLYDIR); 713 if (wd < 0) 714 { 715 log<level::ERR>("Failed to watch journal directory", 716 entry("PATH=%s", JOURNAL_RUN_PATH), 717 entry("ERRNO=%d", errno)); 718 close(fd); 719 return; 720 } 721 continue; 722 } 723 724 // Let's wait until inotify reports an event 725 struct pollfd fds = { 726 fd, 727 POLLIN, 728 0, 729 }; 730 constexpr auto pollTimeout = 5; // 5 seconds 731 rc = poll(&fds, 1, pollTimeout * 1000); 732 if (rc < 0) 733 { 734 log<level::ERR>("Failed to add event", entry("ERRNO=%d", errno), 735 entry("ERR=%s", strerror(-rc))); 736 inotify_rm_watch(fd, wd); 737 close(fd); 738 return; 739 } 740 else if (rc == 0) 741 { 742 log<level::INFO>("Poll timeout, no new journal synced data", 743 entry("TIMEOUT=%d", pollTimeout)); 744 break; 745 } 746 747 // Read from the specified file descriptor until there is no new data, 748 // throwing away everything read since the timestamp will be read at the 749 // beginning of the loop. 750 constexpr auto maxBytes = 64; 751 uint8_t buffer[maxBytes]; 752 while (read(fd, buffer, maxBytes) > 0) 753 ; 754 } 755 756 if (fd != -1) 757 { 758 if (wd != -1) 759 { 760 inotify_rm_watch(fd, wd); 761 } 762 close(fd); 763 } 764 765 return; 766 } 767 768 std::string Manager::readFWVersion() 769 { 770 auto version = util::getOSReleaseValue("VERSION_ID"); 771 772 if (!version) 773 { 774 log<level::ERR>("Unable to read BMC firmware version"); 775 } 776 777 return version.value_or(""); 778 } 779 780 void Manager::create(const std::string& message, Entry::Level severity, 781 const std::map<std::string, std::string>& additionalData) 782 { 783 // Convert the map into a vector of "key=value" strings 784 std::vector<std::string> ad; 785 metadata::associations::combine(additionalData, ad); 786 787 createEntry(message, severity, ad); 788 } 789 790 void Manager::createWithFFDC( 791 const std::string& message, Entry::Level severity, 792 const std::map<std::string, std::string>& additionalData, 793 const FFDCEntries& ffdc) 794 { 795 // Convert the map into a vector of "key=value" strings 796 std::vector<std::string> ad; 797 metadata::associations::combine(additionalData, ad); 798 799 createEntry(message, severity, ad, ffdc); 800 } 801 802 } // namespace internal 803 } // namespace logging 804 } // namespace phosphor 805