1 #include "config.h" 2 3 #include "log_manager.hpp" 4 5 #include "elog_entry.hpp" 6 #include "elog_meta.hpp" 7 #include "elog_serialize.hpp" 8 #include "extensions.hpp" 9 #include "util.hpp" 10 11 #include <poll.h> 12 #include <sys/inotify.h> 13 #include <systemd/sd-bus.h> 14 #include <systemd/sd-journal.h> 15 #include <unistd.h> 16 17 #include <cassert> 18 #include <chrono> 19 #include <cstdio> 20 #include <cstring> 21 #include <fstream> 22 #include <functional> 23 #include <future> 24 #include <iostream> 25 #include <map> 26 #include <phosphor-logging/lg2.hpp> 27 #include <sdbusplus/vtable.hpp> 28 #include <set> 29 #include <string> 30 #include <string_view> 31 #include <vector> 32 #include <xyz/openbmc_project/State/Host/server.hpp> 33 34 using namespace std::chrono; 35 using sdbusplus::exception::SdBusError; 36 extern const std::map< 37 phosphor::logging::metadata::Metadata, 38 std::function<phosphor::logging::metadata::associations::Type>> 39 meta; 40 41 namespace phosphor 42 { 43 namespace logging 44 { 45 namespace internal 46 { 47 inline auto getLevel(const std::string& errMsg) 48 { 49 auto reqLevel = Entry::Level::Error; // Default to Error 50 51 auto levelmap = g_errLevelMap.find(errMsg); 52 if (levelmap != g_errLevelMap.end()) 53 { 54 reqLevel = static_cast<Entry::Level>(levelmap->second); 55 } 56 57 return reqLevel; 58 } 59 60 int Manager::getRealErrSize() 61 { 62 return realErrors.size(); 63 } 64 65 int Manager::getInfoErrSize() 66 { 67 return infoErrors.size(); 68 } 69 70 uint32_t Manager::commit(uint64_t transactionId, std::string errMsg) 71 { 72 auto level = getLevel(errMsg); 73 _commit(transactionId, std::move(errMsg), level); 74 return entryId; 75 } 76 77 uint32_t Manager::commitWithLvl(uint64_t transactionId, std::string errMsg, 78 uint32_t errLvl) 79 { 80 _commit(transactionId, std::move(errMsg), 81 static_cast<Entry::Level>(errLvl)); 82 return entryId; 83 } 84 85 void Manager::_commit(uint64_t transactionId [[maybe_unused]], 86 std::string&& errMsg, Entry::Level errLvl) 87 { 88 std::vector<std::string> additionalData{}; 89 90 // When running as a test-case, the system may have a LOT of journal 91 // data and we may not have permissions to do some of the journal sync 92 // operations. Just skip over them. 93 if (!IS_UNIT_TEST) 94 { 95 static constexpr auto transactionIdVar = 96 std::string_view{"TRANSACTION_ID"}; 97 // Length of 'TRANSACTION_ID' string. 98 static constexpr auto transactionIdVarSize = transactionIdVar.size(); 99 // Length of 'TRANSACTION_ID=' string. 100 static constexpr auto transactionIdVarOffset = transactionIdVarSize + 1; 101 102 // Flush all the pending log messages into the journal 103 journalSync(); 104 105 sd_journal* j = nullptr; 106 int rc = sd_journal_open(&j, SD_JOURNAL_LOCAL_ONLY); 107 if (rc < 0) 108 { 109 lg2::error("Failed to open journal: {ERROR}", "ERROR", 110 strerror(-rc)); 111 return; 112 } 113 114 std::string transactionIdStr = std::to_string(transactionId); 115 std::set<std::string> metalist; 116 auto metamap = g_errMetaMap.find(errMsg); 117 if (metamap != g_errMetaMap.end()) 118 { 119 metalist.insert(metamap->second.begin(), metamap->second.end()); 120 } 121 122 // Add _PID field information in AdditionalData. 123 metalist.insert("_PID"); 124 125 // Read the journal from the end to get the most recent entry first. 126 // The result from the sd_journal_get_data() is of the form 127 // VARIABLE=value. 128 SD_JOURNAL_FOREACH_BACKWARDS(j) 129 { 130 const char* data = nullptr; 131 size_t length = 0; 132 133 // Look for the transaction id metadata variable 134 rc = sd_journal_get_data(j, transactionIdVar.data(), 135 (const void**)&data, &length); 136 if (rc < 0) 137 { 138 // This journal entry does not have the TRANSACTION_ID 139 // metadata variable. 140 continue; 141 } 142 143 // journald does not guarantee that sd_journal_get_data() returns 144 // NULL terminated strings, so need to specify the size to use to 145 // compare, use the returned length instead of anything that relies 146 // on NULL terminators like strlen(). The data variable is in the 147 // form of 'TRANSACTION_ID=1234'. Remove the TRANSACTION_ID 148 // characters plus the (=) sign to do the comparison. 'data + 149 // transactionIdVarOffset' will be in the form of '1234'. 'length - 150 // transactionIdVarOffset' will be the length of '1234'. 151 if ((length <= (transactionIdVarOffset)) || 152 (transactionIdStr.compare( 153 0, transactionIdStr.size(), data + transactionIdVarOffset, 154 length - transactionIdVarOffset) != 0)) 155 { 156 // The value of the TRANSACTION_ID metadata is not the requested 157 // transaction id number. 158 continue; 159 } 160 161 // Search for all metadata variables in the current journal entry. 162 for (auto i = metalist.cbegin(); i != metalist.cend();) 163 { 164 rc = sd_journal_get_data(j, (*i).c_str(), (const void**)&data, 165 &length); 166 if (rc < 0) 167 { 168 // Metadata variable not found, check next metadata 169 // variable. 170 i++; 171 continue; 172 } 173 174 // Metadata variable found, save it and remove it from the set. 175 additionalData.emplace_back(data, length); 176 i = metalist.erase(i); 177 } 178 if (metalist.empty()) 179 { 180 // All metadata variables found, break out of journal loop. 181 break; 182 } 183 } 184 if (!metalist.empty()) 185 { 186 // Not all the metadata variables were found in the journal. 187 for (auto& metaVarStr : metalist) 188 { 189 lg2::info("Failed to find metadata: {META_FIELD}", "META_FIELD", 190 metaVarStr); 191 } 192 } 193 194 sd_journal_close(j); 195 } 196 createEntry(errMsg, errLvl, additionalData); 197 } 198 199 void Manager::createEntry(std::string errMsg, Entry::Level errLvl, 200 std::vector<std::string> additionalData, 201 const FFDCEntries& ffdc) 202 { 203 if (!Extensions::disableDefaultLogCaps()) 204 { 205 if (errLvl < Entry::sevLowerLimit) 206 { 207 if (realErrors.size() >= ERROR_CAP) 208 { 209 erase(realErrors.front()); 210 } 211 } 212 else 213 { 214 if (infoErrors.size() >= ERROR_INFO_CAP) 215 { 216 erase(infoErrors.front()); 217 } 218 } 219 } 220 221 entryId++; 222 if (errLvl >= Entry::sevLowerLimit) 223 { 224 infoErrors.push_back(entryId); 225 } 226 else 227 { 228 realErrors.push_back(entryId); 229 } 230 auto ms = std::chrono::duration_cast<std::chrono::milliseconds>( 231 std::chrono::system_clock::now().time_since_epoch()) 232 .count(); 233 auto objPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId); 234 235 AssociationList objects{}; 236 processMetadata(errMsg, additionalData, objects); 237 238 auto e = std::make_unique<Entry>(busLog, objPath, entryId, 239 ms, // Milliseconds since 1970 240 errLvl, std::move(errMsg), 241 std::move(additionalData), 242 std::move(objects), fwVersion, *this); 243 auto path = serialize(*e); 244 e->path(path); 245 246 if (isQuiesceOnErrorEnabled() && isCalloutPresent(*e)) 247 { 248 quiesceOnError(entryId); 249 } 250 251 // Add entry before calling the extensions so that they have access to it 252 entries.insert(std::make_pair(entryId, std::move(e))); 253 254 doExtensionLogCreate(*entries.find(entryId)->second, ffdc); 255 256 // Note: No need to close the file descriptors in the FFDC. 257 } 258 259 bool Manager::isQuiesceOnErrorEnabled() 260 { 261 // When running under tests, the Logging.Settings service will not be 262 // present. Assume false. 263 if (IS_UNIT_TEST) 264 { 265 return false; 266 } 267 268 std::variant<bool> property; 269 270 auto method = this->busLog.new_method_call( 271 "xyz.openbmc_project.Settings", "/xyz/openbmc_project/logging/settings", 272 "org.freedesktop.DBus.Properties", "Get"); 273 274 method.append("xyz.openbmc_project.Logging.Settings", "QuiesceOnHwError"); 275 276 try 277 { 278 auto reply = this->busLog.call(method); 279 reply.read(property); 280 } 281 catch (const SdBusError& e) 282 { 283 lg2::error("Error reading QuiesceOnHwError property: {ERROR}", "ERROR", 284 e); 285 throw; 286 } 287 288 return std::get<bool>(property); 289 } 290 291 bool Manager::isCalloutPresent(const Entry& entry) 292 { 293 for (const auto& c : entry.additionalData()) 294 { 295 if (c.find("CALLOUT_") != std::string::npos) 296 { 297 return true; 298 } 299 } 300 301 return false; 302 } 303 304 void Manager::findAndRemoveResolvedBlocks() 305 { 306 for (auto& entry : entries) 307 { 308 if (entry.second->resolved()) 309 { 310 checkAndRemoveBlockingError(entry.first); 311 } 312 } 313 } 314 315 void Manager::onEntryResolve(sdbusplus::message::message& msg) 316 { 317 using Interface = std::string; 318 using Property = std::string; 319 using Value = std::string; 320 using Properties = std::map<Property, std::variant<Value>>; 321 322 Interface interface; 323 Properties properties; 324 325 msg.read(interface, properties); 326 327 for (const auto& p : properties) 328 { 329 if (p.first == "Resolved") 330 { 331 findAndRemoveResolvedBlocks(); 332 return; 333 } 334 } 335 } 336 337 void Manager::checkAndQuiesceHost() 338 { 339 using Host = sdbusplus::xyz::openbmc_project::State::server::Host; 340 341 // First check host state 342 std::variant<Host::HostState> property; 343 344 auto method = this->busLog.new_method_call( 345 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0", 346 "org.freedesktop.DBus.Properties", "Get"); 347 348 method.append("xyz.openbmc_project.State.Host", "CurrentHostState"); 349 350 try 351 { 352 auto reply = this->busLog.call(method); 353 reply.read(property); 354 } 355 catch (const SdBusError& e) 356 { 357 // Quiescing the host is a "best effort" type function. If unable to 358 // read the host state or it comes back empty, just return. 359 // The boot block object will still be created and the associations to 360 // find the log will be present. Don't want a dependency with 361 // phosphor-state-manager service 362 lg2::info("Error reading QuiesceOnHwError property: {ERROR}", "ERROR", 363 e); 364 return; 365 } 366 367 auto hostState = std::get<Host::HostState>(property); 368 if (hostState != Host::HostState::Running) 369 { 370 return; 371 } 372 373 auto quiesce = this->busLog.new_method_call( 374 "org.freedesktop.systemd1", "/org/freedesktop/systemd1", 375 "org.freedesktop.systemd1.Manager", "StartUnit"); 376 377 quiesce.append("obmc-host-quiesce@0.target"); 378 quiesce.append("replace"); 379 380 this->busLog.call_noreply(quiesce); 381 } 382 383 void Manager::quiesceOnError(const uint32_t entryId) 384 { 385 // Verify we don't already have this entry blocking 386 auto it = find_if( 387 this->blockingErrors.begin(), this->blockingErrors.end(), 388 [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; }); 389 if (it != this->blockingErrors.end()) 390 { 391 // Already recorded so just return 392 lg2::debug( 393 "QuiesceOnError set and callout present but entry already logged"); 394 return; 395 } 396 397 lg2::info("QuiesceOnError set and callout present"); 398 399 auto blockPath = 400 std::string(OBJ_LOGGING) + "/block" + std::to_string(entryId); 401 auto blockObj = std::make_unique<Block>(this->busLog, blockPath, entryId); 402 this->blockingErrors.push_back(std::move(blockObj)); 403 404 // Register call back if log is resolved 405 using namespace sdbusplus::bus::match::rules; 406 auto entryPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId); 407 auto callback = std::make_unique<sdbusplus::bus::match::match>( 408 this->busLog, 409 propertiesChanged(entryPath, "xyz.openbmc_project.Logging.Entry"), 410 std::bind(std::mem_fn(&Manager::onEntryResolve), this, 411 std::placeholders::_1)); 412 413 propChangedEntryCallback.insert( 414 std::make_pair(entryId, std::move(callback))); 415 416 checkAndQuiesceHost(); 417 } 418 419 void Manager::doExtensionLogCreate(const Entry& entry, const FFDCEntries& ffdc) 420 { 421 // Make the association <endpointpath>/<endpointtype> paths 422 std::vector<std::string> assocs; 423 for (const auto& [forwardType, reverseType, endpoint] : 424 entry.associations()) 425 { 426 std::string e{endpoint}; 427 e += '/' + reverseType; 428 assocs.push_back(e); 429 } 430 431 for (auto& create : Extensions::getCreateFunctions()) 432 { 433 try 434 { 435 create(entry.message(), entry.id(), entry.timestamp(), 436 entry.severity(), entry.additionalData(), assocs, ffdc); 437 } 438 catch (std::exception& e) 439 { 440 lg2::error( 441 "An extension's create function threw an exception: {ERROR}", 442 "ERROR", e); 443 } 444 } 445 } 446 447 void Manager::processMetadata(const std::string& /*errorName*/, 448 const std::vector<std::string>& additionalData, 449 AssociationList& objects) const 450 { 451 // additionalData is a list of "metadata=value" 452 constexpr auto separator = '='; 453 for (const auto& entryItem : additionalData) 454 { 455 auto found = entryItem.find(separator); 456 if (std::string::npos != found) 457 { 458 auto metadata = entryItem.substr(0, found); 459 auto iter = meta.find(metadata); 460 if (meta.end() != iter) 461 { 462 (iter->second)(metadata, additionalData, objects); 463 } 464 } 465 } 466 } 467 468 void Manager::checkAndRemoveBlockingError(uint32_t entryId) 469 { 470 // First look for blocking object and remove 471 auto it = find_if( 472 blockingErrors.begin(), blockingErrors.end(), 473 [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; }); 474 if (it != blockingErrors.end()) 475 { 476 blockingErrors.erase(it); 477 } 478 479 // Now remove the callback looking for the error to be resolved 480 auto resolveFind = propChangedEntryCallback.find(entryId); 481 if (resolveFind != propChangedEntryCallback.end()) 482 { 483 propChangedEntryCallback.erase(resolveFind); 484 } 485 486 return; 487 } 488 489 void Manager::erase(uint32_t entryId) 490 { 491 auto entryFound = entries.find(entryId); 492 if (entries.end() != entryFound) 493 { 494 for (auto& func : Extensions::getDeleteProhibitedFunctions()) 495 { 496 try 497 { 498 bool prohibited = false; 499 func(entryId, prohibited); 500 if (prohibited) 501 { 502 // Future work remains to throw an error here. 503 return; 504 } 505 } 506 catch (std::exception& e) 507 { 508 lg2::error("An extension's deleteProhibited function threw an " 509 "exception: {ERROR}", 510 "ERROR", e); 511 } 512 } 513 514 // Delete the persistent representation of this error. 515 fs::path errorPath(ERRLOG_PERSIST_PATH); 516 errorPath /= std::to_string(entryId); 517 fs::remove(errorPath); 518 519 auto removeId = [](std::list<uint32_t>& ids, uint32_t id) { 520 auto it = std::find(ids.begin(), ids.end(), id); 521 if (it != ids.end()) 522 { 523 ids.erase(it); 524 } 525 }; 526 if (entryFound->second->severity() >= Entry::sevLowerLimit) 527 { 528 removeId(infoErrors, entryId); 529 } 530 else 531 { 532 removeId(realErrors, entryId); 533 } 534 entries.erase(entryFound); 535 536 checkAndRemoveBlockingError(entryId); 537 538 for (auto& remove : Extensions::getDeleteFunctions()) 539 { 540 try 541 { 542 remove(entryId); 543 } 544 catch (std::exception& e) 545 { 546 lg2::error("An extension's delete function threw an exception: " 547 "{ERROR}", 548 "ERROR", e); 549 } 550 } 551 } 552 else 553 { 554 lg2::error("Invalid entry ID ({ID}) to delete", "ID", entryId); 555 } 556 } 557 558 void Manager::restore() 559 { 560 auto sanity = [](const auto& id, const auto& restoredId) { 561 return id == restoredId; 562 }; 563 std::vector<uint32_t> errorIds; 564 565 fs::path dir(ERRLOG_PERSIST_PATH); 566 if (!fs::exists(dir) || fs::is_empty(dir)) 567 { 568 return; 569 } 570 571 for (auto& file : fs::directory_iterator(dir)) 572 { 573 auto id = file.path().filename().c_str(); 574 auto idNum = std::stol(id); 575 auto e = std::make_unique<Entry>( 576 busLog, std::string(OBJ_ENTRY) + '/' + id, idNum, *this); 577 if (deserialize(file.path(), *e)) 578 { 579 // validate the restored error entry id 580 if (sanity(static_cast<uint32_t>(idNum), e->id())) 581 { 582 e->path(file.path(), true); 583 e->emit_object_added(); 584 if (e->severity() >= Entry::sevLowerLimit) 585 { 586 infoErrors.push_back(idNum); 587 } 588 else 589 { 590 realErrors.push_back(idNum); 591 } 592 593 entries.insert(std::make_pair(idNum, std::move(e))); 594 errorIds.push_back(idNum); 595 } 596 else 597 { 598 lg2::error( 599 "Failed in sanity check while restoring error entry. " 600 "Ignoring error entry {ID_NUM}/{ENTRY_ID}.", 601 "ID_NUM", idNum, "ENTRY_ID", e->id()); 602 } 603 } 604 } 605 606 if (!errorIds.empty()) 607 { 608 entryId = *(std::max_element(errorIds.begin(), errorIds.end())); 609 } 610 } 611 612 void Manager::journalSync() 613 { 614 bool syncRequested = false; 615 auto fd = -1; 616 auto rc = -1; 617 auto wd = -1; 618 auto bus = sdbusplus::bus::new_default(); 619 620 auto start = 621 duration_cast<microseconds>(steady_clock::now().time_since_epoch()) 622 .count(); 623 624 // Each time an error log is committed, a request to sync the journal 625 // must occur and block that error log commit until it completes. A 5sec 626 // block is done to allow sufficient time for the journal to be synced. 627 // 628 // Number of loop iterations = 3 for the following reasons: 629 // Iteration #1: Requests a journal sync by killing the journald service. 630 // Iteration #2: Setup an inotify watch to monitor the synced file that 631 // journald updates with the timestamp the last time the 632 // journal was flushed. 633 // Iteration #3: Poll to wait until inotify reports an event which blocks 634 // the error log from being commited until the sync completes. 635 constexpr auto maxRetry = 3; 636 for (int i = 0; i < maxRetry; i++) 637 { 638 // Read timestamp from synced file 639 constexpr auto syncedPath = "/run/systemd/journal/synced"; 640 std::ifstream syncedFile(syncedPath); 641 if (syncedFile.fail()) 642 { 643 // If the synced file doesn't exist, a sync request will create it. 644 if (errno != ENOENT) 645 { 646 lg2::error( 647 "Failed to open journal synced file {FILENAME}: {ERROR}", 648 "FILENAME", syncedPath, "ERROR", strerror(errno)); 649 return; 650 } 651 } 652 else 653 { 654 // Only read the synced file if it exists. 655 // See if a sync happened by now 656 std::string timestampStr; 657 std::getline(syncedFile, timestampStr); 658 auto timestamp = std::stoll(timestampStr); 659 if (timestamp >= start) 660 { 661 break; 662 } 663 } 664 665 // Let's ask for a sync, but only once 666 if (!syncRequested) 667 { 668 syncRequested = true; 669 670 constexpr auto JOURNAL_UNIT = "systemd-journald.service"; 671 auto signal = SIGRTMIN + 1; 672 673 auto method = bus.new_method_call(SYSTEMD_BUSNAME, SYSTEMD_PATH, 674 SYSTEMD_INTERFACE, "KillUnit"); 675 method.append(JOURNAL_UNIT, "main", signal); 676 bus.call(method); 677 if (method.is_method_error()) 678 { 679 lg2::error("Failed to kill journal service"); 680 break; 681 } 682 683 continue; 684 } 685 686 // Let's install the inotify watch, if we didn't do that yet. This watch 687 // monitors the syncedFile for when journald updates it with a newer 688 // timestamp. This means the journal has been flushed. 689 if (fd < 0) 690 { 691 fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC); 692 if (fd < 0) 693 { 694 lg2::error("Failed to create inotify watch: {ERROR}", "ERROR", 695 strerror(errno)); 696 return; 697 } 698 699 constexpr auto JOURNAL_RUN_PATH = "/run/systemd/journal"; 700 wd = inotify_add_watch(fd, JOURNAL_RUN_PATH, 701 IN_MOVED_TO | IN_DONT_FOLLOW | IN_ONLYDIR); 702 if (wd < 0) 703 { 704 lg2::error("Failed to watch journal directory: {PATH}: {ERROR}", 705 "PATH", JOURNAL_RUN_PATH, "ERROR", strerror(errno)); 706 close(fd); 707 return; 708 } 709 continue; 710 } 711 712 // Let's wait until inotify reports an event 713 struct pollfd fds = { 714 fd, 715 POLLIN, 716 0, 717 }; 718 constexpr auto pollTimeout = 5; // 5 seconds 719 rc = poll(&fds, 1, pollTimeout * 1000); 720 if (rc < 0) 721 { 722 lg2::error("Failed to add event: {ERROR}", "ERROR", 723 strerror(errno)); 724 inotify_rm_watch(fd, wd); 725 close(fd); 726 return; 727 } 728 else if (rc == 0) 729 { 730 lg2::info("Poll timeout ({TIMEOUT}), no new journal synced data", 731 "TIMEOUT", pollTimeout); 732 break; 733 } 734 735 // Read from the specified file descriptor until there is no new data, 736 // throwing away everything read since the timestamp will be read at the 737 // beginning of the loop. 738 constexpr auto maxBytes = 64; 739 uint8_t buffer[maxBytes]; 740 while (read(fd, buffer, maxBytes) > 0) 741 ; 742 } 743 744 if (fd != -1) 745 { 746 if (wd != -1) 747 { 748 inotify_rm_watch(fd, wd); 749 } 750 close(fd); 751 } 752 753 return; 754 } 755 756 std::string Manager::readFWVersion() 757 { 758 auto version = util::getOSReleaseValue("VERSION_ID"); 759 760 if (!version) 761 { 762 lg2::error("Unable to read BMC firmware version"); 763 } 764 765 return version.value_or(""); 766 } 767 768 void Manager::create(const std::string& message, Entry::Level severity, 769 const std::map<std::string, std::string>& additionalData) 770 { 771 // Convert the map into a vector of "key=value" strings 772 std::vector<std::string> ad; 773 metadata::associations::combine(additionalData, ad); 774 775 createEntry(message, severity, ad); 776 } 777 778 void Manager::createWithFFDC( 779 const std::string& message, Entry::Level severity, 780 const std::map<std::string, std::string>& additionalData, 781 const FFDCEntries& ffdc) 782 { 783 // Convert the map into a vector of "key=value" strings 784 std::vector<std::string> ad; 785 metadata::associations::combine(additionalData, ad); 786 787 createEntry(message, severity, ad, ffdc); 788 } 789 790 } // namespace internal 791 } // namespace logging 792 } // namespace phosphor 793