1 #include "config.h" 2 3 #include "log_manager.hpp" 4 5 #include "elog_entry.hpp" 6 #include "elog_meta.hpp" 7 #include "elog_serialize.hpp" 8 #include "extensions.hpp" 9 #include "util.hpp" 10 11 #include <poll.h> 12 #include <sys/inotify.h> 13 #include <systemd/sd-bus.h> 14 #include <systemd/sd-journal.h> 15 #include <unistd.h> 16 17 #include <cassert> 18 #include <chrono> 19 #include <cstdio> 20 #include <cstring> 21 #include <fstream> 22 #include <functional> 23 #include <future> 24 #include <iostream> 25 #include <map> 26 #include <phosphor-logging/lg2.hpp> 27 #include <sdbusplus/vtable.hpp> 28 #include <set> 29 #include <string> 30 #include <string_view> 31 #include <vector> 32 #include <xyz/openbmc_project/State/Host/server.hpp> 33 34 using namespace std::chrono; 35 extern const std::map< 36 phosphor::logging::metadata::Metadata, 37 std::function<phosphor::logging::metadata::associations::Type>> 38 meta; 39 40 namespace phosphor 41 { 42 namespace logging 43 { 44 namespace internal 45 { 46 inline auto getLevel(const std::string& errMsg) 47 { 48 auto reqLevel = Entry::Level::Error; // Default to Error 49 50 auto levelmap = g_errLevelMap.find(errMsg); 51 if (levelmap != g_errLevelMap.end()) 52 { 53 reqLevel = static_cast<Entry::Level>(levelmap->second); 54 } 55 56 return reqLevel; 57 } 58 59 int Manager::getRealErrSize() 60 { 61 return realErrors.size(); 62 } 63 64 int Manager::getInfoErrSize() 65 { 66 return infoErrors.size(); 67 } 68 69 uint32_t Manager::commit(uint64_t transactionId, std::string errMsg) 70 { 71 auto level = getLevel(errMsg); 72 _commit(transactionId, std::move(errMsg), level); 73 return entryId; 74 } 75 76 uint32_t Manager::commitWithLvl(uint64_t transactionId, std::string errMsg, 77 uint32_t errLvl) 78 { 79 _commit(transactionId, std::move(errMsg), 80 static_cast<Entry::Level>(errLvl)); 81 return entryId; 82 } 83 84 void Manager::_commit(uint64_t transactionId [[maybe_unused]], 85 std::string&& errMsg, Entry::Level errLvl) 86 { 87 std::vector<std::string> additionalData{}; 88 89 // When running as a test-case, the system may have a LOT of journal 90 // data and we may not have permissions to do some of the journal sync 91 // operations. Just skip over them. 92 if (!IS_UNIT_TEST) 93 { 94 static constexpr auto transactionIdVar = 95 std::string_view{"TRANSACTION_ID"}; 96 // Length of 'TRANSACTION_ID' string. 97 static constexpr auto transactionIdVarSize = transactionIdVar.size(); 98 // Length of 'TRANSACTION_ID=' string. 99 static constexpr auto transactionIdVarOffset = transactionIdVarSize + 1; 100 101 // Flush all the pending log messages into the journal 102 journalSync(); 103 104 sd_journal* j = nullptr; 105 int rc = sd_journal_open(&j, SD_JOURNAL_LOCAL_ONLY); 106 if (rc < 0) 107 { 108 lg2::error("Failed to open journal: {ERROR}", "ERROR", 109 strerror(-rc)); 110 return; 111 } 112 113 std::string transactionIdStr = std::to_string(transactionId); 114 std::set<std::string> metalist; 115 auto metamap = g_errMetaMap.find(errMsg); 116 if (metamap != g_errMetaMap.end()) 117 { 118 metalist.insert(metamap->second.begin(), metamap->second.end()); 119 } 120 121 // Add _PID field information in AdditionalData. 122 metalist.insert("_PID"); 123 124 // Read the journal from the end to get the most recent entry first. 125 // The result from the sd_journal_get_data() is of the form 126 // VARIABLE=value. 127 SD_JOURNAL_FOREACH_BACKWARDS(j) 128 { 129 const char* data = nullptr; 130 size_t length = 0; 131 132 // Look for the transaction id metadata variable 133 rc = sd_journal_get_data(j, transactionIdVar.data(), 134 (const void**)&data, &length); 135 if (rc < 0) 136 { 137 // This journal entry does not have the TRANSACTION_ID 138 // metadata variable. 139 continue; 140 } 141 142 // journald does not guarantee that sd_journal_get_data() returns 143 // NULL terminated strings, so need to specify the size to use to 144 // compare, use the returned length instead of anything that relies 145 // on NULL terminators like strlen(). The data variable is in the 146 // form of 'TRANSACTION_ID=1234'. Remove the TRANSACTION_ID 147 // characters plus the (=) sign to do the comparison. 'data + 148 // transactionIdVarOffset' will be in the form of '1234'. 'length - 149 // transactionIdVarOffset' will be the length of '1234'. 150 if ((length <= (transactionIdVarOffset)) || 151 (transactionIdStr.compare( 152 0, transactionIdStr.size(), data + transactionIdVarOffset, 153 length - transactionIdVarOffset) != 0)) 154 { 155 // The value of the TRANSACTION_ID metadata is not the requested 156 // transaction id number. 157 continue; 158 } 159 160 // Search for all metadata variables in the current journal entry. 161 for (auto i = metalist.cbegin(); i != metalist.cend();) 162 { 163 rc = sd_journal_get_data(j, (*i).c_str(), (const void**)&data, 164 &length); 165 if (rc < 0) 166 { 167 // Metadata variable not found, check next metadata 168 // variable. 169 i++; 170 continue; 171 } 172 173 // Metadata variable found, save it and remove it from the set. 174 additionalData.emplace_back(data, length); 175 i = metalist.erase(i); 176 } 177 if (metalist.empty()) 178 { 179 // All metadata variables found, break out of journal loop. 180 break; 181 } 182 } 183 if (!metalist.empty()) 184 { 185 // Not all the metadata variables were found in the journal. 186 for (auto& metaVarStr : metalist) 187 { 188 lg2::info("Failed to find metadata: {META_FIELD}", "META_FIELD", 189 metaVarStr); 190 } 191 } 192 193 sd_journal_close(j); 194 } 195 createEntry(errMsg, errLvl, additionalData); 196 } 197 198 void Manager::createEntry(std::string errMsg, Entry::Level errLvl, 199 std::vector<std::string> additionalData, 200 const FFDCEntries& ffdc) 201 { 202 if (!Extensions::disableDefaultLogCaps()) 203 { 204 if (errLvl < Entry::sevLowerLimit) 205 { 206 if (realErrors.size() >= ERROR_CAP) 207 { 208 erase(realErrors.front()); 209 } 210 } 211 else 212 { 213 if (infoErrors.size() >= ERROR_INFO_CAP) 214 { 215 erase(infoErrors.front()); 216 } 217 } 218 } 219 220 entryId++; 221 if (errLvl >= Entry::sevLowerLimit) 222 { 223 infoErrors.push_back(entryId); 224 } 225 else 226 { 227 realErrors.push_back(entryId); 228 } 229 auto ms = std::chrono::duration_cast<std::chrono::milliseconds>( 230 std::chrono::system_clock::now().time_since_epoch()) 231 .count(); 232 auto objPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId); 233 234 AssociationList objects{}; 235 processMetadata(errMsg, additionalData, objects); 236 237 auto e = std::make_unique<Entry>(busLog, objPath, entryId, 238 ms, // Milliseconds since 1970 239 errLvl, std::move(errMsg), 240 std::move(additionalData), 241 std::move(objects), fwVersion, *this); 242 auto path = serialize(*e); 243 e->path(path); 244 245 if (isQuiesceOnErrorEnabled() && isCalloutPresent(*e)) 246 { 247 quiesceOnError(entryId); 248 } 249 250 // Add entry before calling the extensions so that they have access to it 251 entries.insert(std::make_pair(entryId, std::move(e))); 252 253 doExtensionLogCreate(*entries.find(entryId)->second, ffdc); 254 255 // Note: No need to close the file descriptors in the FFDC. 256 } 257 258 bool Manager::isQuiesceOnErrorEnabled() 259 { 260 // When running under tests, the Logging.Settings service will not be 261 // present. Assume false. 262 if (IS_UNIT_TEST) 263 { 264 return false; 265 } 266 267 std::variant<bool> property; 268 269 auto method = this->busLog.new_method_call( 270 "xyz.openbmc_project.Settings", "/xyz/openbmc_project/logging/settings", 271 "org.freedesktop.DBus.Properties", "Get"); 272 273 method.append("xyz.openbmc_project.Logging.Settings", "QuiesceOnHwError"); 274 275 try 276 { 277 auto reply = this->busLog.call(method); 278 reply.read(property); 279 } 280 catch (const sdbusplus::exception::exception& e) 281 { 282 lg2::error("Error reading QuiesceOnHwError property: {ERROR}", "ERROR", 283 e); 284 throw; 285 } 286 287 return std::get<bool>(property); 288 } 289 290 bool Manager::isCalloutPresent(const Entry& entry) 291 { 292 for (const auto& c : entry.additionalData()) 293 { 294 if (c.find("CALLOUT_") != std::string::npos) 295 { 296 return true; 297 } 298 } 299 300 return false; 301 } 302 303 void Manager::findAndRemoveResolvedBlocks() 304 { 305 for (auto& entry : entries) 306 { 307 if (entry.second->resolved()) 308 { 309 checkAndRemoveBlockingError(entry.first); 310 } 311 } 312 } 313 314 void Manager::onEntryResolve(sdbusplus::message::message& msg) 315 { 316 using Interface = std::string; 317 using Property = std::string; 318 using Value = std::string; 319 using Properties = std::map<Property, std::variant<Value>>; 320 321 Interface interface; 322 Properties properties; 323 324 msg.read(interface, properties); 325 326 for (const auto& p : properties) 327 { 328 if (p.first == "Resolved") 329 { 330 findAndRemoveResolvedBlocks(); 331 return; 332 } 333 } 334 } 335 336 void Manager::checkAndQuiesceHost() 337 { 338 using Host = sdbusplus::xyz::openbmc_project::State::server::Host; 339 340 // First check host state 341 std::variant<Host::HostState> property; 342 343 auto method = this->busLog.new_method_call( 344 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0", 345 "org.freedesktop.DBus.Properties", "Get"); 346 347 method.append("xyz.openbmc_project.State.Host", "CurrentHostState"); 348 349 try 350 { 351 auto reply = this->busLog.call(method); 352 reply.read(property); 353 } 354 catch (const sdbusplus::exception::exception& e) 355 { 356 // Quiescing the host is a "best effort" type function. If unable to 357 // read the host state or it comes back empty, just return. 358 // The boot block object will still be created and the associations to 359 // find the log will be present. Don't want a dependency with 360 // phosphor-state-manager service 361 lg2::info("Error reading QuiesceOnHwError property: {ERROR}", "ERROR", 362 e); 363 return; 364 } 365 366 auto hostState = std::get<Host::HostState>(property); 367 if (hostState != Host::HostState::Running) 368 { 369 return; 370 } 371 372 auto quiesce = this->busLog.new_method_call( 373 "org.freedesktop.systemd1", "/org/freedesktop/systemd1", 374 "org.freedesktop.systemd1.Manager", "StartUnit"); 375 376 quiesce.append("obmc-host-quiesce@0.target"); 377 quiesce.append("replace"); 378 379 this->busLog.call_noreply(quiesce); 380 } 381 382 void Manager::quiesceOnError(const uint32_t entryId) 383 { 384 // Verify we don't already have this entry blocking 385 auto it = find_if( 386 this->blockingErrors.begin(), this->blockingErrors.end(), 387 [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; }); 388 if (it != this->blockingErrors.end()) 389 { 390 // Already recorded so just return 391 lg2::debug( 392 "QuiesceOnError set and callout present but entry already logged"); 393 return; 394 } 395 396 lg2::info("QuiesceOnError set and callout present"); 397 398 auto blockPath = 399 std::string(OBJ_LOGGING) + "/block" + std::to_string(entryId); 400 auto blockObj = std::make_unique<Block>(this->busLog, blockPath, entryId); 401 this->blockingErrors.push_back(std::move(blockObj)); 402 403 // Register call back if log is resolved 404 using namespace sdbusplus::bus::match::rules; 405 auto entryPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId); 406 auto callback = std::make_unique<sdbusplus::bus::match::match>( 407 this->busLog, 408 propertiesChanged(entryPath, "xyz.openbmc_project.Logging.Entry"), 409 std::bind(std::mem_fn(&Manager::onEntryResolve), this, 410 std::placeholders::_1)); 411 412 propChangedEntryCallback.insert( 413 std::make_pair(entryId, std::move(callback))); 414 415 checkAndQuiesceHost(); 416 } 417 418 void Manager::doExtensionLogCreate(const Entry& entry, const FFDCEntries& ffdc) 419 { 420 // Make the association <endpointpath>/<endpointtype> paths 421 std::vector<std::string> assocs; 422 for (const auto& [forwardType, reverseType, endpoint] : 423 entry.associations()) 424 { 425 std::string e{endpoint}; 426 e += '/' + reverseType; 427 assocs.push_back(e); 428 } 429 430 for (auto& create : Extensions::getCreateFunctions()) 431 { 432 try 433 { 434 create(entry.message(), entry.id(), entry.timestamp(), 435 entry.severity(), entry.additionalData(), assocs, ffdc); 436 } 437 catch (const std::exception& e) 438 { 439 lg2::error( 440 "An extension's create function threw an exception: {ERROR}", 441 "ERROR", e); 442 } 443 } 444 } 445 446 void Manager::processMetadata(const std::string& /*errorName*/, 447 const std::vector<std::string>& additionalData, 448 AssociationList& objects) const 449 { 450 // additionalData is a list of "metadata=value" 451 constexpr auto separator = '='; 452 for (const auto& entryItem : additionalData) 453 { 454 auto found = entryItem.find(separator); 455 if (std::string::npos != found) 456 { 457 auto metadata = entryItem.substr(0, found); 458 auto iter = meta.find(metadata); 459 if (meta.end() != iter) 460 { 461 (iter->second)(metadata, additionalData, objects); 462 } 463 } 464 } 465 } 466 467 void Manager::checkAndRemoveBlockingError(uint32_t entryId) 468 { 469 // First look for blocking object and remove 470 auto it = find_if( 471 blockingErrors.begin(), blockingErrors.end(), 472 [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; }); 473 if (it != blockingErrors.end()) 474 { 475 blockingErrors.erase(it); 476 } 477 478 // Now remove the callback looking for the error to be resolved 479 auto resolveFind = propChangedEntryCallback.find(entryId); 480 if (resolveFind != propChangedEntryCallback.end()) 481 { 482 propChangedEntryCallback.erase(resolveFind); 483 } 484 485 return; 486 } 487 488 void Manager::erase(uint32_t entryId) 489 { 490 auto entryFound = entries.find(entryId); 491 if (entries.end() != entryFound) 492 { 493 for (auto& func : Extensions::getDeleteProhibitedFunctions()) 494 { 495 try 496 { 497 bool prohibited = false; 498 func(entryId, prohibited); 499 if (prohibited) 500 { 501 // Future work remains to throw an error here. 502 return; 503 } 504 } 505 catch (const std::exception& e) 506 { 507 lg2::error("An extension's deleteProhibited function threw an " 508 "exception: {ERROR}", 509 "ERROR", e); 510 } 511 } 512 513 // Delete the persistent representation of this error. 514 fs::path errorPath(ERRLOG_PERSIST_PATH); 515 errorPath /= std::to_string(entryId); 516 fs::remove(errorPath); 517 518 auto removeId = [](std::list<uint32_t>& ids, uint32_t id) { 519 auto it = std::find(ids.begin(), ids.end(), id); 520 if (it != ids.end()) 521 { 522 ids.erase(it); 523 } 524 }; 525 if (entryFound->second->severity() >= Entry::sevLowerLimit) 526 { 527 removeId(infoErrors, entryId); 528 } 529 else 530 { 531 removeId(realErrors, entryId); 532 } 533 entries.erase(entryFound); 534 535 checkAndRemoveBlockingError(entryId); 536 537 for (auto& remove : Extensions::getDeleteFunctions()) 538 { 539 try 540 { 541 remove(entryId); 542 } 543 catch (const std::exception& e) 544 { 545 lg2::error("An extension's delete function threw an exception: " 546 "{ERROR}", 547 "ERROR", e); 548 } 549 } 550 } 551 else 552 { 553 lg2::error("Invalid entry ID ({ID}) to delete", "ID", entryId); 554 } 555 } 556 557 void Manager::restore() 558 { 559 auto sanity = [](const auto& id, const auto& restoredId) { 560 return id == restoredId; 561 }; 562 std::vector<uint32_t> errorIds; 563 564 fs::path dir(ERRLOG_PERSIST_PATH); 565 if (!fs::exists(dir) || fs::is_empty(dir)) 566 { 567 return; 568 } 569 570 for (auto& file : fs::directory_iterator(dir)) 571 { 572 auto id = file.path().filename().c_str(); 573 auto idNum = std::stol(id); 574 auto e = std::make_unique<Entry>( 575 busLog, std::string(OBJ_ENTRY) + '/' + id, idNum, *this); 576 if (deserialize(file.path(), *e)) 577 { 578 // validate the restored error entry id 579 if (sanity(static_cast<uint32_t>(idNum), e->id())) 580 { 581 e->path(file.path(), true); 582 e->emit_object_added(); 583 if (e->severity() >= Entry::sevLowerLimit) 584 { 585 infoErrors.push_back(idNum); 586 } 587 else 588 { 589 realErrors.push_back(idNum); 590 } 591 592 entries.insert(std::make_pair(idNum, std::move(e))); 593 errorIds.push_back(idNum); 594 } 595 else 596 { 597 lg2::error( 598 "Failed in sanity check while restoring error entry. " 599 "Ignoring error entry {ID_NUM}/{ENTRY_ID}.", 600 "ID_NUM", idNum, "ENTRY_ID", e->id()); 601 } 602 } 603 } 604 605 if (!errorIds.empty()) 606 { 607 entryId = *(std::max_element(errorIds.begin(), errorIds.end())); 608 } 609 } 610 611 void Manager::journalSync() 612 { 613 bool syncRequested = false; 614 auto fd = -1; 615 auto rc = -1; 616 auto wd = -1; 617 auto bus = sdbusplus::bus::new_default(); 618 619 auto start = 620 duration_cast<microseconds>(steady_clock::now().time_since_epoch()) 621 .count(); 622 623 // Each time an error log is committed, a request to sync the journal 624 // must occur and block that error log commit until it completes. A 5sec 625 // block is done to allow sufficient time for the journal to be synced. 626 // 627 // Number of loop iterations = 3 for the following reasons: 628 // Iteration #1: Requests a journal sync by killing the journald service. 629 // Iteration #2: Setup an inotify watch to monitor the synced file that 630 // journald updates with the timestamp the last time the 631 // journal was flushed. 632 // Iteration #3: Poll to wait until inotify reports an event which blocks 633 // the error log from being commited until the sync completes. 634 constexpr auto maxRetry = 3; 635 for (int i = 0; i < maxRetry; i++) 636 { 637 // Read timestamp from synced file 638 constexpr auto syncedPath = "/run/systemd/journal/synced"; 639 std::ifstream syncedFile(syncedPath); 640 if (syncedFile.fail()) 641 { 642 // If the synced file doesn't exist, a sync request will create it. 643 if (errno != ENOENT) 644 { 645 lg2::error( 646 "Failed to open journal synced file {FILENAME}: {ERROR}", 647 "FILENAME", syncedPath, "ERROR", strerror(errno)); 648 return; 649 } 650 } 651 else 652 { 653 // Only read the synced file if it exists. 654 // See if a sync happened by now 655 std::string timestampStr; 656 std::getline(syncedFile, timestampStr); 657 auto timestamp = std::stoll(timestampStr); 658 if (timestamp >= start) 659 { 660 break; 661 } 662 } 663 664 // Let's ask for a sync, but only once 665 if (!syncRequested) 666 { 667 syncRequested = true; 668 669 constexpr auto JOURNAL_UNIT = "systemd-journald.service"; 670 auto signal = SIGRTMIN + 1; 671 672 auto method = bus.new_method_call(SYSTEMD_BUSNAME, SYSTEMD_PATH, 673 SYSTEMD_INTERFACE, "KillUnit"); 674 method.append(JOURNAL_UNIT, "main", signal); 675 bus.call(method); 676 if (method.is_method_error()) 677 { 678 lg2::error("Failed to kill journal service"); 679 break; 680 } 681 682 continue; 683 } 684 685 // Let's install the inotify watch, if we didn't do that yet. This watch 686 // monitors the syncedFile for when journald updates it with a newer 687 // timestamp. This means the journal has been flushed. 688 if (fd < 0) 689 { 690 fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC); 691 if (fd < 0) 692 { 693 lg2::error("Failed to create inotify watch: {ERROR}", "ERROR", 694 strerror(errno)); 695 return; 696 } 697 698 constexpr auto JOURNAL_RUN_PATH = "/run/systemd/journal"; 699 wd = inotify_add_watch(fd, JOURNAL_RUN_PATH, 700 IN_MOVED_TO | IN_DONT_FOLLOW | IN_ONLYDIR); 701 if (wd < 0) 702 { 703 lg2::error("Failed to watch journal directory: {PATH}: {ERROR}", 704 "PATH", JOURNAL_RUN_PATH, "ERROR", strerror(errno)); 705 close(fd); 706 return; 707 } 708 continue; 709 } 710 711 // Let's wait until inotify reports an event 712 struct pollfd fds = { 713 fd, 714 POLLIN, 715 0, 716 }; 717 constexpr auto pollTimeout = 5; // 5 seconds 718 rc = poll(&fds, 1, pollTimeout * 1000); 719 if (rc < 0) 720 { 721 lg2::error("Failed to add event: {ERROR}", "ERROR", 722 strerror(errno)); 723 inotify_rm_watch(fd, wd); 724 close(fd); 725 return; 726 } 727 else if (rc == 0) 728 { 729 lg2::info("Poll timeout ({TIMEOUT}), no new journal synced data", 730 "TIMEOUT", pollTimeout); 731 break; 732 } 733 734 // Read from the specified file descriptor until there is no new data, 735 // throwing away everything read since the timestamp will be read at the 736 // beginning of the loop. 737 constexpr auto maxBytes = 64; 738 uint8_t buffer[maxBytes]; 739 while (read(fd, buffer, maxBytes) > 0) 740 ; 741 } 742 743 if (fd != -1) 744 { 745 if (wd != -1) 746 { 747 inotify_rm_watch(fd, wd); 748 } 749 close(fd); 750 } 751 752 return; 753 } 754 755 std::string Manager::readFWVersion() 756 { 757 auto version = util::getOSReleaseValue("VERSION_ID"); 758 759 if (!version) 760 { 761 lg2::error("Unable to read BMC firmware version"); 762 } 763 764 return version.value_or(""); 765 } 766 767 void Manager::create(const std::string& message, Entry::Level severity, 768 const std::map<std::string, std::string>& additionalData) 769 { 770 // Convert the map into a vector of "key=value" strings 771 std::vector<std::string> ad; 772 metadata::associations::combine(additionalData, ad); 773 774 createEntry(message, severity, ad); 775 } 776 777 void Manager::createWithFFDC( 778 const std::string& message, Entry::Level severity, 779 const std::map<std::string, std::string>& additionalData, 780 const FFDCEntries& ffdc) 781 { 782 // Convert the map into a vector of "key=value" strings 783 std::vector<std::string> ad; 784 metadata::associations::combine(additionalData, ad); 785 786 createEntry(message, severity, ad, ffdc); 787 } 788 789 } // namespace internal 790 } // namespace logging 791 } // namespace phosphor 792