1 #include "config.h" 2 3 #include "log_manager.hpp" 4 5 #include "elog_entry.hpp" 6 #include "elog_meta.hpp" 7 #include "elog_serialize.hpp" 8 #include "extensions.hpp" 9 #include "util.hpp" 10 11 #include <poll.h> 12 #include <sys/inotify.h> 13 #include <systemd/sd-bus.h> 14 #include <systemd/sd-journal.h> 15 #include <unistd.h> 16 17 #include <cassert> 18 #include <chrono> 19 #include <cstdio> 20 #include <cstring> 21 #include <fstream> 22 #include <functional> 23 #include <future> 24 #include <iostream> 25 #include <map> 26 #include <phosphor-logging/log.hpp> 27 #include <sdbusplus/vtable.hpp> 28 #include <set> 29 #include <string> 30 #include <vector> 31 #include <xyz/openbmc_project/State/Host/server.hpp> 32 33 using namespace phosphor::logging; 34 using namespace std::chrono; 35 using sdbusplus::exception::SdBusError; 36 extern const std::map<metadata::Metadata, 37 std::function<metadata::associations::Type>> 38 meta; 39 40 namespace phosphor 41 { 42 namespace logging 43 { 44 namespace internal 45 { 46 inline auto getLevel(const std::string& errMsg) 47 { 48 auto reqLevel = Entry::Level::Error; // Default to Error 49 50 auto levelmap = g_errLevelMap.find(errMsg); 51 if (levelmap != g_errLevelMap.end()) 52 { 53 reqLevel = static_cast<Entry::Level>(levelmap->second); 54 } 55 56 return reqLevel; 57 } 58 59 int Manager::getRealErrSize() 60 { 61 return realErrors.size(); 62 } 63 64 int Manager::getInfoErrSize() 65 { 66 return infoErrors.size(); 67 } 68 69 uint32_t Manager::commit(uint64_t transactionId, std::string errMsg) 70 { 71 auto level = getLevel(errMsg); 72 _commit(transactionId, std::move(errMsg), level); 73 return entryId; 74 } 75 76 uint32_t Manager::commitWithLvl(uint64_t transactionId, std::string errMsg, 77 uint32_t errLvl) 78 { 79 _commit(transactionId, std::move(errMsg), 80 static_cast<Entry::Level>(errLvl)); 81 return entryId; 82 } 83 84 void Manager::_commit(uint64_t transactionId, std::string&& errMsg, 85 Entry::Level errLvl) 86 { 87 constexpr const auto transactionIdVar = "TRANSACTION_ID"; 88 // Length of 'TRANSACTION_ID' string. 89 constexpr const auto transactionIdVarSize = std::strlen(transactionIdVar); 90 // Length of 'TRANSACTION_ID=' string. 91 constexpr const auto transactionIdVarOffset = transactionIdVarSize + 1; 92 93 // Flush all the pending log messages into the journal 94 journalSync(); 95 96 sd_journal* j = nullptr; 97 int rc = sd_journal_open(&j, SD_JOURNAL_LOCAL_ONLY); 98 if (rc < 0) 99 { 100 logging::log<logging::level::ERR>( 101 "Failed to open journal", 102 logging::entry("DESCRIPTION=%s", strerror(-rc))); 103 return; 104 } 105 106 std::string transactionIdStr = std::to_string(transactionId); 107 std::set<std::string> metalist; 108 auto metamap = g_errMetaMap.find(errMsg); 109 if (metamap != g_errMetaMap.end()) 110 { 111 metalist.insert(metamap->second.begin(), metamap->second.end()); 112 } 113 114 // Add _PID field information in AdditionalData. 115 metalist.insert("_PID"); 116 117 std::vector<std::string> additionalData; 118 119 // Read the journal from the end to get the most recent entry first. 120 // The result from the sd_journal_get_data() is of the form VARIABLE=value. 121 SD_JOURNAL_FOREACH_BACKWARDS(j) 122 { 123 const char* data = nullptr; 124 size_t length = 0; 125 126 // Look for the transaction id metadata variable 127 rc = sd_journal_get_data(j, transactionIdVar, (const void**)&data, 128 &length); 129 if (rc < 0) 130 { 131 // This journal entry does not have the TRANSACTION_ID 132 // metadata variable. 133 continue; 134 } 135 136 // journald does not guarantee that sd_journal_get_data() returns NULL 137 // terminated strings, so need to specify the size to use to compare, 138 // use the returned length instead of anything that relies on NULL 139 // terminators like strlen(). 140 // The data variable is in the form of 'TRANSACTION_ID=1234'. Remove 141 // the TRANSACTION_ID characters plus the (=) sign to do the comparison. 142 // 'data + transactionIdVarOffset' will be in the form of '1234'. 143 // 'length - transactionIdVarOffset' will be the length of '1234'. 144 if ((length <= (transactionIdVarOffset)) || 145 (transactionIdStr.compare(0, transactionIdStr.size(), 146 data + transactionIdVarOffset, 147 length - transactionIdVarOffset) != 0)) 148 { 149 // The value of the TRANSACTION_ID metadata is not the requested 150 // transaction id number. 151 continue; 152 } 153 154 // Search for all metadata variables in the current journal entry. 155 for (auto i = metalist.cbegin(); i != metalist.cend();) 156 { 157 rc = sd_journal_get_data(j, (*i).c_str(), (const void**)&data, 158 &length); 159 if (rc < 0) 160 { 161 // Metadata variable not found, check next metadata variable. 162 i++; 163 continue; 164 } 165 166 // Metadata variable found, save it and remove it from the set. 167 additionalData.emplace_back(data, length); 168 i = metalist.erase(i); 169 } 170 if (metalist.empty()) 171 { 172 // All metadata variables found, break out of journal loop. 173 break; 174 } 175 } 176 if (!metalist.empty()) 177 { 178 // Not all the metadata variables were found in the journal. 179 for (auto& metaVarStr : metalist) 180 { 181 logging::log<logging::level::INFO>( 182 "Failed to find metadata", 183 logging::entry("META_FIELD=%s", metaVarStr.c_str())); 184 } 185 } 186 187 sd_journal_close(j); 188 189 createEntry(errMsg, errLvl, additionalData); 190 } 191 192 void Manager::createEntry(std::string errMsg, Entry::Level errLvl, 193 std::vector<std::string> additionalData, 194 const FFDCEntries& ffdc) 195 { 196 if (!Extensions::disableDefaultLogCaps()) 197 { 198 if (errLvl < Entry::sevLowerLimit) 199 { 200 if (realErrors.size() >= ERROR_CAP) 201 { 202 erase(realErrors.front()); 203 } 204 } 205 else 206 { 207 if (infoErrors.size() >= ERROR_INFO_CAP) 208 { 209 erase(infoErrors.front()); 210 } 211 } 212 } 213 214 entryId++; 215 if (errLvl >= Entry::sevLowerLimit) 216 { 217 infoErrors.push_back(entryId); 218 } 219 else 220 { 221 realErrors.push_back(entryId); 222 } 223 auto ms = std::chrono::duration_cast<std::chrono::milliseconds>( 224 std::chrono::system_clock::now().time_since_epoch()) 225 .count(); 226 auto objPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId); 227 228 AssociationList objects{}; 229 processMetadata(errMsg, additionalData, objects); 230 231 auto e = std::make_unique<Entry>(busLog, objPath, entryId, 232 ms, // Milliseconds since 1970 233 errLvl, std::move(errMsg), 234 std::move(additionalData), 235 std::move(objects), fwVersion, *this); 236 auto path = serialize(*e); 237 e->path(path); 238 239 if (isQuiesceOnErrorEnabled() && isCalloutPresent(*e)) 240 { 241 quiesceOnError(entryId); 242 } 243 244 // Add entry before calling the extensions so that they have access to it 245 entries.insert(std::make_pair(entryId, std::move(e))); 246 247 doExtensionLogCreate(*entries.find(entryId)->second, ffdc); 248 249 // Note: No need to close the file descriptors in the FFDC. 250 } 251 252 bool Manager::isQuiesceOnErrorEnabled() 253 { 254 std::variant<bool> property; 255 256 auto method = this->busLog.new_method_call( 257 "xyz.openbmc_project.Settings", "/xyz/openbmc_project/logging/settings", 258 "org.freedesktop.DBus.Properties", "Get"); 259 260 method.append("xyz.openbmc_project.Logging.Settings", "QuiesceOnHwError"); 261 262 try 263 { 264 auto reply = this->busLog.call(method); 265 reply.read(property); 266 } 267 catch (const SdBusError& e) 268 { 269 log<level::ERR>("Error reading QuiesceOnHwError property", 270 entry("ERROR=%s", e.what())); 271 throw; 272 } 273 274 return std::get<bool>(property); 275 } 276 277 bool Manager::isCalloutPresent(const Entry& entry) 278 { 279 for (const auto& c : entry.additionalData()) 280 { 281 if (c.find("CALLOUT_") != std::string::npos) 282 { 283 return true; 284 } 285 } 286 287 return false; 288 } 289 290 void Manager::findAndRemoveResolvedBlocks() 291 { 292 for (auto& entry : entries) 293 { 294 if (entry.second->resolved()) 295 { 296 checkAndRemoveBlockingError(entry.first); 297 } 298 } 299 } 300 301 void Manager::onEntryResolve(sdbusplus::message::message& msg) 302 { 303 using Interface = std::string; 304 using Property = std::string; 305 using Value = std::string; 306 using Properties = std::map<Property, std::variant<Value>>; 307 308 Interface interface; 309 Properties properties; 310 311 msg.read(interface, properties); 312 313 for (const auto& p : properties) 314 { 315 if (p.first == "Resolved") 316 { 317 findAndRemoveResolvedBlocks(); 318 return; 319 } 320 } 321 } 322 323 void Manager::checkAndQuiesceHost() 324 { 325 // First check host state 326 std::variant<std::string> property; 327 328 auto method = this->busLog.new_method_call( 329 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0", 330 "org.freedesktop.DBus.Properties", "Get"); 331 332 method.append("xyz.openbmc_project.State.Host", "CurrentHostState"); 333 334 try 335 { 336 auto reply = this->busLog.call(method); 337 reply.read(property); 338 } 339 catch (const SdBusError& e) 340 { 341 // Quiescing the host is a "best effort" type function. If unable to 342 // read the host state or it comes back empty, just return. 343 // The boot block object will still be created and the associations to 344 // find the log will be present. Don't want a dependency with 345 // phosphor-state-manager service 346 log<level::INFO>("Error reading QuiesceOnHwError property", 347 entry("ERROR=%s", e.what())); 348 return; 349 } 350 351 std::string hostState = std::get<std::string>(property); 352 353 // If host state is empty, do nothing 354 if (hostState.empty()) 355 { 356 return; 357 } 358 359 using Host = sdbusplus::xyz::openbmc_project::State::server::Host; 360 auto state = Host::convertHostStateFromString(hostState); 361 if (state != Host::HostState::Running) 362 { 363 return; 364 } 365 366 auto quiesce = this->busLog.new_method_call( 367 "org.freedesktop.systemd1", "/org/freedesktop/systemd1", 368 "org.freedesktop.systemd1.Manager", "StartUnit"); 369 370 quiesce.append("obmc-host-quiesce@0.target"); 371 quiesce.append("replace"); 372 373 this->busLog.call_noreply(quiesce); 374 } 375 376 void Manager::quiesceOnError(const uint32_t entryId) 377 { 378 // Verify we don't already have this entry blocking 379 auto it = find_if( 380 this->blockingErrors.begin(), this->blockingErrors.end(), 381 [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; }); 382 if (it != this->blockingErrors.end()) 383 { 384 // Already recorded so just return 385 logging::log<logging::level::DEBUG>( 386 "QuiesceOnError set and callout present but entry already logged"); 387 return; 388 } 389 390 logging::log<logging::level::INFO>( 391 "QuiesceOnError set and callout present"); 392 393 auto blockPath = 394 std::string(OBJ_LOGGING) + "/block" + std::to_string(entryId); 395 auto blockObj = std::make_unique<Block>(this->busLog, blockPath, entryId); 396 this->blockingErrors.push_back(std::move(blockObj)); 397 398 // Register call back if log is resolved 399 using namespace sdbusplus::bus::match::rules; 400 auto entryPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId); 401 auto callback = std::make_unique<sdbusplus::bus::match::match>( 402 this->busLog, 403 propertiesChanged(entryPath, "xyz.openbmc_project.Logging.Entry"), 404 std::bind(std::mem_fn(&Manager::onEntryResolve), this, 405 std::placeholders::_1)); 406 407 propChangedEntryCallback.insert( 408 std::make_pair(entryId, std::move(callback))); 409 410 checkAndQuiesceHost(); 411 } 412 413 void Manager::doExtensionLogCreate(const Entry& entry, const FFDCEntries& ffdc) 414 { 415 // Make the association <endpointpath>/<endpointtype> paths 416 std::vector<std::string> assocs; 417 for (const auto& [forwardType, reverseType, endpoint] : 418 entry.associations()) 419 { 420 std::string e{endpoint}; 421 e += '/' + reverseType; 422 assocs.push_back(e); 423 } 424 425 for (auto& create : Extensions::getCreateFunctions()) 426 { 427 try 428 { 429 create(entry.message(), entry.id(), entry.timestamp(), 430 entry.severity(), entry.additionalData(), assocs, ffdc); 431 } 432 catch (std::exception& e) 433 { 434 log<level::ERR>("An extension's create function threw an exception", 435 phosphor::logging::entry("ERROR=%s", e.what())); 436 } 437 } 438 } 439 440 void Manager::processMetadata(const std::string& errorName, 441 const std::vector<std::string>& additionalData, 442 AssociationList& objects) const 443 { 444 // additionalData is a list of "metadata=value" 445 constexpr auto separator = '='; 446 for (const auto& entryItem : additionalData) 447 { 448 auto found = entryItem.find(separator); 449 if (std::string::npos != found) 450 { 451 auto metadata = entryItem.substr(0, found); 452 auto iter = meta.find(metadata); 453 if (meta.end() != iter) 454 { 455 (iter->second)(metadata, additionalData, objects); 456 } 457 } 458 } 459 } 460 461 void Manager::checkAndRemoveBlockingError(uint32_t entryId) 462 { 463 // First look for blocking object and remove 464 auto it = find_if( 465 blockingErrors.begin(), blockingErrors.end(), 466 [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; }); 467 if (it != blockingErrors.end()) 468 { 469 blockingErrors.erase(it); 470 } 471 472 // Now remove the callback looking for the error to be resolved 473 auto resolveFind = propChangedEntryCallback.find(entryId); 474 if (resolveFind != propChangedEntryCallback.end()) 475 { 476 propChangedEntryCallback.erase(resolveFind); 477 } 478 479 return; 480 } 481 482 void Manager::erase(uint32_t entryId) 483 { 484 auto entryFound = entries.find(entryId); 485 if (entries.end() != entryFound) 486 { 487 for (auto& func : Extensions::getDeleteProhibitedFunctions()) 488 { 489 try 490 { 491 bool prohibited = false; 492 func(entryId, prohibited); 493 if (prohibited) 494 { 495 // Future work remains to throw an error here. 496 return; 497 } 498 } 499 catch (std::exception& e) 500 { 501 log<level::ERR>( 502 "An extension's deleteProhibited function threw " 503 "an exception", 504 entry("ERROR=%s", e.what())); 505 } 506 } 507 508 // Delete the persistent representation of this error. 509 fs::path errorPath(ERRLOG_PERSIST_PATH); 510 errorPath /= std::to_string(entryId); 511 fs::remove(errorPath); 512 513 auto removeId = [](std::list<uint32_t>& ids, uint32_t id) { 514 auto it = std::find(ids.begin(), ids.end(), id); 515 if (it != ids.end()) 516 { 517 ids.erase(it); 518 } 519 }; 520 if (entryFound->second->severity() >= Entry::sevLowerLimit) 521 { 522 removeId(infoErrors, entryId); 523 } 524 else 525 { 526 removeId(realErrors, entryId); 527 } 528 entries.erase(entryFound); 529 530 checkAndRemoveBlockingError(entryId); 531 532 for (auto& remove : Extensions::getDeleteFunctions()) 533 { 534 try 535 { 536 remove(entryId); 537 } 538 catch (std::exception& e) 539 { 540 log<level::ERR>("An extension's delete function threw an " 541 "exception", 542 entry("ERROR=%s", e.what())); 543 } 544 } 545 } 546 else 547 { 548 logging::log<level::ERR>("Invalid entry ID to delete", 549 logging::entry("ID=%d", entryId)); 550 } 551 } 552 553 void Manager::restore() 554 { 555 auto sanity = [](const auto& id, const auto& restoredId) { 556 return id == restoredId; 557 }; 558 std::vector<uint32_t> errorIds; 559 560 fs::path dir(ERRLOG_PERSIST_PATH); 561 if (!fs::exists(dir) || fs::is_empty(dir)) 562 { 563 return; 564 } 565 566 for (auto& file : fs::directory_iterator(dir)) 567 { 568 auto id = file.path().filename().c_str(); 569 auto idNum = std::stol(id); 570 auto e = std::make_unique<Entry>( 571 busLog, std::string(OBJ_ENTRY) + '/' + id, idNum, *this); 572 if (deserialize(file.path(), *e)) 573 { 574 // validate the restored error entry id 575 if (sanity(static_cast<uint32_t>(idNum), e->id())) 576 { 577 e->path(file.path()); 578 e->emit_object_added(); 579 if (e->severity() >= Entry::sevLowerLimit) 580 { 581 infoErrors.push_back(idNum); 582 } 583 else 584 { 585 realErrors.push_back(idNum); 586 } 587 588 entries.insert(std::make_pair(idNum, std::move(e))); 589 errorIds.push_back(idNum); 590 } 591 else 592 { 593 logging::log<logging::level::ERR>( 594 "Failed in sanity check while restoring error entry. " 595 "Ignoring error entry", 596 logging::entry("ID_NUM=%d", idNum), 597 logging::entry("ENTRY_ID=%d", e->id())); 598 } 599 } 600 } 601 602 if (!errorIds.empty()) 603 { 604 entryId = *(std::max_element(errorIds.begin(), errorIds.end())); 605 } 606 } 607 608 void Manager::journalSync() 609 { 610 bool syncRequested = false; 611 auto fd = -1; 612 auto rc = -1; 613 auto wd = -1; 614 auto bus = sdbusplus::bus::new_default(); 615 616 auto start = 617 duration_cast<microseconds>(steady_clock::now().time_since_epoch()) 618 .count(); 619 620 // Each time an error log is committed, a request to sync the journal 621 // must occur and block that error log commit until it completes. A 5sec 622 // block is done to allow sufficient time for the journal to be synced. 623 // 624 // Number of loop iterations = 3 for the following reasons: 625 // Iteration #1: Requests a journal sync by killing the journald service. 626 // Iteration #2: Setup an inotify watch to monitor the synced file that 627 // journald updates with the timestamp the last time the 628 // journal was flushed. 629 // Iteration #3: Poll to wait until inotify reports an event which blocks 630 // the error log from being commited until the sync completes. 631 constexpr auto maxRetry = 3; 632 for (int i = 0; i < maxRetry; i++) 633 { 634 // Read timestamp from synced file 635 constexpr auto syncedPath = "/run/systemd/journal/synced"; 636 std::ifstream syncedFile(syncedPath); 637 if (syncedFile.fail()) 638 { 639 // If the synced file doesn't exist, a sync request will create it. 640 if (errno != ENOENT) 641 { 642 log<level::ERR>("Failed to open journal synced file", 643 entry("FILENAME=%s", syncedPath), 644 entry("ERRNO=%d", errno)); 645 return; 646 } 647 } 648 else 649 { 650 // Only read the synced file if it exists. 651 // See if a sync happened by now 652 std::string timestampStr; 653 std::getline(syncedFile, timestampStr); 654 auto timestamp = std::stoll(timestampStr); 655 if (timestamp >= start) 656 { 657 break; 658 } 659 } 660 661 // Let's ask for a sync, but only once 662 if (!syncRequested) 663 { 664 syncRequested = true; 665 666 constexpr auto JOURNAL_UNIT = "systemd-journald.service"; 667 auto signal = SIGRTMIN + 1; 668 669 auto method = bus.new_method_call(SYSTEMD_BUSNAME, SYSTEMD_PATH, 670 SYSTEMD_INTERFACE, "KillUnit"); 671 method.append(JOURNAL_UNIT, "main", signal); 672 bus.call(method); 673 if (method.is_method_error()) 674 { 675 log<level::ERR>("Failed to kill journal service"); 676 break; 677 } 678 continue; 679 } 680 681 // Let's install the inotify watch, if we didn't do that yet. This watch 682 // monitors the syncedFile for when journald updates it with a newer 683 // timestamp. This means the journal has been flushed. 684 if (fd < 0) 685 { 686 fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC); 687 if (fd < 0) 688 { 689 log<level::ERR>("Failed to create inotify watch", 690 entry("ERRNO=%d", errno)); 691 return; 692 } 693 694 constexpr auto JOURNAL_RUN_PATH = "/run/systemd/journal"; 695 wd = inotify_add_watch(fd, JOURNAL_RUN_PATH, 696 IN_MOVED_TO | IN_DONT_FOLLOW | IN_ONLYDIR); 697 if (wd < 0) 698 { 699 log<level::ERR>("Failed to watch journal directory", 700 entry("PATH=%s", JOURNAL_RUN_PATH), 701 entry("ERRNO=%d", errno)); 702 close(fd); 703 return; 704 } 705 continue; 706 } 707 708 // Let's wait until inotify reports an event 709 struct pollfd fds = { 710 .fd = fd, 711 .events = POLLIN, 712 }; 713 constexpr auto pollTimeout = 5; // 5 seconds 714 rc = poll(&fds, 1, pollTimeout * 1000); 715 if (rc < 0) 716 { 717 log<level::ERR>("Failed to add event", entry("ERRNO=%d", errno), 718 entry("ERR=%s", strerror(-rc))); 719 inotify_rm_watch(fd, wd); 720 close(fd); 721 return; 722 } 723 else if (rc == 0) 724 { 725 log<level::INFO>("Poll timeout, no new journal synced data", 726 entry("TIMEOUT=%d", pollTimeout)); 727 break; 728 } 729 730 // Read from the specified file descriptor until there is no new data, 731 // throwing away everything read since the timestamp will be read at the 732 // beginning of the loop. 733 constexpr auto maxBytes = 64; 734 uint8_t buffer[maxBytes]; 735 while (read(fd, buffer, maxBytes) > 0) 736 ; 737 } 738 739 if (fd != -1) 740 { 741 if (wd != -1) 742 { 743 inotify_rm_watch(fd, wd); 744 } 745 close(fd); 746 } 747 748 return; 749 } 750 751 std::string Manager::readFWVersion() 752 { 753 auto version = util::getOSReleaseValue("VERSION_ID"); 754 755 if (!version) 756 { 757 log<level::ERR>("Unable to read BMC firmware version"); 758 } 759 760 return version.value_or(""); 761 } 762 763 void Manager::create(const std::string& message, Entry::Level severity, 764 const std::map<std::string, std::string>& additionalData) 765 { 766 // Convert the map into a vector of "key=value" strings 767 std::vector<std::string> ad; 768 metadata::associations::combine(additionalData, ad); 769 770 createEntry(message, severity, ad); 771 } 772 773 void Manager::createWithFFDC( 774 const std::string& message, Entry::Level severity, 775 const std::map<std::string, std::string>& additionalData, 776 const FFDCEntries& ffdc) 777 { 778 // Convert the map into a vector of "key=value" strings 779 std::vector<std::string> ad; 780 metadata::associations::combine(additionalData, ad); 781 782 createEntry(message, severity, ad, ffdc); 783 } 784 785 } // namespace internal 786 } // namespace logging 787 } // namespace phosphor 788