1 #include "config.h" 2 3 #include "log_manager.hpp" 4 5 #include "elog_entry.hpp" 6 #include "elog_meta.hpp" 7 #include "elog_serialize.hpp" 8 #include "extensions.hpp" 9 10 #include <poll.h> 11 #include <sys/inotify.h> 12 #include <systemd/sd-bus.h> 13 #include <systemd/sd-journal.h> 14 #include <unistd.h> 15 16 #include <cassert> 17 #include <chrono> 18 #include <cstdio> 19 #include <cstring> 20 #include <fstream> 21 #include <functional> 22 #include <future> 23 #include <iostream> 24 #include <map> 25 #include <phosphor-logging/log.hpp> 26 #include <sdbusplus/vtable.hpp> 27 #include <set> 28 #include <string> 29 #include <vector> 30 #include <xyz/openbmc_project/State/Host/server.hpp> 31 32 using namespace phosphor::logging; 33 using namespace std::chrono; 34 using sdbusplus::exception::SdBusError; 35 extern const std::map<metadata::Metadata, 36 std::function<metadata::associations::Type>> 37 meta; 38 39 namespace phosphor 40 { 41 namespace logging 42 { 43 namespace internal 44 { 45 inline auto getLevel(const std::string& errMsg) 46 { 47 auto reqLevel = Entry::Level::Error; // Default to Error 48 49 auto levelmap = g_errLevelMap.find(errMsg); 50 if (levelmap != g_errLevelMap.end()) 51 { 52 reqLevel = static_cast<Entry::Level>(levelmap->second); 53 } 54 55 return reqLevel; 56 } 57 58 int Manager::getRealErrSize() 59 { 60 return realErrors.size(); 61 } 62 63 int Manager::getInfoErrSize() 64 { 65 return infoErrors.size(); 66 } 67 68 void Manager::commit(uint64_t transactionId, std::string errMsg) 69 { 70 auto level = getLevel(errMsg); 71 _commit(transactionId, std::move(errMsg), level); 72 } 73 74 void Manager::commitWithLvl(uint64_t transactionId, std::string errMsg, 75 uint32_t errLvl) 76 { 77 _commit(transactionId, std::move(errMsg), 78 static_cast<Entry::Level>(errLvl)); 79 } 80 81 void Manager::_commit(uint64_t transactionId, std::string&& errMsg, 82 Entry::Level errLvl) 83 { 84 constexpr const auto transactionIdVar = "TRANSACTION_ID"; 85 // Length of 'TRANSACTION_ID' string. 86 constexpr const auto transactionIdVarSize = std::strlen(transactionIdVar); 87 // Length of 'TRANSACTION_ID=' string. 88 constexpr const auto transactionIdVarOffset = transactionIdVarSize + 1; 89 90 // Flush all the pending log messages into the journal 91 journalSync(); 92 93 sd_journal* j = nullptr; 94 int rc = sd_journal_open(&j, SD_JOURNAL_LOCAL_ONLY); 95 if (rc < 0) 96 { 97 logging::log<logging::level::ERR>( 98 "Failed to open journal", 99 logging::entry("DESCRIPTION=%s", strerror(-rc))); 100 return; 101 } 102 103 std::string transactionIdStr = std::to_string(transactionId); 104 std::set<std::string> metalist; 105 auto metamap = g_errMetaMap.find(errMsg); 106 if (metamap != g_errMetaMap.end()) 107 { 108 metalist.insert(metamap->second.begin(), metamap->second.end()); 109 } 110 111 // Add _PID field information in AdditionalData. 112 metalist.insert("_PID"); 113 114 std::vector<std::string> additionalData; 115 116 // Read the journal from the end to get the most recent entry first. 117 // The result from the sd_journal_get_data() is of the form VARIABLE=value. 118 SD_JOURNAL_FOREACH_BACKWARDS(j) 119 { 120 const char* data = nullptr; 121 size_t length = 0; 122 123 // Look for the transaction id metadata variable 124 rc = sd_journal_get_data(j, transactionIdVar, (const void**)&data, 125 &length); 126 if (rc < 0) 127 { 128 // This journal entry does not have the TRANSACTION_ID 129 // metadata variable. 130 continue; 131 } 132 133 // journald does not guarantee that sd_journal_get_data() returns NULL 134 // terminated strings, so need to specify the size to use to compare, 135 // use the returned length instead of anything that relies on NULL 136 // terminators like strlen(). 137 // The data variable is in the form of 'TRANSACTION_ID=1234'. Remove 138 // the TRANSACTION_ID characters plus the (=) sign to do the comparison. 139 // 'data + transactionIdVarOffset' will be in the form of '1234'. 140 // 'length - transactionIdVarOffset' will be the length of '1234'. 141 if ((length <= (transactionIdVarOffset)) || 142 (transactionIdStr.compare(0, transactionIdStr.size(), 143 data + transactionIdVarOffset, 144 length - transactionIdVarOffset) != 0)) 145 { 146 // The value of the TRANSACTION_ID metadata is not the requested 147 // transaction id number. 148 continue; 149 } 150 151 // Search for all metadata variables in the current journal entry. 152 for (auto i = metalist.cbegin(); i != metalist.cend();) 153 { 154 rc = sd_journal_get_data(j, (*i).c_str(), (const void**)&data, 155 &length); 156 if (rc < 0) 157 { 158 // Metadata variable not found, check next metadata variable. 159 i++; 160 continue; 161 } 162 163 // Metadata variable found, save it and remove it from the set. 164 additionalData.emplace_back(data, length); 165 i = metalist.erase(i); 166 } 167 if (metalist.empty()) 168 { 169 // All metadata variables found, break out of journal loop. 170 break; 171 } 172 } 173 if (!metalist.empty()) 174 { 175 // Not all the metadata variables were found in the journal. 176 for (auto& metaVarStr : metalist) 177 { 178 logging::log<logging::level::INFO>( 179 "Failed to find metadata", 180 logging::entry("META_FIELD=%s", metaVarStr.c_str())); 181 } 182 } 183 184 sd_journal_close(j); 185 186 createEntry(errMsg, errLvl, additionalData); 187 } 188 189 void Manager::createEntry(std::string errMsg, Entry::Level errLvl, 190 std::vector<std::string> additionalData, 191 const FFDCEntries& ffdc) 192 { 193 if (!Extensions::disableDefaultLogCaps()) 194 { 195 if (errLvl < Entry::sevLowerLimit) 196 { 197 if (realErrors.size() >= ERROR_CAP) 198 { 199 erase(realErrors.front()); 200 } 201 } 202 else 203 { 204 if (infoErrors.size() >= ERROR_INFO_CAP) 205 { 206 erase(infoErrors.front()); 207 } 208 } 209 } 210 211 entryId++; 212 if (errLvl >= Entry::sevLowerLimit) 213 { 214 infoErrors.push_back(entryId); 215 } 216 else 217 { 218 realErrors.push_back(entryId); 219 } 220 auto ms = std::chrono::duration_cast<std::chrono::milliseconds>( 221 std::chrono::system_clock::now().time_since_epoch()) 222 .count(); 223 auto objPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId); 224 225 AssociationList objects{}; 226 processMetadata(errMsg, additionalData, objects); 227 228 auto e = std::make_unique<Entry>(busLog, objPath, entryId, 229 ms, // Milliseconds since 1970 230 errLvl, std::move(errMsg), 231 std::move(additionalData), 232 std::move(objects), fwVersion, *this); 233 serialize(*e); 234 235 if (isQuiesceOnErrorEnabled()) 236 { 237 checkQuiesceOnError(*e); 238 } 239 240 doExtensionLogCreate(*e, ffdc); 241 242 // Note: No need to close the file descriptors in the FFDC. 243 244 entries.insert(std::make_pair(entryId, std::move(e))); 245 } 246 247 bool Manager::isQuiesceOnErrorEnabled() 248 { 249 std::variant<bool> property; 250 251 auto method = this->busLog.new_method_call( 252 "xyz.openbmc_project.Settings", "/xyz/openbmc_project/logging/settings", 253 "org.freedesktop.DBus.Properties", "Get"); 254 255 method.append("xyz.openbmc_project.Logging.Settings", "QuiesceOnHwError"); 256 257 try 258 { 259 auto reply = this->busLog.call(method); 260 reply.read(property); 261 } 262 catch (const SdBusError& e) 263 { 264 log<level::ERR>("Error reading QuiesceOnHwError property", 265 entry("ERROR=%s", e.what())); 266 throw; 267 } 268 269 return std::get<bool>(property); 270 } 271 272 bool Manager::isCalloutPresent(const Entry& entry) 273 { 274 for (const auto& c : entry.additionalData()) 275 { 276 if (c.find("CALLOUT_") != std::string::npos) 277 { 278 return true; 279 } 280 } 281 282 return false; 283 } 284 285 void Manager::findAndRemoveResolvedBlocks() 286 { 287 for (auto& entry : entries) 288 { 289 if (entry.second->resolved()) 290 { 291 checkAndRemoveBlockingError(entry.first); 292 } 293 } 294 } 295 296 void Manager::onEntryResolve(sdbusplus::message::message& msg) 297 { 298 using Interface = std::string; 299 using Property = std::string; 300 using Value = std::string; 301 using Properties = std::map<Property, sdbusplus::message::variant<Value>>; 302 303 Interface interface; 304 Properties properties; 305 306 msg.read(interface, properties); 307 308 for (const auto& p : properties) 309 { 310 if (p.first == "Resolved") 311 { 312 findAndRemoveResolvedBlocks(); 313 return; 314 } 315 } 316 } 317 318 void Manager::checkAndQuiesceHost() 319 { 320 // First check host state 321 std::variant<std::string> property; 322 323 auto method = this->busLog.new_method_call( 324 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0", 325 "org.freedesktop.DBus.Properties", "Get"); 326 327 method.append("xyz.openbmc_project.State.Host", "CurrentHostState"); 328 329 try 330 { 331 auto reply = this->busLog.call(method); 332 reply.read(property); 333 } 334 catch (const SdBusError& e) 335 { 336 // Quiescing the host is a "best effort" type function. If unable to 337 // read the host state or it comes back empty, just return. 338 // The boot block object will still be created and the associations to 339 // find the log will be present. Don't want a dependency with 340 // phosphor-state-manager service 341 log<level::INFO>("Error reading QuiesceOnHwError property", 342 entry("ERROR=%s", e.what())); 343 return; 344 } 345 346 std::string hostState = std::get<std::string>(property); 347 348 // If host state is empty, do nothing 349 if (hostState.empty()) 350 { 351 return; 352 } 353 354 using Host = sdbusplus::xyz::openbmc_project::State::server::Host; 355 auto state = Host::convertHostStateFromString(hostState); 356 if (state != Host::HostState::Running) 357 { 358 return; 359 } 360 361 auto quiesce = this->busLog.new_method_call( 362 "org.freedesktop.systemd1", "/org/freedesktop/systemd1", 363 "org.freedesktop.systemd1.Manager", "StartUnit"); 364 365 quiesce.append("obmc-host-quiesce@0.target"); 366 quiesce.append("replace"); 367 368 this->busLog.call_noreply(quiesce); 369 } 370 371 void Manager::checkQuiesceOnError(const Entry& entry) 372 { 373 374 if (!isCalloutPresent(entry)) 375 { 376 return; 377 } 378 379 logging::log<logging::level::INFO>( 380 "QuiesceOnError set and callout present"); 381 382 auto blockPath = 383 std::string(OBJ_LOGGING) + "/block" + std::to_string(entry.id()); 384 auto blockObj = 385 std::make_unique<Block>(this->busLog, blockPath, entry.id()); 386 this->blockingErrors.push_back(std::move(blockObj)); 387 388 // Register call back if log is resolved 389 using namespace sdbusplus::bus::match::rules; 390 auto entryPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entry.id()); 391 auto callback = std::make_unique<sdbusplus::bus::match::match>( 392 this->busLog, 393 propertiesChanged(entryPath, "xyz.openbmc_project.Logging.Entry"), 394 std::bind(std::mem_fn(&Manager::onEntryResolve), this, 395 std::placeholders::_1)); 396 397 propChangedEntryCallback.insert( 398 std::make_pair(entry.id(), std::move(callback))); 399 400 checkAndQuiesceHost(); 401 } 402 403 void Manager::doExtensionLogCreate(const Entry& entry, const FFDCEntries& ffdc) 404 { 405 // Make the association <endpointpath>/<endpointtype> paths 406 std::vector<std::string> assocs; 407 for (const auto& [forwardType, reverseType, endpoint] : 408 entry.associations()) 409 { 410 std::string e{endpoint}; 411 e += '/' + reverseType; 412 assocs.push_back(e); 413 } 414 415 for (auto& create : Extensions::getCreateFunctions()) 416 { 417 try 418 { 419 create(entry.message(), entry.id(), entry.timestamp(), 420 entry.severity(), entry.additionalData(), assocs, ffdc); 421 } 422 catch (std::exception& e) 423 { 424 log<level::ERR>("An extension's create function threw an exception", 425 phosphor::logging::entry("ERROR=%s", e.what())); 426 } 427 } 428 } 429 430 void Manager::processMetadata(const std::string& errorName, 431 const std::vector<std::string>& additionalData, 432 AssociationList& objects) const 433 { 434 // additionalData is a list of "metadata=value" 435 constexpr auto separator = '='; 436 for (const auto& entryItem : additionalData) 437 { 438 auto found = entryItem.find(separator); 439 if (std::string::npos != found) 440 { 441 auto metadata = entryItem.substr(0, found); 442 auto iter = meta.find(metadata); 443 if (meta.end() != iter) 444 { 445 (iter->second)(metadata, additionalData, objects); 446 } 447 } 448 } 449 } 450 451 void Manager::checkAndRemoveBlockingError(uint32_t entryId) 452 { 453 // First look for blocking object and remove 454 auto it = find_if( 455 blockingErrors.begin(), blockingErrors.end(), 456 [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; }); 457 if (it != blockingErrors.end()) 458 { 459 blockingErrors.erase(it); 460 } 461 462 // Now remove the callback looking for the error to be resolved 463 auto resolveFind = propChangedEntryCallback.find(entryId); 464 if (resolveFind != propChangedEntryCallback.end()) 465 { 466 propChangedEntryCallback.erase(resolveFind); 467 } 468 469 return; 470 } 471 472 void Manager::erase(uint32_t entryId) 473 { 474 auto entryFound = entries.find(entryId); 475 if (entries.end() != entryFound) 476 { 477 for (auto& func : Extensions::getDeleteProhibitedFunctions()) 478 { 479 try 480 { 481 bool prohibited = false; 482 func(entryId, prohibited); 483 if (prohibited) 484 { 485 // Future work remains to throw an error here. 486 return; 487 } 488 } 489 catch (std::exception& e) 490 { 491 log<level::ERR>( 492 "An extension's deleteProhibited function threw " 493 "an exception", 494 entry("ERROR=%s", e.what())); 495 } 496 } 497 498 // Delete the persistent representation of this error. 499 fs::path errorPath(ERRLOG_PERSIST_PATH); 500 errorPath /= std::to_string(entryId); 501 fs::remove(errorPath); 502 503 auto removeId = [](std::list<uint32_t>& ids, uint32_t id) { 504 auto it = std::find(ids.begin(), ids.end(), id); 505 if (it != ids.end()) 506 { 507 ids.erase(it); 508 } 509 }; 510 if (entryFound->second->severity() >= Entry::sevLowerLimit) 511 { 512 removeId(infoErrors, entryId); 513 } 514 else 515 { 516 removeId(realErrors, entryId); 517 } 518 entries.erase(entryFound); 519 520 checkAndRemoveBlockingError(entryId); 521 522 for (auto& remove : Extensions::getDeleteFunctions()) 523 { 524 try 525 { 526 remove(entryId); 527 } 528 catch (std::exception& e) 529 { 530 log<level::ERR>("An extension's delete function threw an " 531 "exception", 532 entry("ERROR=%s", e.what())); 533 } 534 } 535 } 536 else 537 { 538 logging::log<level::ERR>("Invalid entry ID to delete", 539 logging::entry("ID=%d", entryId)); 540 } 541 } 542 543 void Manager::restore() 544 { 545 auto sanity = [](const auto& id, const auto& restoredId) { 546 return id == restoredId; 547 }; 548 std::vector<uint32_t> errorIds; 549 550 fs::path dir(ERRLOG_PERSIST_PATH); 551 if (!fs::exists(dir) || fs::is_empty(dir)) 552 { 553 return; 554 } 555 556 for (auto& file : fs::directory_iterator(dir)) 557 { 558 auto id = file.path().filename().c_str(); 559 auto idNum = std::stol(id); 560 auto e = std::make_unique<Entry>( 561 busLog, std::string(OBJ_ENTRY) + '/' + id, idNum, *this); 562 if (deserialize(file.path(), *e)) 563 { 564 // validate the restored error entry id 565 if (sanity(static_cast<uint32_t>(idNum), e->id())) 566 { 567 e->emit_object_added(); 568 if (e->severity() >= Entry::sevLowerLimit) 569 { 570 infoErrors.push_back(idNum); 571 } 572 else 573 { 574 realErrors.push_back(idNum); 575 } 576 577 entries.insert(std::make_pair(idNum, std::move(e))); 578 errorIds.push_back(idNum); 579 } 580 else 581 { 582 logging::log<logging::level::ERR>( 583 "Failed in sanity check while restoring error entry. " 584 "Ignoring error entry", 585 logging::entry("ID_NUM=%d", idNum), 586 logging::entry("ENTRY_ID=%d", e->id())); 587 } 588 } 589 } 590 591 if (!errorIds.empty()) 592 { 593 entryId = *(std::max_element(errorIds.begin(), errorIds.end())); 594 } 595 } 596 597 void Manager::journalSync() 598 { 599 bool syncRequested = false; 600 auto fd = -1; 601 auto rc = -1; 602 auto wd = -1; 603 auto bus = sdbusplus::bus::new_default(); 604 605 auto start = 606 duration_cast<microseconds>(steady_clock::now().time_since_epoch()) 607 .count(); 608 609 // Each time an error log is committed, a request to sync the journal 610 // must occur and block that error log commit until it completes. A 5sec 611 // block is done to allow sufficient time for the journal to be synced. 612 // 613 // Number of loop iterations = 3 for the following reasons: 614 // Iteration #1: Requests a journal sync by killing the journald service. 615 // Iteration #2: Setup an inotify watch to monitor the synced file that 616 // journald updates with the timestamp the last time the 617 // journal was flushed. 618 // Iteration #3: Poll to wait until inotify reports an event which blocks 619 // the error log from being commited until the sync completes. 620 constexpr auto maxRetry = 3; 621 for (int i = 0; i < maxRetry; i++) 622 { 623 // Read timestamp from synced file 624 constexpr auto syncedPath = "/run/systemd/journal/synced"; 625 std::ifstream syncedFile(syncedPath); 626 if (syncedFile.fail()) 627 { 628 // If the synced file doesn't exist, a sync request will create it. 629 if (errno != ENOENT) 630 { 631 log<level::ERR>("Failed to open journal synced file", 632 entry("FILENAME=%s", syncedPath), 633 entry("ERRNO=%d", errno)); 634 return; 635 } 636 } 637 else 638 { 639 // Only read the synced file if it exists. 640 // See if a sync happened by now 641 std::string timestampStr; 642 std::getline(syncedFile, timestampStr); 643 auto timestamp = std::stoll(timestampStr); 644 if (timestamp >= start) 645 { 646 break; 647 } 648 } 649 650 // Let's ask for a sync, but only once 651 if (!syncRequested) 652 { 653 syncRequested = true; 654 655 constexpr auto JOURNAL_UNIT = "systemd-journald.service"; 656 auto signal = SIGRTMIN + 1; 657 658 auto method = bus.new_method_call(SYSTEMD_BUSNAME, SYSTEMD_PATH, 659 SYSTEMD_INTERFACE, "KillUnit"); 660 method.append(JOURNAL_UNIT, "main", signal); 661 bus.call(method); 662 if (method.is_method_error()) 663 { 664 log<level::ERR>("Failed to kill journal service"); 665 break; 666 } 667 continue; 668 } 669 670 // Let's install the inotify watch, if we didn't do that yet. This watch 671 // monitors the syncedFile for when journald updates it with a newer 672 // timestamp. This means the journal has been flushed. 673 if (fd < 0) 674 { 675 fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC); 676 if (fd < 0) 677 { 678 log<level::ERR>("Failed to create inotify watch", 679 entry("ERRNO=%d", errno)); 680 return; 681 } 682 683 constexpr auto JOURNAL_RUN_PATH = "/run/systemd/journal"; 684 wd = inotify_add_watch(fd, JOURNAL_RUN_PATH, 685 IN_MOVED_TO | IN_DONT_FOLLOW | IN_ONLYDIR); 686 if (wd < 0) 687 { 688 log<level::ERR>("Failed to watch journal directory", 689 entry("PATH=%s", JOURNAL_RUN_PATH), 690 entry("ERRNO=%d", errno)); 691 close(fd); 692 return; 693 } 694 continue; 695 } 696 697 // Let's wait until inotify reports an event 698 struct pollfd fds = { 699 .fd = fd, 700 .events = POLLIN, 701 }; 702 constexpr auto pollTimeout = 5; // 5 seconds 703 rc = poll(&fds, 1, pollTimeout * 1000); 704 if (rc < 0) 705 { 706 log<level::ERR>("Failed to add event", entry("ERRNO=%d", errno), 707 entry("ERR=%s", strerror(-rc))); 708 inotify_rm_watch(fd, wd); 709 close(fd); 710 return; 711 } 712 else if (rc == 0) 713 { 714 log<level::INFO>("Poll timeout, no new journal synced data", 715 entry("TIMEOUT=%d", pollTimeout)); 716 break; 717 } 718 719 // Read from the specified file descriptor until there is no new data, 720 // throwing away everything read since the timestamp will be read at the 721 // beginning of the loop. 722 constexpr auto maxBytes = 64; 723 uint8_t buffer[maxBytes]; 724 while (read(fd, buffer, maxBytes) > 0) 725 ; 726 } 727 728 if (fd != -1) 729 { 730 if (wd != -1) 731 { 732 inotify_rm_watch(fd, wd); 733 } 734 close(fd); 735 } 736 737 return; 738 } 739 740 std::string Manager::readFWVersion() 741 { 742 std::string version; 743 std::ifstream versionFile{BMC_VERSION_FILE}; 744 std::string line; 745 static constexpr auto VERSION_ID = "VERSION_ID="; 746 747 while (std::getline(versionFile, line)) 748 { 749 if (line.find(VERSION_ID) != std::string::npos) 750 { 751 auto pos = line.find_first_of('"') + 1; 752 version = line.substr(pos, line.find_last_of('"') - pos); 753 break; 754 } 755 } 756 757 if (version.empty()) 758 { 759 log<level::ERR>("Unable to read BMC firmware version"); 760 } 761 762 return version; 763 } 764 765 void Manager::create(const std::string& message, Entry::Level severity, 766 const std::map<std::string, std::string>& additionalData) 767 { 768 // Convert the map into a vector of "key=value" strings 769 std::vector<std::string> ad; 770 metadata::associations::combine(additionalData, ad); 771 772 createEntry(message, severity, ad); 773 } 774 775 void Manager::createWithFFDC( 776 const std::string& message, Entry::Level severity, 777 const std::map<std::string, std::string>& additionalData, 778 const FFDCEntries& ffdc) 779 { 780 // Convert the map into a vector of "key=value" strings 781 std::vector<std::string> ad; 782 metadata::associations::combine(additionalData, ad); 783 784 createEntry(message, severity, ad, ffdc); 785 } 786 787 } // namespace internal 788 } // namespace logging 789 } // namespace phosphor 790