1 #include "config.h"
2 
3 #include "log_manager.hpp"
4 
5 #include "elog_entry.hpp"
6 #include "elog_meta.hpp"
7 #include "elog_serialize.hpp"
8 #include "extensions.hpp"
9 #include "util.hpp"
10 
11 #include <poll.h>
12 #include <sys/inotify.h>
13 #include <systemd/sd-bus.h>
14 #include <systemd/sd-journal.h>
15 #include <unistd.h>
16 
17 #include <cassert>
18 #include <chrono>
19 #include <cstdio>
20 #include <cstring>
21 #include <fstream>
22 #include <functional>
23 #include <future>
24 #include <iostream>
25 #include <map>
26 #include <phosphor-logging/log.hpp>
27 #include <sdbusplus/vtable.hpp>
28 #include <set>
29 #include <string>
30 #include <vector>
31 #include <xyz/openbmc_project/State/Host/server.hpp>
32 
33 using namespace phosphor::logging;
34 using namespace std::chrono;
35 using sdbusplus::exception::SdBusError;
36 extern const std::map<metadata::Metadata,
37                       std::function<metadata::associations::Type>>
38     meta;
39 
40 namespace phosphor
41 {
42 namespace logging
43 {
44 namespace internal
45 {
46 inline auto getLevel(const std::string& errMsg)
47 {
48     auto reqLevel = Entry::Level::Error; // Default to Error
49 
50     auto levelmap = g_errLevelMap.find(errMsg);
51     if (levelmap != g_errLevelMap.end())
52     {
53         reqLevel = static_cast<Entry::Level>(levelmap->second);
54     }
55 
56     return reqLevel;
57 }
58 
59 int Manager::getRealErrSize()
60 {
61     return realErrors.size();
62 }
63 
64 int Manager::getInfoErrSize()
65 {
66     return infoErrors.size();
67 }
68 
69 uint32_t Manager::commit(uint64_t transactionId, std::string errMsg)
70 {
71     auto level = getLevel(errMsg);
72     _commit(transactionId, std::move(errMsg), level);
73     return entryId;
74 }
75 
76 uint32_t Manager::commitWithLvl(uint64_t transactionId, std::string errMsg,
77                                 uint32_t errLvl)
78 {
79     _commit(transactionId, std::move(errMsg),
80             static_cast<Entry::Level>(errLvl));
81     return entryId;
82 }
83 
84 void Manager::_commit(uint64_t transactionId [[maybe_unused]],
85                       std::string&& errMsg, Entry::Level errLvl)
86 {
87     std::vector<std::string> additionalData{};
88 
89     // When running as a test-case, the system may have a LOT of journal
90     // data and we may not have permissions to do some of the journal sync
91     // operations.  Just skip over them.
92     if (!IS_UNIT_TEST)
93     {
94         constexpr const auto transactionIdVar = "TRANSACTION_ID";
95         // Length of 'TRANSACTION_ID' string.
96         constexpr const auto transactionIdVarSize =
97             std::strlen(transactionIdVar);
98         // Length of 'TRANSACTION_ID=' string.
99         constexpr const auto transactionIdVarOffset = transactionIdVarSize + 1;
100 
101         // Flush all the pending log messages into the journal
102         journalSync();
103 
104         sd_journal* j = nullptr;
105         int rc = sd_journal_open(&j, SD_JOURNAL_LOCAL_ONLY);
106         if (rc < 0)
107         {
108             logging::log<logging::level::ERR>(
109                 "Failed to open journal",
110                 logging::entry("DESCRIPTION=%s", strerror(-rc)));
111             return;
112         }
113 
114         std::string transactionIdStr = std::to_string(transactionId);
115         std::set<std::string> metalist;
116         auto metamap = g_errMetaMap.find(errMsg);
117         if (metamap != g_errMetaMap.end())
118         {
119             metalist.insert(metamap->second.begin(), metamap->second.end());
120         }
121 
122         // Add _PID field information in AdditionalData.
123         metalist.insert("_PID");
124 
125         // Read the journal from the end to get the most recent entry first.
126         // The result from the sd_journal_get_data() is of the form
127         // VARIABLE=value.
128         SD_JOURNAL_FOREACH_BACKWARDS(j)
129         {
130             const char* data = nullptr;
131             size_t length = 0;
132 
133             // Look for the transaction id metadata variable
134             rc = sd_journal_get_data(j, transactionIdVar, (const void**)&data,
135                                      &length);
136             if (rc < 0)
137             {
138                 // This journal entry does not have the TRANSACTION_ID
139                 // metadata variable.
140                 continue;
141             }
142 
143             // journald does not guarantee that sd_journal_get_data() returns
144             // NULL terminated strings, so need to specify the size to use to
145             // compare, use the returned length instead of anything that relies
146             // on NULL terminators like strlen(). The data variable is in the
147             // form of 'TRANSACTION_ID=1234'. Remove the TRANSACTION_ID
148             // characters plus the (=) sign to do the comparison. 'data +
149             // transactionIdVarOffset' will be in the form of '1234'. 'length -
150             // transactionIdVarOffset' will be the length of '1234'.
151             if ((length <= (transactionIdVarOffset)) ||
152                 (transactionIdStr.compare(
153                      0, transactionIdStr.size(), data + transactionIdVarOffset,
154                      length - transactionIdVarOffset) != 0))
155             {
156                 // The value of the TRANSACTION_ID metadata is not the requested
157                 // transaction id number.
158                 continue;
159             }
160 
161             // Search for all metadata variables in the current journal entry.
162             for (auto i = metalist.cbegin(); i != metalist.cend();)
163             {
164                 rc = sd_journal_get_data(j, (*i).c_str(), (const void**)&data,
165                                          &length);
166                 if (rc < 0)
167                 {
168                     // Metadata variable not found, check next metadata
169                     // variable.
170                     i++;
171                     continue;
172                 }
173 
174                 // Metadata variable found, save it and remove it from the set.
175                 additionalData.emplace_back(data, length);
176                 i = metalist.erase(i);
177             }
178             if (metalist.empty())
179             {
180                 // All metadata variables found, break out of journal loop.
181                 break;
182             }
183         }
184         if (!metalist.empty())
185         {
186             // Not all the metadata variables were found in the journal.
187             for (auto& metaVarStr : metalist)
188             {
189                 logging::log<logging::level::INFO>(
190                     "Failed to find metadata",
191                     logging::entry("META_FIELD=%s", metaVarStr.c_str()));
192             }
193         }
194 
195         sd_journal_close(j);
196     }
197     createEntry(errMsg, errLvl, additionalData);
198 }
199 
200 void Manager::createEntry(std::string errMsg, Entry::Level errLvl,
201                           std::vector<std::string> additionalData,
202                           const FFDCEntries& ffdc)
203 {
204     if (!Extensions::disableDefaultLogCaps())
205     {
206         if (errLvl < Entry::sevLowerLimit)
207         {
208             if (realErrors.size() >= ERROR_CAP)
209             {
210                 erase(realErrors.front());
211             }
212         }
213         else
214         {
215             if (infoErrors.size() >= ERROR_INFO_CAP)
216             {
217                 erase(infoErrors.front());
218             }
219         }
220     }
221 
222     entryId++;
223     if (errLvl >= Entry::sevLowerLimit)
224     {
225         infoErrors.push_back(entryId);
226     }
227     else
228     {
229         realErrors.push_back(entryId);
230     }
231     auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
232                   std::chrono::system_clock::now().time_since_epoch())
233                   .count();
234     auto objPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId);
235 
236     AssociationList objects{};
237     processMetadata(errMsg, additionalData, objects);
238 
239     auto e = std::make_unique<Entry>(busLog, objPath, entryId,
240                                      ms, // Milliseconds since 1970
241                                      errLvl, std::move(errMsg),
242                                      std::move(additionalData),
243                                      std::move(objects), fwVersion, *this);
244     auto path = serialize(*e);
245     e->path(path);
246 
247     if (isQuiesceOnErrorEnabled() && isCalloutPresent(*e))
248     {
249         quiesceOnError(entryId);
250     }
251 
252     // Add entry before calling the extensions so that they have access to it
253     entries.insert(std::make_pair(entryId, std::move(e)));
254 
255     doExtensionLogCreate(*entries.find(entryId)->second, ffdc);
256 
257     // Note: No need to close the file descriptors in the FFDC.
258 }
259 
260 bool Manager::isQuiesceOnErrorEnabled()
261 {
262     // When running under tests, the Logging.Settings service will not be
263     // present.  Assume false.
264     if (IS_UNIT_TEST)
265     {
266         return false;
267     }
268 
269     std::variant<bool> property;
270 
271     auto method = this->busLog.new_method_call(
272         "xyz.openbmc_project.Settings", "/xyz/openbmc_project/logging/settings",
273         "org.freedesktop.DBus.Properties", "Get");
274 
275     method.append("xyz.openbmc_project.Logging.Settings", "QuiesceOnHwError");
276 
277     try
278     {
279         auto reply = this->busLog.call(method);
280         reply.read(property);
281     }
282     catch (const SdBusError& e)
283     {
284         log<level::ERR>("Error reading QuiesceOnHwError property",
285                         entry("ERROR=%s", e.what()));
286         throw;
287     }
288 
289     return std::get<bool>(property);
290 }
291 
292 bool Manager::isCalloutPresent(const Entry& entry)
293 {
294     for (const auto& c : entry.additionalData())
295     {
296         if (c.find("CALLOUT_") != std::string::npos)
297         {
298             return true;
299         }
300     }
301 
302     return false;
303 }
304 
305 void Manager::findAndRemoveResolvedBlocks()
306 {
307     for (auto& entry : entries)
308     {
309         if (entry.second->resolved())
310         {
311             checkAndRemoveBlockingError(entry.first);
312         }
313     }
314 }
315 
316 void Manager::onEntryResolve(sdbusplus::message::message& msg)
317 {
318     using Interface = std::string;
319     using Property = std::string;
320     using Value = std::string;
321     using Properties = std::map<Property, std::variant<Value>>;
322 
323     Interface interface;
324     Properties properties;
325 
326     msg.read(interface, properties);
327 
328     for (const auto& p : properties)
329     {
330         if (p.first == "Resolved")
331         {
332             findAndRemoveResolvedBlocks();
333             return;
334         }
335     }
336 }
337 
338 void Manager::checkAndQuiesceHost()
339 {
340     using Host = sdbusplus::xyz::openbmc_project::State::server::Host;
341 
342     // First check host state
343     std::variant<Host::HostState> property;
344 
345     auto method = this->busLog.new_method_call(
346         "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
347         "org.freedesktop.DBus.Properties", "Get");
348 
349     method.append("xyz.openbmc_project.State.Host", "CurrentHostState");
350 
351     try
352     {
353         auto reply = this->busLog.call(method);
354         reply.read(property);
355     }
356     catch (const SdBusError& e)
357     {
358         // Quiescing the host is a "best effort" type function. If unable to
359         // read the host state or it comes back empty, just return.
360         // The boot block object will still be created and the associations to
361         // find the log will be present. Don't want a dependency with
362         // phosphor-state-manager service
363         log<level::INFO>("Error reading QuiesceOnHwError property",
364                          entry("ERROR=%s", e.what()));
365         return;
366     }
367 
368     auto hostState = std::get<Host::HostState>(property);
369     if (hostState != Host::HostState::Running)
370     {
371         return;
372     }
373 
374     auto quiesce = this->busLog.new_method_call(
375         "org.freedesktop.systemd1", "/org/freedesktop/systemd1",
376         "org.freedesktop.systemd1.Manager", "StartUnit");
377 
378     quiesce.append("obmc-host-quiesce@0.target");
379     quiesce.append("replace");
380 
381     this->busLog.call_noreply(quiesce);
382 }
383 
384 void Manager::quiesceOnError(const uint32_t entryId)
385 {
386     // Verify we don't already have this entry blocking
387     auto it = find_if(
388         this->blockingErrors.begin(), this->blockingErrors.end(),
389         [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; });
390     if (it != this->blockingErrors.end())
391     {
392         // Already recorded so just return
393         logging::log<logging::level::DEBUG>(
394             "QuiesceOnError set and callout present but entry already logged");
395         return;
396     }
397 
398     logging::log<logging::level::INFO>(
399         "QuiesceOnError set and callout present");
400 
401     auto blockPath =
402         std::string(OBJ_LOGGING) + "/block" + std::to_string(entryId);
403     auto blockObj = std::make_unique<Block>(this->busLog, blockPath, entryId);
404     this->blockingErrors.push_back(std::move(blockObj));
405 
406     // Register call back if log is resolved
407     using namespace sdbusplus::bus::match::rules;
408     auto entryPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId);
409     auto callback = std::make_unique<sdbusplus::bus::match::match>(
410         this->busLog,
411         propertiesChanged(entryPath, "xyz.openbmc_project.Logging.Entry"),
412         std::bind(std::mem_fn(&Manager::onEntryResolve), this,
413                   std::placeholders::_1));
414 
415     propChangedEntryCallback.insert(
416         std::make_pair(entryId, std::move(callback)));
417 
418     checkAndQuiesceHost();
419 }
420 
421 void Manager::doExtensionLogCreate(const Entry& entry, const FFDCEntries& ffdc)
422 {
423     // Make the association <endpointpath>/<endpointtype> paths
424     std::vector<std::string> assocs;
425     for (const auto& [forwardType, reverseType, endpoint] :
426          entry.associations())
427     {
428         std::string e{endpoint};
429         e += '/' + reverseType;
430         assocs.push_back(e);
431     }
432 
433     for (auto& create : Extensions::getCreateFunctions())
434     {
435         try
436         {
437             create(entry.message(), entry.id(), entry.timestamp(),
438                    entry.severity(), entry.additionalData(), assocs, ffdc);
439         }
440         catch (std::exception& e)
441         {
442             log<level::ERR>("An extension's create function threw an exception",
443                             phosphor::logging::entry("ERROR=%s", e.what()));
444         }
445     }
446 }
447 
448 void Manager::processMetadata(const std::string& /*errorName*/,
449                               const std::vector<std::string>& additionalData,
450                               AssociationList& objects) const
451 {
452     // additionalData is a list of "metadata=value"
453     constexpr auto separator = '=';
454     for (const auto& entryItem : additionalData)
455     {
456         auto found = entryItem.find(separator);
457         if (std::string::npos != found)
458         {
459             auto metadata = entryItem.substr(0, found);
460             auto iter = meta.find(metadata);
461             if (meta.end() != iter)
462             {
463                 (iter->second)(metadata, additionalData, objects);
464             }
465         }
466     }
467 }
468 
469 void Manager::checkAndRemoveBlockingError(uint32_t entryId)
470 {
471     // First look for blocking object and remove
472     auto it = find_if(
473         blockingErrors.begin(), blockingErrors.end(),
474         [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; });
475     if (it != blockingErrors.end())
476     {
477         blockingErrors.erase(it);
478     }
479 
480     // Now remove the callback looking for the error to be resolved
481     auto resolveFind = propChangedEntryCallback.find(entryId);
482     if (resolveFind != propChangedEntryCallback.end())
483     {
484         propChangedEntryCallback.erase(resolveFind);
485     }
486 
487     return;
488 }
489 
490 void Manager::erase(uint32_t entryId)
491 {
492     auto entryFound = entries.find(entryId);
493     if (entries.end() != entryFound)
494     {
495         for (auto& func : Extensions::getDeleteProhibitedFunctions())
496         {
497             try
498             {
499                 bool prohibited = false;
500                 func(entryId, prohibited);
501                 if (prohibited)
502                 {
503                     // Future work remains to throw an error here.
504                     return;
505                 }
506             }
507             catch (std::exception& e)
508             {
509                 log<level::ERR>(
510                     "An extension's deleteProhibited function threw "
511                     "an exception",
512                     entry("ERROR=%s", e.what()));
513             }
514         }
515 
516         // Delete the persistent representation of this error.
517         fs::path errorPath(ERRLOG_PERSIST_PATH);
518         errorPath /= std::to_string(entryId);
519         fs::remove(errorPath);
520 
521         auto removeId = [](std::list<uint32_t>& ids, uint32_t id) {
522             auto it = std::find(ids.begin(), ids.end(), id);
523             if (it != ids.end())
524             {
525                 ids.erase(it);
526             }
527         };
528         if (entryFound->second->severity() >= Entry::sevLowerLimit)
529         {
530             removeId(infoErrors, entryId);
531         }
532         else
533         {
534             removeId(realErrors, entryId);
535         }
536         entries.erase(entryFound);
537 
538         checkAndRemoveBlockingError(entryId);
539 
540         for (auto& remove : Extensions::getDeleteFunctions())
541         {
542             try
543             {
544                 remove(entryId);
545             }
546             catch (std::exception& e)
547             {
548                 log<level::ERR>("An extension's delete function threw an "
549                                 "exception",
550                                 entry("ERROR=%s", e.what()));
551             }
552         }
553     }
554     else
555     {
556         logging::log<level::ERR>("Invalid entry ID to delete",
557                                  logging::entry("ID=%d", entryId));
558     }
559 }
560 
561 void Manager::restore()
562 {
563     auto sanity = [](const auto& id, const auto& restoredId) {
564         return id == restoredId;
565     };
566     std::vector<uint32_t> errorIds;
567 
568     fs::path dir(ERRLOG_PERSIST_PATH);
569     if (!fs::exists(dir) || fs::is_empty(dir))
570     {
571         return;
572     }
573 
574     for (auto& file : fs::directory_iterator(dir))
575     {
576         auto id = file.path().filename().c_str();
577         auto idNum = std::stol(id);
578         auto e = std::make_unique<Entry>(
579             busLog, std::string(OBJ_ENTRY) + '/' + id, idNum, *this);
580         if (deserialize(file.path(), *e))
581         {
582             // validate the restored error entry id
583             if (sanity(static_cast<uint32_t>(idNum), e->id()))
584             {
585                 e->path(file.path(), true);
586                 e->emit_object_added();
587                 if (e->severity() >= Entry::sevLowerLimit)
588                 {
589                     infoErrors.push_back(idNum);
590                 }
591                 else
592                 {
593                     realErrors.push_back(idNum);
594                 }
595 
596                 entries.insert(std::make_pair(idNum, std::move(e)));
597                 errorIds.push_back(idNum);
598             }
599             else
600             {
601                 logging::log<logging::level::ERR>(
602                     "Failed in sanity check while restoring error entry. "
603                     "Ignoring error entry",
604                     logging::entry("ID_NUM=%d", idNum),
605                     logging::entry("ENTRY_ID=%d", e->id()));
606             }
607         }
608     }
609 
610     if (!errorIds.empty())
611     {
612         entryId = *(std::max_element(errorIds.begin(), errorIds.end()));
613     }
614 }
615 
616 void Manager::journalSync()
617 {
618     bool syncRequested = false;
619     auto fd = -1;
620     auto rc = -1;
621     auto wd = -1;
622     auto bus = sdbusplus::bus::new_default();
623 
624     auto start =
625         duration_cast<microseconds>(steady_clock::now().time_since_epoch())
626             .count();
627 
628     // Each time an error log is committed, a request to sync the journal
629     // must occur and block that error log commit until it completes. A 5sec
630     // block is done to allow sufficient time for the journal to be synced.
631     //
632     // Number of loop iterations = 3 for the following reasons:
633     // Iteration #1: Requests a journal sync by killing the journald service.
634     // Iteration #2: Setup an inotify watch to monitor the synced file that
635     //               journald updates with the timestamp the last time the
636     //               journal was flushed.
637     // Iteration #3: Poll to wait until inotify reports an event which blocks
638     //               the error log from being commited until the sync completes.
639     constexpr auto maxRetry = 3;
640     for (int i = 0; i < maxRetry; i++)
641     {
642         // Read timestamp from synced file
643         constexpr auto syncedPath = "/run/systemd/journal/synced";
644         std::ifstream syncedFile(syncedPath);
645         if (syncedFile.fail())
646         {
647             // If the synced file doesn't exist, a sync request will create it.
648             if (errno != ENOENT)
649             {
650                 log<level::ERR>("Failed to open journal synced file",
651                                 entry("FILENAME=%s", syncedPath),
652                                 entry("ERRNO=%d", errno));
653                 return;
654             }
655         }
656         else
657         {
658             // Only read the synced file if it exists.
659             // See if a sync happened by now
660             std::string timestampStr;
661             std::getline(syncedFile, timestampStr);
662             auto timestamp = std::stoll(timestampStr);
663             if (timestamp >= start)
664             {
665                 break;
666             }
667         }
668 
669         // Let's ask for a sync, but only once
670         if (!syncRequested)
671         {
672             syncRequested = true;
673 
674             constexpr auto JOURNAL_UNIT = "systemd-journald.service";
675             auto signal = SIGRTMIN + 1;
676 
677             auto method = bus.new_method_call(SYSTEMD_BUSNAME, SYSTEMD_PATH,
678                                               SYSTEMD_INTERFACE, "KillUnit");
679             method.append(JOURNAL_UNIT, "main", signal);
680             bus.call(method);
681             if (method.is_method_error())
682             {
683                 log<level::ERR>("Failed to kill journal service");
684                 break;
685             }
686 
687             continue;
688         }
689 
690         // Let's install the inotify watch, if we didn't do that yet. This watch
691         // monitors the syncedFile for when journald updates it with a newer
692         // timestamp. This means the journal has been flushed.
693         if (fd < 0)
694         {
695             fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
696             if (fd < 0)
697             {
698                 log<level::ERR>("Failed to create inotify watch",
699                                 entry("ERRNO=%d", errno));
700                 return;
701             }
702 
703             constexpr auto JOURNAL_RUN_PATH = "/run/systemd/journal";
704             wd = inotify_add_watch(fd, JOURNAL_RUN_PATH,
705                                    IN_MOVED_TO | IN_DONT_FOLLOW | IN_ONLYDIR);
706             if (wd < 0)
707             {
708                 log<level::ERR>("Failed to watch journal directory",
709                                 entry("PATH=%s", JOURNAL_RUN_PATH),
710                                 entry("ERRNO=%d", errno));
711                 close(fd);
712                 return;
713             }
714             continue;
715         }
716 
717         // Let's wait until inotify reports an event
718         struct pollfd fds = {
719             fd,
720             POLLIN,
721             0,
722         };
723         constexpr auto pollTimeout = 5; // 5 seconds
724         rc = poll(&fds, 1, pollTimeout * 1000);
725         if (rc < 0)
726         {
727             log<level::ERR>("Failed to add event", entry("ERRNO=%d", errno),
728                             entry("ERR=%s", strerror(-rc)));
729             inotify_rm_watch(fd, wd);
730             close(fd);
731             return;
732         }
733         else if (rc == 0)
734         {
735             log<level::INFO>("Poll timeout, no new journal synced data",
736                              entry("TIMEOUT=%d", pollTimeout));
737             break;
738         }
739 
740         // Read from the specified file descriptor until there is no new data,
741         // throwing away everything read since the timestamp will be read at the
742         // beginning of the loop.
743         constexpr auto maxBytes = 64;
744         uint8_t buffer[maxBytes];
745         while (read(fd, buffer, maxBytes) > 0)
746             ;
747     }
748 
749     if (fd != -1)
750     {
751         if (wd != -1)
752         {
753             inotify_rm_watch(fd, wd);
754         }
755         close(fd);
756     }
757 
758     return;
759 }
760 
761 std::string Manager::readFWVersion()
762 {
763     auto version = util::getOSReleaseValue("VERSION_ID");
764 
765     if (!version)
766     {
767         log<level::ERR>("Unable to read BMC firmware version");
768     }
769 
770     return version.value_or("");
771 }
772 
773 void Manager::create(const std::string& message, Entry::Level severity,
774                      const std::map<std::string, std::string>& additionalData)
775 {
776     // Convert the map into a vector of "key=value" strings
777     std::vector<std::string> ad;
778     metadata::associations::combine(additionalData, ad);
779 
780     createEntry(message, severity, ad);
781 }
782 
783 void Manager::createWithFFDC(
784     const std::string& message, Entry::Level severity,
785     const std::map<std::string, std::string>& additionalData,
786     const FFDCEntries& ffdc)
787 {
788     // Convert the map into a vector of "key=value" strings
789     std::vector<std::string> ad;
790     metadata::associations::combine(additionalData, ad);
791 
792     createEntry(message, severity, ad, ffdc);
793 }
794 
795 } // namespace internal
796 } // namespace logging
797 } // namespace phosphor
798