xref: /openbmc/phosphor-logging/log_manager.cpp (revision 81a91e3ee4bf962111cf555ab9d3c3c51000fa3b)
1 #include "config.h"
2 
3 #include "log_manager.hpp"
4 
5 #include "elog_entry.hpp"
6 #include "elog_meta.hpp"
7 #include "elog_serialize.hpp"
8 #include "extensions.hpp"
9 
10 #include <poll.h>
11 #include <sys/inotify.h>
12 #include <systemd/sd-bus.h>
13 #include <systemd/sd-journal.h>
14 #include <unistd.h>
15 
16 #include <cassert>
17 #include <chrono>
18 #include <cstdio>
19 #include <cstring>
20 #include <fstream>
21 #include <functional>
22 #include <future>
23 #include <iostream>
24 #include <map>
25 #include <phosphor-logging/log.hpp>
26 #include <sdbusplus/vtable.hpp>
27 #include <set>
28 #include <string>
29 #include <vector>
30 #include <xyz/openbmc_project/State/Host/server.hpp>
31 
32 using namespace phosphor::logging;
33 using namespace std::chrono;
34 using sdbusplus::exception::SdBusError;
35 extern const std::map<metadata::Metadata,
36                       std::function<metadata::associations::Type>>
37     meta;
38 
39 namespace phosphor
40 {
41 namespace logging
42 {
43 namespace internal
44 {
45 inline auto getLevel(const std::string& errMsg)
46 {
47     auto reqLevel = Entry::Level::Error; // Default to Error
48 
49     auto levelmap = g_errLevelMap.find(errMsg);
50     if (levelmap != g_errLevelMap.end())
51     {
52         reqLevel = static_cast<Entry::Level>(levelmap->second);
53     }
54 
55     return reqLevel;
56 }
57 
58 int Manager::getRealErrSize()
59 {
60     return realErrors.size();
61 }
62 
63 int Manager::getInfoErrSize()
64 {
65     return infoErrors.size();
66 }
67 
68 void Manager::commit(uint64_t transactionId, std::string errMsg)
69 {
70     auto level = getLevel(errMsg);
71     _commit(transactionId, std::move(errMsg), level);
72 }
73 
74 void Manager::commitWithLvl(uint64_t transactionId, std::string errMsg,
75                             uint32_t errLvl)
76 {
77     _commit(transactionId, std::move(errMsg),
78             static_cast<Entry::Level>(errLvl));
79 }
80 
81 void Manager::_commit(uint64_t transactionId, std::string&& errMsg,
82                       Entry::Level errLvl)
83 {
84     constexpr const auto transactionIdVar = "TRANSACTION_ID";
85     // Length of 'TRANSACTION_ID' string.
86     constexpr const auto transactionIdVarSize = std::strlen(transactionIdVar);
87     // Length of 'TRANSACTION_ID=' string.
88     constexpr const auto transactionIdVarOffset = transactionIdVarSize + 1;
89 
90     // Flush all the pending log messages into the journal
91     journalSync();
92 
93     sd_journal* j = nullptr;
94     int rc = sd_journal_open(&j, SD_JOURNAL_LOCAL_ONLY);
95     if (rc < 0)
96     {
97         logging::log<logging::level::ERR>(
98             "Failed to open journal",
99             logging::entry("DESCRIPTION=%s", strerror(-rc)));
100         return;
101     }
102 
103     std::string transactionIdStr = std::to_string(transactionId);
104     std::set<std::string> metalist;
105     auto metamap = g_errMetaMap.find(errMsg);
106     if (metamap != g_errMetaMap.end())
107     {
108         metalist.insert(metamap->second.begin(), metamap->second.end());
109     }
110 
111     // Add _PID field information in AdditionalData.
112     metalist.insert("_PID");
113 
114     std::vector<std::string> additionalData;
115 
116     // Read the journal from the end to get the most recent entry first.
117     // The result from the sd_journal_get_data() is of the form VARIABLE=value.
118     SD_JOURNAL_FOREACH_BACKWARDS(j)
119     {
120         const char* data = nullptr;
121         size_t length = 0;
122 
123         // Look for the transaction id metadata variable
124         rc = sd_journal_get_data(j, transactionIdVar, (const void**)&data,
125                                  &length);
126         if (rc < 0)
127         {
128             // This journal entry does not have the TRANSACTION_ID
129             // metadata variable.
130             continue;
131         }
132 
133         // journald does not guarantee that sd_journal_get_data() returns NULL
134         // terminated strings, so need to specify the size to use to compare,
135         // use the returned length instead of anything that relies on NULL
136         // terminators like strlen().
137         // The data variable is in the form of 'TRANSACTION_ID=1234'. Remove
138         // the TRANSACTION_ID characters plus the (=) sign to do the comparison.
139         // 'data + transactionIdVarOffset' will be in the form of '1234'.
140         // 'length - transactionIdVarOffset' will be the length of '1234'.
141         if ((length <= (transactionIdVarOffset)) ||
142             (transactionIdStr.compare(0, transactionIdStr.size(),
143                                       data + transactionIdVarOffset,
144                                       length - transactionIdVarOffset) != 0))
145         {
146             // The value of the TRANSACTION_ID metadata is not the requested
147             // transaction id number.
148             continue;
149         }
150 
151         // Search for all metadata variables in the current journal entry.
152         for (auto i = metalist.cbegin(); i != metalist.cend();)
153         {
154             rc = sd_journal_get_data(j, (*i).c_str(), (const void**)&data,
155                                      &length);
156             if (rc < 0)
157             {
158                 // Metadata variable not found, check next metadata variable.
159                 i++;
160                 continue;
161             }
162 
163             // Metadata variable found, save it and remove it from the set.
164             additionalData.emplace_back(data, length);
165             i = metalist.erase(i);
166         }
167         if (metalist.empty())
168         {
169             // All metadata variables found, break out of journal loop.
170             break;
171         }
172     }
173     if (!metalist.empty())
174     {
175         // Not all the metadata variables were found in the journal.
176         for (auto& metaVarStr : metalist)
177         {
178             logging::log<logging::level::INFO>(
179                 "Failed to find metadata",
180                 logging::entry("META_FIELD=%s", metaVarStr.c_str()));
181         }
182     }
183 
184     sd_journal_close(j);
185 
186     createEntry(errMsg, errLvl, additionalData);
187 }
188 
189 void Manager::createEntry(std::string errMsg, Entry::Level errLvl,
190                           std::vector<std::string> additionalData,
191                           const FFDCEntries& ffdc)
192 {
193     if (!Extensions::disableDefaultLogCaps())
194     {
195         if (errLvl < Entry::sevLowerLimit)
196         {
197             if (realErrors.size() >= ERROR_CAP)
198             {
199                 erase(realErrors.front());
200             }
201         }
202         else
203         {
204             if (infoErrors.size() >= ERROR_INFO_CAP)
205             {
206                 erase(infoErrors.front());
207             }
208         }
209     }
210 
211     entryId++;
212     if (errLvl >= Entry::sevLowerLimit)
213     {
214         infoErrors.push_back(entryId);
215     }
216     else
217     {
218         realErrors.push_back(entryId);
219     }
220     auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
221                   std::chrono::system_clock::now().time_since_epoch())
222                   .count();
223     auto objPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId);
224 
225     AssociationList objects{};
226     processMetadata(errMsg, additionalData, objects);
227 
228     auto e = std::make_unique<Entry>(busLog, objPath, entryId,
229                                      ms, // Milliseconds since 1970
230                                      errLvl, std::move(errMsg),
231                                      std::move(additionalData),
232                                      std::move(objects), fwVersion, *this);
233     serialize(*e);
234 
235     if (isQuiesceOnErrorEnabled())
236     {
237         checkQuiesceOnError(*e);
238     }
239 
240     doExtensionLogCreate(*e, ffdc);
241 
242     // Note: No need to close the file descriptors in the FFDC.
243 
244     entries.insert(std::make_pair(entryId, std::move(e)));
245 }
246 
247 bool Manager::isQuiesceOnErrorEnabled()
248 {
249     std::variant<bool> property;
250 
251     auto method = this->busLog.new_method_call(
252         "xyz.openbmc_project.Settings", "/xyz/openbmc_project/logging/settings",
253         "org.freedesktop.DBus.Properties", "Get");
254 
255     method.append("xyz.openbmc_project.Logging.Settings", "QuiesceOnHwError");
256 
257     try
258     {
259         auto reply = this->busLog.call(method);
260         reply.read(property);
261     }
262     catch (const SdBusError& e)
263     {
264         log<level::ERR>("Error reading QuiesceOnHwError property",
265                         entry("ERROR=%s", e.what()));
266         throw;
267     }
268 
269     return std::get<bool>(property);
270 }
271 
272 bool Manager::isCalloutPresent(const Entry& entry)
273 {
274     for (const auto& c : entry.additionalData())
275     {
276         if (c.find("CALLOUT_") != std::string::npos)
277         {
278             return true;
279         }
280     }
281 
282     return false;
283 }
284 
285 void Manager::findAndRemoveResolvedBlocks()
286 {
287     for (auto& entry : entries)
288     {
289         if (entry.second->resolved())
290         {
291             checkAndRemoveBlockingError(entry.first);
292         }
293     }
294 }
295 
296 void Manager::onEntryResolve(sdbusplus::message::message& msg)
297 {
298     using Interface = std::string;
299     using Property = std::string;
300     using Value = std::string;
301     using Properties = std::map<Property, sdbusplus::message::variant<Value>>;
302 
303     Interface interface;
304     Properties properties;
305 
306     msg.read(interface, properties);
307 
308     for (const auto& p : properties)
309     {
310         if (p.first == "Resolved")
311         {
312             findAndRemoveResolvedBlocks();
313             return;
314         }
315     }
316 }
317 
318 void Manager::checkAndQuiesceHost()
319 {
320     // First check host state
321     std::variant<std::string> property;
322 
323     auto method = this->busLog.new_method_call(
324         "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
325         "org.freedesktop.DBus.Properties", "Get");
326 
327     method.append("xyz.openbmc_project.State.Host", "CurrentHostState");
328 
329     try
330     {
331         auto reply = this->busLog.call(method);
332         reply.read(property);
333     }
334     catch (const SdBusError& e)
335     {
336         // Quiescing the host is a "best effort" type function. If unable to
337         // read the host state or it comes back empty, just return.
338         // The boot block object will still be created and the associations to
339         // find the log will be present. Don't want a dependency with
340         // phosphor-state-manager service
341         log<level::INFO>("Error reading QuiesceOnHwError property",
342                          entry("ERROR=%s", e.what()));
343         return;
344     }
345 
346     std::string hostState = std::get<std::string>(property);
347 
348     // If host state is empty, do nothing
349     if (hostState.empty())
350     {
351         return;
352     }
353 
354     using Host = sdbusplus::xyz::openbmc_project::State::server::Host;
355     auto state = Host::convertHostStateFromString(hostState);
356     if (state != Host::HostState::Running)
357     {
358         return;
359     }
360 
361     auto quiesce = this->busLog.new_method_call(
362         "org.freedesktop.systemd1", "/org/freedesktop/systemd1",
363         "org.freedesktop.systemd1.Manager", "StartUnit");
364 
365     quiesce.append("obmc-host-quiesce@0.target");
366     quiesce.append("replace");
367 
368     this->busLog.call_noreply(quiesce);
369 }
370 
371 void Manager::checkQuiesceOnError(const Entry& entry)
372 {
373 
374     if (!isCalloutPresent(entry))
375     {
376         return;
377     }
378 
379     logging::log<logging::level::INFO>(
380         "QuiesceOnError set and callout present");
381 
382     auto blockPath =
383         std::string(OBJ_LOGGING) + "/block" + std::to_string(entry.id());
384     auto blockObj =
385         std::make_unique<Block>(this->busLog, blockPath, entry.id());
386     this->blockingErrors.push_back(std::move(blockObj));
387 
388     // Register call back if log is resolved
389     using namespace sdbusplus::bus::match::rules;
390     auto entryPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entry.id());
391     auto callback = std::make_unique<sdbusplus::bus::match::match>(
392         this->busLog,
393         propertiesChanged(entryPath, "xyz.openbmc_project.Logging.Entry"),
394         std::bind(std::mem_fn(&Manager::onEntryResolve), this,
395                   std::placeholders::_1));
396 
397     propChangedEntryCallback.insert(
398         std::make_pair(entry.id(), std::move(callback)));
399 
400     checkAndQuiesceHost();
401 }
402 
403 void Manager::doExtensionLogCreate(const Entry& entry, const FFDCEntries& ffdc)
404 {
405     // Make the association <endpointpath>/<endpointtype> paths
406     std::vector<std::string> assocs;
407     for (const auto& [forwardType, reverseType, endpoint] :
408          entry.associations())
409     {
410         std::string e{endpoint};
411         e += '/' + reverseType;
412         assocs.push_back(e);
413     }
414 
415     for (auto& create : Extensions::getCreateFunctions())
416     {
417         try
418         {
419             create(entry.message(), entry.id(), entry.timestamp(),
420                    entry.severity(), entry.additionalData(), assocs, ffdc);
421         }
422         catch (std::exception& e)
423         {
424             log<level::ERR>("An extension's create function threw an exception",
425                             phosphor::logging::entry("ERROR=%s", e.what()));
426         }
427     }
428 }
429 
430 void Manager::processMetadata(const std::string& errorName,
431                               const std::vector<std::string>& additionalData,
432                               AssociationList& objects) const
433 {
434     // additionalData is a list of "metadata=value"
435     constexpr auto separator = '=';
436     for (const auto& entryItem : additionalData)
437     {
438         auto found = entryItem.find(separator);
439         if (std::string::npos != found)
440         {
441             auto metadata = entryItem.substr(0, found);
442             auto iter = meta.find(metadata);
443             if (meta.end() != iter)
444             {
445                 (iter->second)(metadata, additionalData, objects);
446             }
447         }
448     }
449 }
450 
451 void Manager::checkAndRemoveBlockingError(uint32_t entryId)
452 {
453     // First look for blocking object and remove
454     auto it = find_if(
455         blockingErrors.begin(), blockingErrors.end(),
456         [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; });
457     if (it != blockingErrors.end())
458     {
459         blockingErrors.erase(it);
460     }
461 
462     // Now remove the callback looking for the error to be resolved
463     auto resolveFind = propChangedEntryCallback.find(entryId);
464     if (resolveFind != propChangedEntryCallback.end())
465     {
466         propChangedEntryCallback.erase(resolveFind);
467     }
468 
469     return;
470 }
471 
472 void Manager::erase(uint32_t entryId)
473 {
474     auto entryFound = entries.find(entryId);
475     if (entries.end() != entryFound)
476     {
477         for (auto& func : Extensions::getDeleteProhibitedFunctions())
478         {
479             try
480             {
481                 bool prohibited = false;
482                 func(entryId, prohibited);
483                 if (prohibited)
484                 {
485                     // Future work remains to throw an error here.
486                     return;
487                 }
488             }
489             catch (std::exception& e)
490             {
491                 log<level::ERR>(
492                     "An extension's deleteProhibited function threw "
493                     "an exception",
494                     entry("ERROR=%s", e.what()));
495             }
496         }
497 
498         // Delete the persistent representation of this error.
499         fs::path errorPath(ERRLOG_PERSIST_PATH);
500         errorPath /= std::to_string(entryId);
501         fs::remove(errorPath);
502 
503         auto removeId = [](std::list<uint32_t>& ids, uint32_t id) {
504             auto it = std::find(ids.begin(), ids.end(), id);
505             if (it != ids.end())
506             {
507                 ids.erase(it);
508             }
509         };
510         if (entryFound->second->severity() >= Entry::sevLowerLimit)
511         {
512             removeId(infoErrors, entryId);
513         }
514         else
515         {
516             removeId(realErrors, entryId);
517         }
518         entries.erase(entryFound);
519 
520         checkAndRemoveBlockingError(entryId);
521 
522         for (auto& remove : Extensions::getDeleteFunctions())
523         {
524             try
525             {
526                 remove(entryId);
527             }
528             catch (std::exception& e)
529             {
530                 log<level::ERR>("An extension's delete function threw an "
531                                 "exception",
532                                 entry("ERROR=%s", e.what()));
533             }
534         }
535     }
536     else
537     {
538         logging::log<level::ERR>("Invalid entry ID to delete",
539                                  logging::entry("ID=%d", entryId));
540     }
541 }
542 
543 void Manager::restore()
544 {
545     auto sanity = [](const auto& id, const auto& restoredId) {
546         return id == restoredId;
547     };
548     std::vector<uint32_t> errorIds;
549 
550     fs::path dir(ERRLOG_PERSIST_PATH);
551     if (!fs::exists(dir) || fs::is_empty(dir))
552     {
553         return;
554     }
555 
556     for (auto& file : fs::directory_iterator(dir))
557     {
558         auto id = file.path().filename().c_str();
559         auto idNum = std::stol(id);
560         auto e = std::make_unique<Entry>(
561             busLog, std::string(OBJ_ENTRY) + '/' + id, idNum, *this);
562         if (deserialize(file.path(), *e))
563         {
564             // validate the restored error entry id
565             if (sanity(static_cast<uint32_t>(idNum), e->id()))
566             {
567                 e->emit_object_added();
568                 if (e->severity() >= Entry::sevLowerLimit)
569                 {
570                     infoErrors.push_back(idNum);
571                 }
572                 else
573                 {
574                     realErrors.push_back(idNum);
575                 }
576 
577                 entries.insert(std::make_pair(idNum, std::move(e)));
578                 errorIds.push_back(idNum);
579             }
580             else
581             {
582                 logging::log<logging::level::ERR>(
583                     "Failed in sanity check while restoring error entry. "
584                     "Ignoring error entry",
585                     logging::entry("ID_NUM=%d", idNum),
586                     logging::entry("ENTRY_ID=%d", e->id()));
587             }
588         }
589     }
590 
591     if (!errorIds.empty())
592     {
593         entryId = *(std::max_element(errorIds.begin(), errorIds.end()));
594     }
595 }
596 
597 void Manager::journalSync()
598 {
599     bool syncRequested = false;
600     auto fd = -1;
601     auto rc = -1;
602     auto wd = -1;
603     auto bus = sdbusplus::bus::new_default();
604 
605     auto start =
606         duration_cast<microseconds>(steady_clock::now().time_since_epoch())
607             .count();
608 
609     // Each time an error log is committed, a request to sync the journal
610     // must occur and block that error log commit until it completes. A 5sec
611     // block is done to allow sufficient time for the journal to be synced.
612     //
613     // Number of loop iterations = 3 for the following reasons:
614     // Iteration #1: Requests a journal sync by killing the journald service.
615     // Iteration #2: Setup an inotify watch to monitor the synced file that
616     //               journald updates with the timestamp the last time the
617     //               journal was flushed.
618     // Iteration #3: Poll to wait until inotify reports an event which blocks
619     //               the error log from being commited until the sync completes.
620     constexpr auto maxRetry = 3;
621     for (int i = 0; i < maxRetry; i++)
622     {
623         // Read timestamp from synced file
624         constexpr auto syncedPath = "/run/systemd/journal/synced";
625         std::ifstream syncedFile(syncedPath);
626         if (syncedFile.fail())
627         {
628             // If the synced file doesn't exist, a sync request will create it.
629             if (errno != ENOENT)
630             {
631                 log<level::ERR>("Failed to open journal synced file",
632                                 entry("FILENAME=%s", syncedPath),
633                                 entry("ERRNO=%d", errno));
634                 return;
635             }
636         }
637         else
638         {
639             // Only read the synced file if it exists.
640             // See if a sync happened by now
641             std::string timestampStr;
642             std::getline(syncedFile, timestampStr);
643             auto timestamp = std::stoll(timestampStr);
644             if (timestamp >= start)
645             {
646                 break;
647             }
648         }
649 
650         // Let's ask for a sync, but only once
651         if (!syncRequested)
652         {
653             syncRequested = true;
654 
655             constexpr auto JOURNAL_UNIT = "systemd-journald.service";
656             auto signal = SIGRTMIN + 1;
657 
658             auto method = bus.new_method_call(SYSTEMD_BUSNAME, SYSTEMD_PATH,
659                                               SYSTEMD_INTERFACE, "KillUnit");
660             method.append(JOURNAL_UNIT, "main", signal);
661             bus.call(method);
662             if (method.is_method_error())
663             {
664                 log<level::ERR>("Failed to kill journal service");
665                 break;
666             }
667             continue;
668         }
669 
670         // Let's install the inotify watch, if we didn't do that yet. This watch
671         // monitors the syncedFile for when journald updates it with a newer
672         // timestamp. This means the journal has been flushed.
673         if (fd < 0)
674         {
675             fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
676             if (fd < 0)
677             {
678                 log<level::ERR>("Failed to create inotify watch",
679                                 entry("ERRNO=%d", errno));
680                 return;
681             }
682 
683             constexpr auto JOURNAL_RUN_PATH = "/run/systemd/journal";
684             wd = inotify_add_watch(fd, JOURNAL_RUN_PATH,
685                                    IN_MOVED_TO | IN_DONT_FOLLOW | IN_ONLYDIR);
686             if (wd < 0)
687             {
688                 log<level::ERR>("Failed to watch journal directory",
689                                 entry("PATH=%s", JOURNAL_RUN_PATH),
690                                 entry("ERRNO=%d", errno));
691                 close(fd);
692                 return;
693             }
694             continue;
695         }
696 
697         // Let's wait until inotify reports an event
698         struct pollfd fds = {
699             .fd = fd,
700             .events = POLLIN,
701         };
702         constexpr auto pollTimeout = 5; // 5 seconds
703         rc = poll(&fds, 1, pollTimeout * 1000);
704         if (rc < 0)
705         {
706             log<level::ERR>("Failed to add event", entry("ERRNO=%d", errno),
707                             entry("ERR=%s", strerror(-rc)));
708             inotify_rm_watch(fd, wd);
709             close(fd);
710             return;
711         }
712         else if (rc == 0)
713         {
714             log<level::INFO>("Poll timeout, no new journal synced data",
715                              entry("TIMEOUT=%d", pollTimeout));
716             break;
717         }
718 
719         // Read from the specified file descriptor until there is no new data,
720         // throwing away everything read since the timestamp will be read at the
721         // beginning of the loop.
722         constexpr auto maxBytes = 64;
723         uint8_t buffer[maxBytes];
724         while (read(fd, buffer, maxBytes) > 0)
725             ;
726     }
727 
728     if (fd != -1)
729     {
730         if (wd != -1)
731         {
732             inotify_rm_watch(fd, wd);
733         }
734         close(fd);
735     }
736 
737     return;
738 }
739 
740 std::string Manager::readFWVersion()
741 {
742     std::string version;
743     std::ifstream versionFile{BMC_VERSION_FILE};
744     std::string line;
745     static constexpr auto VERSION_ID = "VERSION_ID=";
746 
747     while (std::getline(versionFile, line))
748     {
749         if (line.find(VERSION_ID) != std::string::npos)
750         {
751             auto pos = line.find_first_of('"') + 1;
752             version = line.substr(pos, line.find_last_of('"') - pos);
753             break;
754         }
755     }
756 
757     if (version.empty())
758     {
759         log<level::ERR>("Unable to read BMC firmware version");
760     }
761 
762     return version;
763 }
764 
765 void Manager::create(const std::string& message, Entry::Level severity,
766                      const std::map<std::string, std::string>& additionalData)
767 {
768     // Convert the map into a vector of "key=value" strings
769     std::vector<std::string> ad;
770     metadata::associations::combine(additionalData, ad);
771 
772     createEntry(message, severity, ad);
773 }
774 
775 void Manager::createWithFFDC(
776     const std::string& message, Entry::Level severity,
777     const std::map<std::string, std::string>& additionalData,
778     const FFDCEntries& ffdc)
779 {
780     // Convert the map into a vector of "key=value" strings
781     std::vector<std::string> ad;
782     metadata::associations::combine(additionalData, ad);
783 
784     createEntry(message, severity, ad, ffdc);
785 }
786 
787 } // namespace internal
788 } // namespace logging
789 } // namespace phosphor
790