xref: /openbmc/phosphor-logging/log_manager.cpp (revision f397afcc472d521341a5ab9afd6065a6195db6c0)
1 #include "config.h"
2 
3 #include "log_manager.hpp"
4 
5 #include "elog_entry.hpp"
6 #include "elog_meta.hpp"
7 #include "elog_serialize.hpp"
8 #include "extensions.hpp"
9 #include "util.hpp"
10 
11 #include <poll.h>
12 #include <sys/inotify.h>
13 #include <systemd/sd-bus.h>
14 #include <systemd/sd-journal.h>
15 #include <unistd.h>
16 
17 #include <cassert>
18 #include <chrono>
19 #include <cstdio>
20 #include <cstring>
21 #include <fstream>
22 #include <functional>
23 #include <future>
24 #include <iostream>
25 #include <map>
26 #include <phosphor-logging/log.hpp>
27 #include <sdbusplus/vtable.hpp>
28 #include <set>
29 #include <string>
30 #include <vector>
31 #include <xyz/openbmc_project/State/Host/server.hpp>
32 
33 using namespace phosphor::logging;
34 using namespace std::chrono;
35 using sdbusplus::exception::SdBusError;
36 extern const std::map<metadata::Metadata,
37                       std::function<metadata::associations::Type>>
38     meta;
39 
40 namespace phosphor
41 {
42 namespace logging
43 {
44 namespace internal
45 {
46 inline auto getLevel(const std::string& errMsg)
47 {
48     auto reqLevel = Entry::Level::Error; // Default to Error
49 
50     auto levelmap = g_errLevelMap.find(errMsg);
51     if (levelmap != g_errLevelMap.end())
52     {
53         reqLevel = static_cast<Entry::Level>(levelmap->second);
54     }
55 
56     return reqLevel;
57 }
58 
59 int Manager::getRealErrSize()
60 {
61     return realErrors.size();
62 }
63 
64 int Manager::getInfoErrSize()
65 {
66     return infoErrors.size();
67 }
68 
69 void Manager::commit(uint64_t transactionId, std::string errMsg)
70 {
71     auto level = getLevel(errMsg);
72     _commit(transactionId, std::move(errMsg), level);
73 }
74 
75 void Manager::commitWithLvl(uint64_t transactionId, std::string errMsg,
76                             uint32_t errLvl)
77 {
78     _commit(transactionId, std::move(errMsg),
79             static_cast<Entry::Level>(errLvl));
80 }
81 
82 void Manager::_commit(uint64_t transactionId, std::string&& errMsg,
83                       Entry::Level errLvl)
84 {
85     constexpr const auto transactionIdVar = "TRANSACTION_ID";
86     // Length of 'TRANSACTION_ID' string.
87     constexpr const auto transactionIdVarSize = std::strlen(transactionIdVar);
88     // Length of 'TRANSACTION_ID=' string.
89     constexpr const auto transactionIdVarOffset = transactionIdVarSize + 1;
90 
91     // Flush all the pending log messages into the journal
92     journalSync();
93 
94     sd_journal* j = nullptr;
95     int rc = sd_journal_open(&j, SD_JOURNAL_LOCAL_ONLY);
96     if (rc < 0)
97     {
98         logging::log<logging::level::ERR>(
99             "Failed to open journal",
100             logging::entry("DESCRIPTION=%s", strerror(-rc)));
101         return;
102     }
103 
104     std::string transactionIdStr = std::to_string(transactionId);
105     std::set<std::string> metalist;
106     auto metamap = g_errMetaMap.find(errMsg);
107     if (metamap != g_errMetaMap.end())
108     {
109         metalist.insert(metamap->second.begin(), metamap->second.end());
110     }
111 
112     // Add _PID field information in AdditionalData.
113     metalist.insert("_PID");
114 
115     std::vector<std::string> additionalData;
116 
117     // Read the journal from the end to get the most recent entry first.
118     // The result from the sd_journal_get_data() is of the form VARIABLE=value.
119     SD_JOURNAL_FOREACH_BACKWARDS(j)
120     {
121         const char* data = nullptr;
122         size_t length = 0;
123 
124         // Look for the transaction id metadata variable
125         rc = sd_journal_get_data(j, transactionIdVar, (const void**)&data,
126                                  &length);
127         if (rc < 0)
128         {
129             // This journal entry does not have the TRANSACTION_ID
130             // metadata variable.
131             continue;
132         }
133 
134         // journald does not guarantee that sd_journal_get_data() returns NULL
135         // terminated strings, so need to specify the size to use to compare,
136         // use the returned length instead of anything that relies on NULL
137         // terminators like strlen().
138         // The data variable is in the form of 'TRANSACTION_ID=1234'. Remove
139         // the TRANSACTION_ID characters plus the (=) sign to do the comparison.
140         // 'data + transactionIdVarOffset' will be in the form of '1234'.
141         // 'length - transactionIdVarOffset' will be the length of '1234'.
142         if ((length <= (transactionIdVarOffset)) ||
143             (transactionIdStr.compare(0, transactionIdStr.size(),
144                                       data + transactionIdVarOffset,
145                                       length - transactionIdVarOffset) != 0))
146         {
147             // The value of the TRANSACTION_ID metadata is not the requested
148             // transaction id number.
149             continue;
150         }
151 
152         // Search for all metadata variables in the current journal entry.
153         for (auto i = metalist.cbegin(); i != metalist.cend();)
154         {
155             rc = sd_journal_get_data(j, (*i).c_str(), (const void**)&data,
156                                      &length);
157             if (rc < 0)
158             {
159                 // Metadata variable not found, check next metadata variable.
160                 i++;
161                 continue;
162             }
163 
164             // Metadata variable found, save it and remove it from the set.
165             additionalData.emplace_back(data, length);
166             i = metalist.erase(i);
167         }
168         if (metalist.empty())
169         {
170             // All metadata variables found, break out of journal loop.
171             break;
172         }
173     }
174     if (!metalist.empty())
175     {
176         // Not all the metadata variables were found in the journal.
177         for (auto& metaVarStr : metalist)
178         {
179             logging::log<logging::level::INFO>(
180                 "Failed to find metadata",
181                 logging::entry("META_FIELD=%s", metaVarStr.c_str()));
182         }
183     }
184 
185     sd_journal_close(j);
186 
187     createEntry(errMsg, errLvl, additionalData);
188 }
189 
190 void Manager::createEntry(std::string errMsg, Entry::Level errLvl,
191                           std::vector<std::string> additionalData,
192                           const FFDCEntries& ffdc)
193 {
194     if (!Extensions::disableDefaultLogCaps())
195     {
196         if (errLvl < Entry::sevLowerLimit)
197         {
198             if (realErrors.size() >= ERROR_CAP)
199             {
200                 erase(realErrors.front());
201             }
202         }
203         else
204         {
205             if (infoErrors.size() >= ERROR_INFO_CAP)
206             {
207                 erase(infoErrors.front());
208             }
209         }
210     }
211 
212     entryId++;
213     if (errLvl >= Entry::sevLowerLimit)
214     {
215         infoErrors.push_back(entryId);
216     }
217     else
218     {
219         realErrors.push_back(entryId);
220     }
221     auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
222                   std::chrono::system_clock::now().time_since_epoch())
223                   .count();
224     auto objPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId);
225 
226     AssociationList objects{};
227     processMetadata(errMsg, additionalData, objects);
228 
229     auto e = std::make_unique<Entry>(busLog, objPath, entryId,
230                                      ms, // Milliseconds since 1970
231                                      errLvl, std::move(errMsg),
232                                      std::move(additionalData),
233                                      std::move(objects), fwVersion, *this);
234     auto path = serialize(*e);
235     e->path(path);
236 
237     if (isQuiesceOnErrorEnabled() && isCalloutPresent(*e))
238     {
239         quiesceOnError(entryId);
240     }
241 
242     // Add entry before calling the extensions so that they have access to it
243     entries.insert(std::make_pair(entryId, std::move(e)));
244 
245     doExtensionLogCreate(*entries.find(entryId)->second, ffdc);
246 
247     // Note: No need to close the file descriptors in the FFDC.
248 }
249 
250 bool Manager::isQuiesceOnErrorEnabled()
251 {
252     std::variant<bool> property;
253 
254     auto method = this->busLog.new_method_call(
255         "xyz.openbmc_project.Settings", "/xyz/openbmc_project/logging/settings",
256         "org.freedesktop.DBus.Properties", "Get");
257 
258     method.append("xyz.openbmc_project.Logging.Settings", "QuiesceOnHwError");
259 
260     try
261     {
262         auto reply = this->busLog.call(method);
263         reply.read(property);
264     }
265     catch (const SdBusError& e)
266     {
267         log<level::ERR>("Error reading QuiesceOnHwError property",
268                         entry("ERROR=%s", e.what()));
269         throw;
270     }
271 
272     return std::get<bool>(property);
273 }
274 
275 bool Manager::isCalloutPresent(const Entry& entry)
276 {
277     for (const auto& c : entry.additionalData())
278     {
279         if (c.find("CALLOUT_") != std::string::npos)
280         {
281             return true;
282         }
283     }
284 
285     return false;
286 }
287 
288 void Manager::findAndRemoveResolvedBlocks()
289 {
290     for (auto& entry : entries)
291     {
292         if (entry.second->resolved())
293         {
294             checkAndRemoveBlockingError(entry.first);
295         }
296     }
297 }
298 
299 void Manager::onEntryResolve(sdbusplus::message::message& msg)
300 {
301     using Interface = std::string;
302     using Property = std::string;
303     using Value = std::string;
304     using Properties = std::map<Property, std::variant<Value>>;
305 
306     Interface interface;
307     Properties properties;
308 
309     msg.read(interface, properties);
310 
311     for (const auto& p : properties)
312     {
313         if (p.first == "Resolved")
314         {
315             findAndRemoveResolvedBlocks();
316             return;
317         }
318     }
319 }
320 
321 void Manager::checkAndQuiesceHost()
322 {
323     // First check host state
324     std::variant<std::string> property;
325 
326     auto method = this->busLog.new_method_call(
327         "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
328         "org.freedesktop.DBus.Properties", "Get");
329 
330     method.append("xyz.openbmc_project.State.Host", "CurrentHostState");
331 
332     try
333     {
334         auto reply = this->busLog.call(method);
335         reply.read(property);
336     }
337     catch (const SdBusError& e)
338     {
339         // Quiescing the host is a "best effort" type function. If unable to
340         // read the host state or it comes back empty, just return.
341         // The boot block object will still be created and the associations to
342         // find the log will be present. Don't want a dependency with
343         // phosphor-state-manager service
344         log<level::INFO>("Error reading QuiesceOnHwError property",
345                          entry("ERROR=%s", e.what()));
346         return;
347     }
348 
349     std::string hostState = std::get<std::string>(property);
350 
351     // If host state is empty, do nothing
352     if (hostState.empty())
353     {
354         return;
355     }
356 
357     using Host = sdbusplus::xyz::openbmc_project::State::server::Host;
358     auto state = Host::convertHostStateFromString(hostState);
359     if (state != Host::HostState::Running)
360     {
361         return;
362     }
363 
364     auto quiesce = this->busLog.new_method_call(
365         "org.freedesktop.systemd1", "/org/freedesktop/systemd1",
366         "org.freedesktop.systemd1.Manager", "StartUnit");
367 
368     quiesce.append("obmc-host-quiesce@0.target");
369     quiesce.append("replace");
370 
371     this->busLog.call_noreply(quiesce);
372 }
373 
374 void Manager::quiesceOnError(const uint32_t entryId)
375 {
376     // Verify we don't already have this entry blocking
377     auto it = find_if(
378         this->blockingErrors.begin(), this->blockingErrors.end(),
379         [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; });
380     if (it != this->blockingErrors.end())
381     {
382         // Already recorded so just return
383         logging::log<logging::level::DEBUG>(
384             "QuiesceOnError set and callout present but entry already logged");
385         return;
386     }
387 
388     logging::log<logging::level::INFO>(
389         "QuiesceOnError set and callout present");
390 
391     auto blockPath =
392         std::string(OBJ_LOGGING) + "/block" + std::to_string(entryId);
393     auto blockObj = std::make_unique<Block>(this->busLog, blockPath, entryId);
394     this->blockingErrors.push_back(std::move(blockObj));
395 
396     // Register call back if log is resolved
397     using namespace sdbusplus::bus::match::rules;
398     auto entryPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId);
399     auto callback = std::make_unique<sdbusplus::bus::match::match>(
400         this->busLog,
401         propertiesChanged(entryPath, "xyz.openbmc_project.Logging.Entry"),
402         std::bind(std::mem_fn(&Manager::onEntryResolve), this,
403                   std::placeholders::_1));
404 
405     propChangedEntryCallback.insert(
406         std::make_pair(entryId, std::move(callback)));
407 
408     checkAndQuiesceHost();
409 }
410 
411 void Manager::doExtensionLogCreate(const Entry& entry, const FFDCEntries& ffdc)
412 {
413     // Make the association <endpointpath>/<endpointtype> paths
414     std::vector<std::string> assocs;
415     for (const auto& [forwardType, reverseType, endpoint] :
416          entry.associations())
417     {
418         std::string e{endpoint};
419         e += '/' + reverseType;
420         assocs.push_back(e);
421     }
422 
423     for (auto& create : Extensions::getCreateFunctions())
424     {
425         try
426         {
427             create(entry.message(), entry.id(), entry.timestamp(),
428                    entry.severity(), entry.additionalData(), assocs, ffdc);
429         }
430         catch (std::exception& e)
431         {
432             log<level::ERR>("An extension's create function threw an exception",
433                             phosphor::logging::entry("ERROR=%s", e.what()));
434         }
435     }
436 }
437 
438 void Manager::processMetadata(const std::string& errorName,
439                               const std::vector<std::string>& additionalData,
440                               AssociationList& objects) const
441 {
442     // additionalData is a list of "metadata=value"
443     constexpr auto separator = '=';
444     for (const auto& entryItem : additionalData)
445     {
446         auto found = entryItem.find(separator);
447         if (std::string::npos != found)
448         {
449             auto metadata = entryItem.substr(0, found);
450             auto iter = meta.find(metadata);
451             if (meta.end() != iter)
452             {
453                 (iter->second)(metadata, additionalData, objects);
454             }
455         }
456     }
457 }
458 
459 void Manager::checkAndRemoveBlockingError(uint32_t entryId)
460 {
461     // First look for blocking object and remove
462     auto it = find_if(
463         blockingErrors.begin(), blockingErrors.end(),
464         [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; });
465     if (it != blockingErrors.end())
466     {
467         blockingErrors.erase(it);
468     }
469 
470     // Now remove the callback looking for the error to be resolved
471     auto resolveFind = propChangedEntryCallback.find(entryId);
472     if (resolveFind != propChangedEntryCallback.end())
473     {
474         propChangedEntryCallback.erase(resolveFind);
475     }
476 
477     return;
478 }
479 
480 void Manager::erase(uint32_t entryId)
481 {
482     auto entryFound = entries.find(entryId);
483     if (entries.end() != entryFound)
484     {
485         for (auto& func : Extensions::getDeleteProhibitedFunctions())
486         {
487             try
488             {
489                 bool prohibited = false;
490                 func(entryId, prohibited);
491                 if (prohibited)
492                 {
493                     // Future work remains to throw an error here.
494                     return;
495                 }
496             }
497             catch (std::exception& e)
498             {
499                 log<level::ERR>(
500                     "An extension's deleteProhibited function threw "
501                     "an exception",
502                     entry("ERROR=%s", e.what()));
503             }
504         }
505 
506         // Delete the persistent representation of this error.
507         fs::path errorPath(ERRLOG_PERSIST_PATH);
508         errorPath /= std::to_string(entryId);
509         fs::remove(errorPath);
510 
511         auto removeId = [](std::list<uint32_t>& ids, uint32_t id) {
512             auto it = std::find(ids.begin(), ids.end(), id);
513             if (it != ids.end())
514             {
515                 ids.erase(it);
516             }
517         };
518         if (entryFound->second->severity() >= Entry::sevLowerLimit)
519         {
520             removeId(infoErrors, entryId);
521         }
522         else
523         {
524             removeId(realErrors, entryId);
525         }
526         entries.erase(entryFound);
527 
528         checkAndRemoveBlockingError(entryId);
529 
530         for (auto& remove : Extensions::getDeleteFunctions())
531         {
532             try
533             {
534                 remove(entryId);
535             }
536             catch (std::exception& e)
537             {
538                 log<level::ERR>("An extension's delete function threw an "
539                                 "exception",
540                                 entry("ERROR=%s", e.what()));
541             }
542         }
543     }
544     else
545     {
546         logging::log<level::ERR>("Invalid entry ID to delete",
547                                  logging::entry("ID=%d", entryId));
548     }
549 }
550 
551 void Manager::restore()
552 {
553     auto sanity = [](const auto& id, const auto& restoredId) {
554         return id == restoredId;
555     };
556     std::vector<uint32_t> errorIds;
557 
558     fs::path dir(ERRLOG_PERSIST_PATH);
559     if (!fs::exists(dir) || fs::is_empty(dir))
560     {
561         return;
562     }
563 
564     for (auto& file : fs::directory_iterator(dir))
565     {
566         auto id = file.path().filename().c_str();
567         auto idNum = std::stol(id);
568         auto e = std::make_unique<Entry>(
569             busLog, std::string(OBJ_ENTRY) + '/' + id, idNum, *this);
570         if (deserialize(file.path(), *e))
571         {
572             // validate the restored error entry id
573             if (sanity(static_cast<uint32_t>(idNum), e->id()))
574             {
575                 e->path(file.path());
576                 e->emit_object_added();
577                 if (e->severity() >= Entry::sevLowerLimit)
578                 {
579                     infoErrors.push_back(idNum);
580                 }
581                 else
582                 {
583                     realErrors.push_back(idNum);
584                 }
585 
586                 entries.insert(std::make_pair(idNum, std::move(e)));
587                 errorIds.push_back(idNum);
588             }
589             else
590             {
591                 logging::log<logging::level::ERR>(
592                     "Failed in sanity check while restoring error entry. "
593                     "Ignoring error entry",
594                     logging::entry("ID_NUM=%d", idNum),
595                     logging::entry("ENTRY_ID=%d", e->id()));
596             }
597         }
598     }
599 
600     if (!errorIds.empty())
601     {
602         entryId = *(std::max_element(errorIds.begin(), errorIds.end()));
603     }
604 }
605 
606 void Manager::journalSync()
607 {
608     bool syncRequested = false;
609     auto fd = -1;
610     auto rc = -1;
611     auto wd = -1;
612     auto bus = sdbusplus::bus::new_default();
613 
614     auto start =
615         duration_cast<microseconds>(steady_clock::now().time_since_epoch())
616             .count();
617 
618     // Each time an error log is committed, a request to sync the journal
619     // must occur and block that error log commit until it completes. A 5sec
620     // block is done to allow sufficient time for the journal to be synced.
621     //
622     // Number of loop iterations = 3 for the following reasons:
623     // Iteration #1: Requests a journal sync by killing the journald service.
624     // Iteration #2: Setup an inotify watch to monitor the synced file that
625     //               journald updates with the timestamp the last time the
626     //               journal was flushed.
627     // Iteration #3: Poll to wait until inotify reports an event which blocks
628     //               the error log from being commited until the sync completes.
629     constexpr auto maxRetry = 3;
630     for (int i = 0; i < maxRetry; i++)
631     {
632         // Read timestamp from synced file
633         constexpr auto syncedPath = "/run/systemd/journal/synced";
634         std::ifstream syncedFile(syncedPath);
635         if (syncedFile.fail())
636         {
637             // If the synced file doesn't exist, a sync request will create it.
638             if (errno != ENOENT)
639             {
640                 log<level::ERR>("Failed to open journal synced file",
641                                 entry("FILENAME=%s", syncedPath),
642                                 entry("ERRNO=%d", errno));
643                 return;
644             }
645         }
646         else
647         {
648             // Only read the synced file if it exists.
649             // See if a sync happened by now
650             std::string timestampStr;
651             std::getline(syncedFile, timestampStr);
652             auto timestamp = std::stoll(timestampStr);
653             if (timestamp >= start)
654             {
655                 break;
656             }
657         }
658 
659         // Let's ask for a sync, but only once
660         if (!syncRequested)
661         {
662             syncRequested = true;
663 
664             constexpr auto JOURNAL_UNIT = "systemd-journald.service";
665             auto signal = SIGRTMIN + 1;
666 
667             auto method = bus.new_method_call(SYSTEMD_BUSNAME, SYSTEMD_PATH,
668                                               SYSTEMD_INTERFACE, "KillUnit");
669             method.append(JOURNAL_UNIT, "main", signal);
670             bus.call(method);
671             if (method.is_method_error())
672             {
673                 log<level::ERR>("Failed to kill journal service");
674                 break;
675             }
676             continue;
677         }
678 
679         // Let's install the inotify watch, if we didn't do that yet. This watch
680         // monitors the syncedFile for when journald updates it with a newer
681         // timestamp. This means the journal has been flushed.
682         if (fd < 0)
683         {
684             fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
685             if (fd < 0)
686             {
687                 log<level::ERR>("Failed to create inotify watch",
688                                 entry("ERRNO=%d", errno));
689                 return;
690             }
691 
692             constexpr auto JOURNAL_RUN_PATH = "/run/systemd/journal";
693             wd = inotify_add_watch(fd, JOURNAL_RUN_PATH,
694                                    IN_MOVED_TO | IN_DONT_FOLLOW | IN_ONLYDIR);
695             if (wd < 0)
696             {
697                 log<level::ERR>("Failed to watch journal directory",
698                                 entry("PATH=%s", JOURNAL_RUN_PATH),
699                                 entry("ERRNO=%d", errno));
700                 close(fd);
701                 return;
702             }
703             continue;
704         }
705 
706         // Let's wait until inotify reports an event
707         struct pollfd fds = {
708             .fd = fd,
709             .events = POLLIN,
710         };
711         constexpr auto pollTimeout = 5; // 5 seconds
712         rc = poll(&fds, 1, pollTimeout * 1000);
713         if (rc < 0)
714         {
715             log<level::ERR>("Failed to add event", entry("ERRNO=%d", errno),
716                             entry("ERR=%s", strerror(-rc)));
717             inotify_rm_watch(fd, wd);
718             close(fd);
719             return;
720         }
721         else if (rc == 0)
722         {
723             log<level::INFO>("Poll timeout, no new journal synced data",
724                              entry("TIMEOUT=%d", pollTimeout));
725             break;
726         }
727 
728         // Read from the specified file descriptor until there is no new data,
729         // throwing away everything read since the timestamp will be read at the
730         // beginning of the loop.
731         constexpr auto maxBytes = 64;
732         uint8_t buffer[maxBytes];
733         while (read(fd, buffer, maxBytes) > 0)
734             ;
735     }
736 
737     if (fd != -1)
738     {
739         if (wd != -1)
740         {
741             inotify_rm_watch(fd, wd);
742         }
743         close(fd);
744     }
745 
746     return;
747 }
748 
749 std::string Manager::readFWVersion()
750 {
751     auto version = util::getOSReleaseValue("VERSION_ID");
752 
753     if (!version)
754     {
755         log<level::ERR>("Unable to read BMC firmware version");
756     }
757 
758     return version.value_or("");
759 }
760 
761 void Manager::create(const std::string& message, Entry::Level severity,
762                      const std::map<std::string, std::string>& additionalData)
763 {
764     // Convert the map into a vector of "key=value" strings
765     std::vector<std::string> ad;
766     metadata::associations::combine(additionalData, ad);
767 
768     createEntry(message, severity, ad);
769 }
770 
771 void Manager::createWithFFDC(
772     const std::string& message, Entry::Level severity,
773     const std::map<std::string, std::string>& additionalData,
774     const FFDCEntries& ffdc)
775 {
776     // Convert the map into a vector of "key=value" strings
777     std::vector<std::string> ad;
778     metadata::associations::combine(additionalData, ad);
779 
780     createEntry(message, severity, ad, ffdc);
781 }
782 
783 } // namespace internal
784 } // namespace logging
785 } // namespace phosphor
786