1 #include "config.h"
2 
3 #include "log_manager.hpp"
4 
5 #include "elog_entry.hpp"
6 #include "elog_meta.hpp"
7 #include "elog_serialize.hpp"
8 #include "extensions.hpp"
9 #include "util.hpp"
10 
11 #include <poll.h>
12 #include <sys/inotify.h>
13 #include <systemd/sd-bus.h>
14 #include <systemd/sd-journal.h>
15 #include <unistd.h>
16 
17 #include <cassert>
18 #include <chrono>
19 #include <cstdio>
20 #include <cstring>
21 #include <fstream>
22 #include <functional>
23 #include <future>
24 #include <iostream>
25 #include <map>
26 #include <phosphor-logging/log.hpp>
27 #include <sdbusplus/vtable.hpp>
28 #include <set>
29 #include <string>
30 #include <vector>
31 #include <xyz/openbmc_project/State/Host/server.hpp>
32 
33 using namespace phosphor::logging;
34 using namespace std::chrono;
35 using sdbusplus::exception::SdBusError;
36 extern const std::map<metadata::Metadata,
37                       std::function<metadata::associations::Type>>
38     meta;
39 
40 namespace phosphor
41 {
42 namespace logging
43 {
44 namespace internal
45 {
46 inline auto getLevel(const std::string& errMsg)
47 {
48     auto reqLevel = Entry::Level::Error; // Default to Error
49 
50     auto levelmap = g_errLevelMap.find(errMsg);
51     if (levelmap != g_errLevelMap.end())
52     {
53         reqLevel = static_cast<Entry::Level>(levelmap->second);
54     }
55 
56     return reqLevel;
57 }
58 
59 int Manager::getRealErrSize()
60 {
61     return realErrors.size();
62 }
63 
64 int Manager::getInfoErrSize()
65 {
66     return infoErrors.size();
67 }
68 
69 void Manager::commit(uint64_t transactionId, std::string errMsg)
70 {
71     auto level = getLevel(errMsg);
72     _commit(transactionId, std::move(errMsg), level);
73 }
74 
75 void Manager::commitWithLvl(uint64_t transactionId, std::string errMsg,
76                             uint32_t errLvl)
77 {
78     _commit(transactionId, std::move(errMsg),
79             static_cast<Entry::Level>(errLvl));
80 }
81 
82 void Manager::_commit(uint64_t transactionId, std::string&& errMsg,
83                       Entry::Level errLvl)
84 {
85     constexpr const auto transactionIdVar = "TRANSACTION_ID";
86     // Length of 'TRANSACTION_ID' string.
87     constexpr const auto transactionIdVarSize = std::strlen(transactionIdVar);
88     // Length of 'TRANSACTION_ID=' string.
89     constexpr const auto transactionIdVarOffset = transactionIdVarSize + 1;
90 
91     // Flush all the pending log messages into the journal
92     journalSync();
93 
94     sd_journal* j = nullptr;
95     int rc = sd_journal_open(&j, SD_JOURNAL_LOCAL_ONLY);
96     if (rc < 0)
97     {
98         logging::log<logging::level::ERR>(
99             "Failed to open journal",
100             logging::entry("DESCRIPTION=%s", strerror(-rc)));
101         return;
102     }
103 
104     std::string transactionIdStr = std::to_string(transactionId);
105     std::set<std::string> metalist;
106     auto metamap = g_errMetaMap.find(errMsg);
107     if (metamap != g_errMetaMap.end())
108     {
109         metalist.insert(metamap->second.begin(), metamap->second.end());
110     }
111 
112     // Add _PID field information in AdditionalData.
113     metalist.insert("_PID");
114 
115     std::vector<std::string> additionalData;
116 
117     // Read the journal from the end to get the most recent entry first.
118     // The result from the sd_journal_get_data() is of the form VARIABLE=value.
119     SD_JOURNAL_FOREACH_BACKWARDS(j)
120     {
121         const char* data = nullptr;
122         size_t length = 0;
123 
124         // Look for the transaction id metadata variable
125         rc = sd_journal_get_data(j, transactionIdVar, (const void**)&data,
126                                  &length);
127         if (rc < 0)
128         {
129             // This journal entry does not have the TRANSACTION_ID
130             // metadata variable.
131             continue;
132         }
133 
134         // journald does not guarantee that sd_journal_get_data() returns NULL
135         // terminated strings, so need to specify the size to use to compare,
136         // use the returned length instead of anything that relies on NULL
137         // terminators like strlen().
138         // The data variable is in the form of 'TRANSACTION_ID=1234'. Remove
139         // the TRANSACTION_ID characters plus the (=) sign to do the comparison.
140         // 'data + transactionIdVarOffset' will be in the form of '1234'.
141         // 'length - transactionIdVarOffset' will be the length of '1234'.
142         if ((length <= (transactionIdVarOffset)) ||
143             (transactionIdStr.compare(0, transactionIdStr.size(),
144                                       data + transactionIdVarOffset,
145                                       length - transactionIdVarOffset) != 0))
146         {
147             // The value of the TRANSACTION_ID metadata is not the requested
148             // transaction id number.
149             continue;
150         }
151 
152         // Search for all metadata variables in the current journal entry.
153         for (auto i = metalist.cbegin(); i != metalist.cend();)
154         {
155             rc = sd_journal_get_data(j, (*i).c_str(), (const void**)&data,
156                                      &length);
157             if (rc < 0)
158             {
159                 // Metadata variable not found, check next metadata variable.
160                 i++;
161                 continue;
162             }
163 
164             // Metadata variable found, save it and remove it from the set.
165             additionalData.emplace_back(data, length);
166             i = metalist.erase(i);
167         }
168         if (metalist.empty())
169         {
170             // All metadata variables found, break out of journal loop.
171             break;
172         }
173     }
174     if (!metalist.empty())
175     {
176         // Not all the metadata variables were found in the journal.
177         for (auto& metaVarStr : metalist)
178         {
179             logging::log<logging::level::INFO>(
180                 "Failed to find metadata",
181                 logging::entry("META_FIELD=%s", metaVarStr.c_str()));
182         }
183     }
184 
185     sd_journal_close(j);
186 
187     createEntry(errMsg, errLvl, additionalData);
188 }
189 
190 void Manager::createEntry(std::string errMsg, Entry::Level errLvl,
191                           std::vector<std::string> additionalData,
192                           const FFDCEntries& ffdc)
193 {
194     if (!Extensions::disableDefaultLogCaps())
195     {
196         if (errLvl < Entry::sevLowerLimit)
197         {
198             if (realErrors.size() >= ERROR_CAP)
199             {
200                 erase(realErrors.front());
201             }
202         }
203         else
204         {
205             if (infoErrors.size() >= ERROR_INFO_CAP)
206             {
207                 erase(infoErrors.front());
208             }
209         }
210     }
211 
212     entryId++;
213     if (errLvl >= Entry::sevLowerLimit)
214     {
215         infoErrors.push_back(entryId);
216     }
217     else
218     {
219         realErrors.push_back(entryId);
220     }
221     auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
222                   std::chrono::system_clock::now().time_since_epoch())
223                   .count();
224     auto objPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId);
225 
226     AssociationList objects{};
227     processMetadata(errMsg, additionalData, objects);
228 
229     auto e = std::make_unique<Entry>(busLog, objPath, entryId,
230                                      ms, // Milliseconds since 1970
231                                      errLvl, std::move(errMsg),
232                                      std::move(additionalData),
233                                      std::move(objects), fwVersion, *this);
234     serialize(*e);
235 
236     if (isQuiesceOnErrorEnabled() && isCalloutPresent(*e))
237     {
238         quiesceOnError(entryId);
239     }
240 
241     doExtensionLogCreate(*e, ffdc);
242 
243     // Note: No need to close the file descriptors in the FFDC.
244 
245     entries.insert(std::make_pair(entryId, std::move(e)));
246 }
247 
248 bool Manager::isQuiesceOnErrorEnabled()
249 {
250     std::variant<bool> property;
251 
252     auto method = this->busLog.new_method_call(
253         "xyz.openbmc_project.Settings", "/xyz/openbmc_project/logging/settings",
254         "org.freedesktop.DBus.Properties", "Get");
255 
256     method.append("xyz.openbmc_project.Logging.Settings", "QuiesceOnHwError");
257 
258     try
259     {
260         auto reply = this->busLog.call(method);
261         reply.read(property);
262     }
263     catch (const SdBusError& e)
264     {
265         log<level::ERR>("Error reading QuiesceOnHwError property",
266                         entry("ERROR=%s", e.what()));
267         throw;
268     }
269 
270     return std::get<bool>(property);
271 }
272 
273 bool Manager::isCalloutPresent(const Entry& entry)
274 {
275     for (const auto& c : entry.additionalData())
276     {
277         if (c.find("CALLOUT_") != std::string::npos)
278         {
279             return true;
280         }
281     }
282 
283     return false;
284 }
285 
286 void Manager::findAndRemoveResolvedBlocks()
287 {
288     for (auto& entry : entries)
289     {
290         if (entry.second->resolved())
291         {
292             checkAndRemoveBlockingError(entry.first);
293         }
294     }
295 }
296 
297 void Manager::onEntryResolve(sdbusplus::message::message& msg)
298 {
299     using Interface = std::string;
300     using Property = std::string;
301     using Value = std::string;
302     using Properties = std::map<Property, std::variant<Value>>;
303 
304     Interface interface;
305     Properties properties;
306 
307     msg.read(interface, properties);
308 
309     for (const auto& p : properties)
310     {
311         if (p.first == "Resolved")
312         {
313             findAndRemoveResolvedBlocks();
314             return;
315         }
316     }
317 }
318 
319 void Manager::checkAndQuiesceHost()
320 {
321     // First check host state
322     std::variant<std::string> property;
323 
324     auto method = this->busLog.new_method_call(
325         "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
326         "org.freedesktop.DBus.Properties", "Get");
327 
328     method.append("xyz.openbmc_project.State.Host", "CurrentHostState");
329 
330     try
331     {
332         auto reply = this->busLog.call(method);
333         reply.read(property);
334     }
335     catch (const SdBusError& e)
336     {
337         // Quiescing the host is a "best effort" type function. If unable to
338         // read the host state or it comes back empty, just return.
339         // The boot block object will still be created and the associations to
340         // find the log will be present. Don't want a dependency with
341         // phosphor-state-manager service
342         log<level::INFO>("Error reading QuiesceOnHwError property",
343                          entry("ERROR=%s", e.what()));
344         return;
345     }
346 
347     std::string hostState = std::get<std::string>(property);
348 
349     // If host state is empty, do nothing
350     if (hostState.empty())
351     {
352         return;
353     }
354 
355     using Host = sdbusplus::xyz::openbmc_project::State::server::Host;
356     auto state = Host::convertHostStateFromString(hostState);
357     if (state != Host::HostState::Running)
358     {
359         return;
360     }
361 
362     auto quiesce = this->busLog.new_method_call(
363         "org.freedesktop.systemd1", "/org/freedesktop/systemd1",
364         "org.freedesktop.systemd1.Manager", "StartUnit");
365 
366     quiesce.append("obmc-host-quiesce@0.target");
367     quiesce.append("replace");
368 
369     this->busLog.call_noreply(quiesce);
370 }
371 
372 void Manager::quiesceOnError(const uint32_t entryId)
373 {
374     // Verify we don't already have this entry blocking
375     auto it = find_if(
376         this->blockingErrors.begin(), this->blockingErrors.end(),
377         [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; });
378     if (it != this->blockingErrors.end())
379     {
380         // Already recorded so just return
381         logging::log<logging::level::DEBUG>(
382             "QuiesceOnError set and callout present but entry already logged");
383         return;
384     }
385 
386     logging::log<logging::level::INFO>(
387         "QuiesceOnError set and callout present");
388 
389     auto blockPath =
390         std::string(OBJ_LOGGING) + "/block" + std::to_string(entryId);
391     auto blockObj = std::make_unique<Block>(this->busLog, blockPath, entryId);
392     this->blockingErrors.push_back(std::move(blockObj));
393 
394     // Register call back if log is resolved
395     using namespace sdbusplus::bus::match::rules;
396     auto entryPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId);
397     auto callback = std::make_unique<sdbusplus::bus::match::match>(
398         this->busLog,
399         propertiesChanged(entryPath, "xyz.openbmc_project.Logging.Entry"),
400         std::bind(std::mem_fn(&Manager::onEntryResolve), this,
401                   std::placeholders::_1));
402 
403     propChangedEntryCallback.insert(
404         std::make_pair(entryId, std::move(callback)));
405 
406     checkAndQuiesceHost();
407 }
408 
409 void Manager::doExtensionLogCreate(const Entry& entry, const FFDCEntries& ffdc)
410 {
411     // Make the association <endpointpath>/<endpointtype> paths
412     std::vector<std::string> assocs;
413     for (const auto& [forwardType, reverseType, endpoint] :
414          entry.associations())
415     {
416         std::string e{endpoint};
417         e += '/' + reverseType;
418         assocs.push_back(e);
419     }
420 
421     for (auto& create : Extensions::getCreateFunctions())
422     {
423         try
424         {
425             create(entry.message(), entry.id(), entry.timestamp(),
426                    entry.severity(), entry.additionalData(), assocs, ffdc);
427         }
428         catch (std::exception& e)
429         {
430             log<level::ERR>("An extension's create function threw an exception",
431                             phosphor::logging::entry("ERROR=%s", e.what()));
432         }
433     }
434 }
435 
436 void Manager::processMetadata(const std::string& errorName,
437                               const std::vector<std::string>& additionalData,
438                               AssociationList& objects) const
439 {
440     // additionalData is a list of "metadata=value"
441     constexpr auto separator = '=';
442     for (const auto& entryItem : additionalData)
443     {
444         auto found = entryItem.find(separator);
445         if (std::string::npos != found)
446         {
447             auto metadata = entryItem.substr(0, found);
448             auto iter = meta.find(metadata);
449             if (meta.end() != iter)
450             {
451                 (iter->second)(metadata, additionalData, objects);
452             }
453         }
454     }
455 }
456 
457 void Manager::checkAndRemoveBlockingError(uint32_t entryId)
458 {
459     // First look for blocking object and remove
460     auto it = find_if(
461         blockingErrors.begin(), blockingErrors.end(),
462         [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; });
463     if (it != blockingErrors.end())
464     {
465         blockingErrors.erase(it);
466     }
467 
468     // Now remove the callback looking for the error to be resolved
469     auto resolveFind = propChangedEntryCallback.find(entryId);
470     if (resolveFind != propChangedEntryCallback.end())
471     {
472         propChangedEntryCallback.erase(resolveFind);
473     }
474 
475     return;
476 }
477 
478 void Manager::erase(uint32_t entryId)
479 {
480     auto entryFound = entries.find(entryId);
481     if (entries.end() != entryFound)
482     {
483         for (auto& func : Extensions::getDeleteProhibitedFunctions())
484         {
485             try
486             {
487                 bool prohibited = false;
488                 func(entryId, prohibited);
489                 if (prohibited)
490                 {
491                     // Future work remains to throw an error here.
492                     return;
493                 }
494             }
495             catch (std::exception& e)
496             {
497                 log<level::ERR>(
498                     "An extension's deleteProhibited function threw "
499                     "an exception",
500                     entry("ERROR=%s", e.what()));
501             }
502         }
503 
504         // Delete the persistent representation of this error.
505         fs::path errorPath(ERRLOG_PERSIST_PATH);
506         errorPath /= std::to_string(entryId);
507         fs::remove(errorPath);
508 
509         auto removeId = [](std::list<uint32_t>& ids, uint32_t id) {
510             auto it = std::find(ids.begin(), ids.end(), id);
511             if (it != ids.end())
512             {
513                 ids.erase(it);
514             }
515         };
516         if (entryFound->second->severity() >= Entry::sevLowerLimit)
517         {
518             removeId(infoErrors, entryId);
519         }
520         else
521         {
522             removeId(realErrors, entryId);
523         }
524         entries.erase(entryFound);
525 
526         checkAndRemoveBlockingError(entryId);
527 
528         for (auto& remove : Extensions::getDeleteFunctions())
529         {
530             try
531             {
532                 remove(entryId);
533             }
534             catch (std::exception& e)
535             {
536                 log<level::ERR>("An extension's delete function threw an "
537                                 "exception",
538                                 entry("ERROR=%s", e.what()));
539             }
540         }
541     }
542     else
543     {
544         logging::log<level::ERR>("Invalid entry ID to delete",
545                                  logging::entry("ID=%d", entryId));
546     }
547 }
548 
549 void Manager::restore()
550 {
551     auto sanity = [](const auto& id, const auto& restoredId) {
552         return id == restoredId;
553     };
554     std::vector<uint32_t> errorIds;
555 
556     fs::path dir(ERRLOG_PERSIST_PATH);
557     if (!fs::exists(dir) || fs::is_empty(dir))
558     {
559         return;
560     }
561 
562     for (auto& file : fs::directory_iterator(dir))
563     {
564         auto id = file.path().filename().c_str();
565         auto idNum = std::stol(id);
566         auto e = std::make_unique<Entry>(
567             busLog, std::string(OBJ_ENTRY) + '/' + id, idNum, *this);
568         if (deserialize(file.path(), *e))
569         {
570             // validate the restored error entry id
571             if (sanity(static_cast<uint32_t>(idNum), e->id()))
572             {
573                 e->emit_object_added();
574                 if (e->severity() >= Entry::sevLowerLimit)
575                 {
576                     infoErrors.push_back(idNum);
577                 }
578                 else
579                 {
580                     realErrors.push_back(idNum);
581                 }
582 
583                 entries.insert(std::make_pair(idNum, std::move(e)));
584                 errorIds.push_back(idNum);
585             }
586             else
587             {
588                 logging::log<logging::level::ERR>(
589                     "Failed in sanity check while restoring error entry. "
590                     "Ignoring error entry",
591                     logging::entry("ID_NUM=%d", idNum),
592                     logging::entry("ENTRY_ID=%d", e->id()));
593             }
594         }
595     }
596 
597     if (!errorIds.empty())
598     {
599         entryId = *(std::max_element(errorIds.begin(), errorIds.end()));
600     }
601 }
602 
603 void Manager::journalSync()
604 {
605     bool syncRequested = false;
606     auto fd = -1;
607     auto rc = -1;
608     auto wd = -1;
609     auto bus = sdbusplus::bus::new_default();
610 
611     auto start =
612         duration_cast<microseconds>(steady_clock::now().time_since_epoch())
613             .count();
614 
615     // Each time an error log is committed, a request to sync the journal
616     // must occur and block that error log commit until it completes. A 5sec
617     // block is done to allow sufficient time for the journal to be synced.
618     //
619     // Number of loop iterations = 3 for the following reasons:
620     // Iteration #1: Requests a journal sync by killing the journald service.
621     // Iteration #2: Setup an inotify watch to monitor the synced file that
622     //               journald updates with the timestamp the last time the
623     //               journal was flushed.
624     // Iteration #3: Poll to wait until inotify reports an event which blocks
625     //               the error log from being commited until the sync completes.
626     constexpr auto maxRetry = 3;
627     for (int i = 0; i < maxRetry; i++)
628     {
629         // Read timestamp from synced file
630         constexpr auto syncedPath = "/run/systemd/journal/synced";
631         std::ifstream syncedFile(syncedPath);
632         if (syncedFile.fail())
633         {
634             // If the synced file doesn't exist, a sync request will create it.
635             if (errno != ENOENT)
636             {
637                 log<level::ERR>("Failed to open journal synced file",
638                                 entry("FILENAME=%s", syncedPath),
639                                 entry("ERRNO=%d", errno));
640                 return;
641             }
642         }
643         else
644         {
645             // Only read the synced file if it exists.
646             // See if a sync happened by now
647             std::string timestampStr;
648             std::getline(syncedFile, timestampStr);
649             auto timestamp = std::stoll(timestampStr);
650             if (timestamp >= start)
651             {
652                 break;
653             }
654         }
655 
656         // Let's ask for a sync, but only once
657         if (!syncRequested)
658         {
659             syncRequested = true;
660 
661             constexpr auto JOURNAL_UNIT = "systemd-journald.service";
662             auto signal = SIGRTMIN + 1;
663 
664             auto method = bus.new_method_call(SYSTEMD_BUSNAME, SYSTEMD_PATH,
665                                               SYSTEMD_INTERFACE, "KillUnit");
666             method.append(JOURNAL_UNIT, "main", signal);
667             bus.call(method);
668             if (method.is_method_error())
669             {
670                 log<level::ERR>("Failed to kill journal service");
671                 break;
672             }
673             continue;
674         }
675 
676         // Let's install the inotify watch, if we didn't do that yet. This watch
677         // monitors the syncedFile for when journald updates it with a newer
678         // timestamp. This means the journal has been flushed.
679         if (fd < 0)
680         {
681             fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
682             if (fd < 0)
683             {
684                 log<level::ERR>("Failed to create inotify watch",
685                                 entry("ERRNO=%d", errno));
686                 return;
687             }
688 
689             constexpr auto JOURNAL_RUN_PATH = "/run/systemd/journal";
690             wd = inotify_add_watch(fd, JOURNAL_RUN_PATH,
691                                    IN_MOVED_TO | IN_DONT_FOLLOW | IN_ONLYDIR);
692             if (wd < 0)
693             {
694                 log<level::ERR>("Failed to watch journal directory",
695                                 entry("PATH=%s", JOURNAL_RUN_PATH),
696                                 entry("ERRNO=%d", errno));
697                 close(fd);
698                 return;
699             }
700             continue;
701         }
702 
703         // Let's wait until inotify reports an event
704         struct pollfd fds = {
705             .fd = fd,
706             .events = POLLIN,
707         };
708         constexpr auto pollTimeout = 5; // 5 seconds
709         rc = poll(&fds, 1, pollTimeout * 1000);
710         if (rc < 0)
711         {
712             log<level::ERR>("Failed to add event", entry("ERRNO=%d", errno),
713                             entry("ERR=%s", strerror(-rc)));
714             inotify_rm_watch(fd, wd);
715             close(fd);
716             return;
717         }
718         else if (rc == 0)
719         {
720             log<level::INFO>("Poll timeout, no new journal synced data",
721                              entry("TIMEOUT=%d", pollTimeout));
722             break;
723         }
724 
725         // Read from the specified file descriptor until there is no new data,
726         // throwing away everything read since the timestamp will be read at the
727         // beginning of the loop.
728         constexpr auto maxBytes = 64;
729         uint8_t buffer[maxBytes];
730         while (read(fd, buffer, maxBytes) > 0)
731             ;
732     }
733 
734     if (fd != -1)
735     {
736         if (wd != -1)
737         {
738             inotify_rm_watch(fd, wd);
739         }
740         close(fd);
741     }
742 
743     return;
744 }
745 
746 std::string Manager::readFWVersion()
747 {
748     auto version = util::getOSReleaseValue("VERSION_ID");
749 
750     if (!version)
751     {
752         log<level::ERR>("Unable to read BMC firmware version");
753     }
754 
755     return version.value_or("");
756 }
757 
758 void Manager::create(const std::string& message, Entry::Level severity,
759                      const std::map<std::string, std::string>& additionalData)
760 {
761     // Convert the map into a vector of "key=value" strings
762     std::vector<std::string> ad;
763     metadata::associations::combine(additionalData, ad);
764 
765     createEntry(message, severity, ad);
766 }
767 
768 void Manager::createWithFFDC(
769     const std::string& message, Entry::Level severity,
770     const std::map<std::string, std::string>& additionalData,
771     const FFDCEntries& ffdc)
772 {
773     // Convert the map into a vector of "key=value" strings
774     std::vector<std::string> ad;
775     metadata::associations::combine(additionalData, ad);
776 
777     createEntry(message, severity, ad, ffdc);
778 }
779 
780 } // namespace internal
781 } // namespace logging
782 } // namespace phosphor
783