1 #include "config.h"
2 
3 #include "log_manager.hpp"
4 
5 #include "elog_entry.hpp"
6 #include "elog_meta.hpp"
7 #include "elog_serialize.hpp"
8 #include "extensions.hpp"
9 #include "util.hpp"
10 
11 #include <poll.h>
12 #include <sys/inotify.h>
13 #include <systemd/sd-bus.h>
14 #include <systemd/sd-journal.h>
15 #include <unistd.h>
16 
17 #include <cassert>
18 #include <chrono>
19 #include <cstdio>
20 #include <cstring>
21 #include <fstream>
22 #include <functional>
23 #include <future>
24 #include <iostream>
25 #include <map>
26 #include <phosphor-logging/lg2.hpp>
27 #include <sdbusplus/vtable.hpp>
28 #include <set>
29 #include <string>
30 #include <string_view>
31 #include <vector>
32 #include <xyz/openbmc_project/State/Host/server.hpp>
33 
34 using namespace std::chrono;
35 using sdbusplus::exception::SdBusError;
36 extern const std::map<
37     phosphor::logging::metadata::Metadata,
38     std::function<phosphor::logging::metadata::associations::Type>>
39     meta;
40 
41 namespace phosphor
42 {
43 namespace logging
44 {
45 namespace internal
46 {
47 inline auto getLevel(const std::string& errMsg)
48 {
49     auto reqLevel = Entry::Level::Error; // Default to Error
50 
51     auto levelmap = g_errLevelMap.find(errMsg);
52     if (levelmap != g_errLevelMap.end())
53     {
54         reqLevel = static_cast<Entry::Level>(levelmap->second);
55     }
56 
57     return reqLevel;
58 }
59 
60 int Manager::getRealErrSize()
61 {
62     return realErrors.size();
63 }
64 
65 int Manager::getInfoErrSize()
66 {
67     return infoErrors.size();
68 }
69 
70 uint32_t Manager::commit(uint64_t transactionId, std::string errMsg)
71 {
72     auto level = getLevel(errMsg);
73     _commit(transactionId, std::move(errMsg), level);
74     return entryId;
75 }
76 
77 uint32_t Manager::commitWithLvl(uint64_t transactionId, std::string errMsg,
78                                 uint32_t errLvl)
79 {
80     _commit(transactionId, std::move(errMsg),
81             static_cast<Entry::Level>(errLvl));
82     return entryId;
83 }
84 
85 void Manager::_commit(uint64_t transactionId [[maybe_unused]],
86                       std::string&& errMsg, Entry::Level errLvl)
87 {
88     std::vector<std::string> additionalData{};
89 
90     // When running as a test-case, the system may have a LOT of journal
91     // data and we may not have permissions to do some of the journal sync
92     // operations.  Just skip over them.
93     if (!IS_UNIT_TEST)
94     {
95         static constexpr auto transactionIdVar =
96             std::string_view{"TRANSACTION_ID"};
97         // Length of 'TRANSACTION_ID' string.
98         static constexpr auto transactionIdVarSize = transactionIdVar.size();
99         // Length of 'TRANSACTION_ID=' string.
100         static constexpr auto transactionIdVarOffset = transactionIdVarSize + 1;
101 
102         // Flush all the pending log messages into the journal
103         journalSync();
104 
105         sd_journal* j = nullptr;
106         int rc = sd_journal_open(&j, SD_JOURNAL_LOCAL_ONLY);
107         if (rc < 0)
108         {
109             lg2::error("Failed to open journal: {ERROR}", "ERROR",
110                        strerror(-rc));
111             return;
112         }
113 
114         std::string transactionIdStr = std::to_string(transactionId);
115         std::set<std::string> metalist;
116         auto metamap = g_errMetaMap.find(errMsg);
117         if (metamap != g_errMetaMap.end())
118         {
119             metalist.insert(metamap->second.begin(), metamap->second.end());
120         }
121 
122         // Add _PID field information in AdditionalData.
123         metalist.insert("_PID");
124 
125         // Read the journal from the end to get the most recent entry first.
126         // The result from the sd_journal_get_data() is of the form
127         // VARIABLE=value.
128         SD_JOURNAL_FOREACH_BACKWARDS(j)
129         {
130             const char* data = nullptr;
131             size_t length = 0;
132 
133             // Look for the transaction id metadata variable
134             rc = sd_journal_get_data(j, transactionIdVar.data(),
135                                      (const void**)&data, &length);
136             if (rc < 0)
137             {
138                 // This journal entry does not have the TRANSACTION_ID
139                 // metadata variable.
140                 continue;
141             }
142 
143             // journald does not guarantee that sd_journal_get_data() returns
144             // NULL terminated strings, so need to specify the size to use to
145             // compare, use the returned length instead of anything that relies
146             // on NULL terminators like strlen(). The data variable is in the
147             // form of 'TRANSACTION_ID=1234'. Remove the TRANSACTION_ID
148             // characters plus the (=) sign to do the comparison. 'data +
149             // transactionIdVarOffset' will be in the form of '1234'. 'length -
150             // transactionIdVarOffset' will be the length of '1234'.
151             if ((length <= (transactionIdVarOffset)) ||
152                 (transactionIdStr.compare(
153                      0, transactionIdStr.size(), data + transactionIdVarOffset,
154                      length - transactionIdVarOffset) != 0))
155             {
156                 // The value of the TRANSACTION_ID metadata is not the requested
157                 // transaction id number.
158                 continue;
159             }
160 
161             // Search for all metadata variables in the current journal entry.
162             for (auto i = metalist.cbegin(); i != metalist.cend();)
163             {
164                 rc = sd_journal_get_data(j, (*i).c_str(), (const void**)&data,
165                                          &length);
166                 if (rc < 0)
167                 {
168                     // Metadata variable not found, check next metadata
169                     // variable.
170                     i++;
171                     continue;
172                 }
173 
174                 // Metadata variable found, save it and remove it from the set.
175                 additionalData.emplace_back(data, length);
176                 i = metalist.erase(i);
177             }
178             if (metalist.empty())
179             {
180                 // All metadata variables found, break out of journal loop.
181                 break;
182             }
183         }
184         if (!metalist.empty())
185         {
186             // Not all the metadata variables were found in the journal.
187             for (auto& metaVarStr : metalist)
188             {
189                 lg2::info("Failed to find metadata: {META_FIELD}", "META_FIELD",
190                           metaVarStr);
191             }
192         }
193 
194         sd_journal_close(j);
195     }
196     createEntry(errMsg, errLvl, additionalData);
197 }
198 
199 void Manager::createEntry(std::string errMsg, Entry::Level errLvl,
200                           std::vector<std::string> additionalData,
201                           const FFDCEntries& ffdc)
202 {
203     if (!Extensions::disableDefaultLogCaps())
204     {
205         if (errLvl < Entry::sevLowerLimit)
206         {
207             if (realErrors.size() >= ERROR_CAP)
208             {
209                 erase(realErrors.front());
210             }
211         }
212         else
213         {
214             if (infoErrors.size() >= ERROR_INFO_CAP)
215             {
216                 erase(infoErrors.front());
217             }
218         }
219     }
220 
221     entryId++;
222     if (errLvl >= Entry::sevLowerLimit)
223     {
224         infoErrors.push_back(entryId);
225     }
226     else
227     {
228         realErrors.push_back(entryId);
229     }
230     auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
231                   std::chrono::system_clock::now().time_since_epoch())
232                   .count();
233     auto objPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId);
234 
235     AssociationList objects{};
236     processMetadata(errMsg, additionalData, objects);
237 
238     auto e = std::make_unique<Entry>(busLog, objPath, entryId,
239                                      ms, // Milliseconds since 1970
240                                      errLvl, std::move(errMsg),
241                                      std::move(additionalData),
242                                      std::move(objects), fwVersion, *this);
243     auto path = serialize(*e);
244     e->path(path);
245 
246     if (isQuiesceOnErrorEnabled() && isCalloutPresent(*e))
247     {
248         quiesceOnError(entryId);
249     }
250 
251     // Add entry before calling the extensions so that they have access to it
252     entries.insert(std::make_pair(entryId, std::move(e)));
253 
254     doExtensionLogCreate(*entries.find(entryId)->second, ffdc);
255 
256     // Note: No need to close the file descriptors in the FFDC.
257 }
258 
259 bool Manager::isQuiesceOnErrorEnabled()
260 {
261     // When running under tests, the Logging.Settings service will not be
262     // present.  Assume false.
263     if (IS_UNIT_TEST)
264     {
265         return false;
266     }
267 
268     std::variant<bool> property;
269 
270     auto method = this->busLog.new_method_call(
271         "xyz.openbmc_project.Settings", "/xyz/openbmc_project/logging/settings",
272         "org.freedesktop.DBus.Properties", "Get");
273 
274     method.append("xyz.openbmc_project.Logging.Settings", "QuiesceOnHwError");
275 
276     try
277     {
278         auto reply = this->busLog.call(method);
279         reply.read(property);
280     }
281     catch (const SdBusError& e)
282     {
283         lg2::error("Error reading QuiesceOnHwError property: {ERROR}", "ERROR",
284                    e);
285         throw;
286     }
287 
288     return std::get<bool>(property);
289 }
290 
291 bool Manager::isCalloutPresent(const Entry& entry)
292 {
293     for (const auto& c : entry.additionalData())
294     {
295         if (c.find("CALLOUT_") != std::string::npos)
296         {
297             return true;
298         }
299     }
300 
301     return false;
302 }
303 
304 void Manager::findAndRemoveResolvedBlocks()
305 {
306     for (auto& entry : entries)
307     {
308         if (entry.second->resolved())
309         {
310             checkAndRemoveBlockingError(entry.first);
311         }
312     }
313 }
314 
315 void Manager::onEntryResolve(sdbusplus::message::message& msg)
316 {
317     using Interface = std::string;
318     using Property = std::string;
319     using Value = std::string;
320     using Properties = std::map<Property, std::variant<Value>>;
321 
322     Interface interface;
323     Properties properties;
324 
325     msg.read(interface, properties);
326 
327     for (const auto& p : properties)
328     {
329         if (p.first == "Resolved")
330         {
331             findAndRemoveResolvedBlocks();
332             return;
333         }
334     }
335 }
336 
337 void Manager::checkAndQuiesceHost()
338 {
339     using Host = sdbusplus::xyz::openbmc_project::State::server::Host;
340 
341     // First check host state
342     std::variant<Host::HostState> property;
343 
344     auto method = this->busLog.new_method_call(
345         "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
346         "org.freedesktop.DBus.Properties", "Get");
347 
348     method.append("xyz.openbmc_project.State.Host", "CurrentHostState");
349 
350     try
351     {
352         auto reply = this->busLog.call(method);
353         reply.read(property);
354     }
355     catch (const SdBusError& e)
356     {
357         // Quiescing the host is a "best effort" type function. If unable to
358         // read the host state or it comes back empty, just return.
359         // The boot block object will still be created and the associations to
360         // find the log will be present. Don't want a dependency with
361         // phosphor-state-manager service
362         lg2::info("Error reading QuiesceOnHwError property: {ERROR}", "ERROR",
363                   e);
364         return;
365     }
366 
367     auto hostState = std::get<Host::HostState>(property);
368     if (hostState != Host::HostState::Running)
369     {
370         return;
371     }
372 
373     auto quiesce = this->busLog.new_method_call(
374         "org.freedesktop.systemd1", "/org/freedesktop/systemd1",
375         "org.freedesktop.systemd1.Manager", "StartUnit");
376 
377     quiesce.append("obmc-host-quiesce@0.target");
378     quiesce.append("replace");
379 
380     this->busLog.call_noreply(quiesce);
381 }
382 
383 void Manager::quiesceOnError(const uint32_t entryId)
384 {
385     // Verify we don't already have this entry blocking
386     auto it = find_if(
387         this->blockingErrors.begin(), this->blockingErrors.end(),
388         [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; });
389     if (it != this->blockingErrors.end())
390     {
391         // Already recorded so just return
392         lg2::debug(
393             "QuiesceOnError set and callout present but entry already logged");
394         return;
395     }
396 
397     lg2::info("QuiesceOnError set and callout present");
398 
399     auto blockPath =
400         std::string(OBJ_LOGGING) + "/block" + std::to_string(entryId);
401     auto blockObj = std::make_unique<Block>(this->busLog, blockPath, entryId);
402     this->blockingErrors.push_back(std::move(blockObj));
403 
404     // Register call back if log is resolved
405     using namespace sdbusplus::bus::match::rules;
406     auto entryPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId);
407     auto callback = std::make_unique<sdbusplus::bus::match::match>(
408         this->busLog,
409         propertiesChanged(entryPath, "xyz.openbmc_project.Logging.Entry"),
410         std::bind(std::mem_fn(&Manager::onEntryResolve), this,
411                   std::placeholders::_1));
412 
413     propChangedEntryCallback.insert(
414         std::make_pair(entryId, std::move(callback)));
415 
416     checkAndQuiesceHost();
417 }
418 
419 void Manager::doExtensionLogCreate(const Entry& entry, const FFDCEntries& ffdc)
420 {
421     // Make the association <endpointpath>/<endpointtype> paths
422     std::vector<std::string> assocs;
423     for (const auto& [forwardType, reverseType, endpoint] :
424          entry.associations())
425     {
426         std::string e{endpoint};
427         e += '/' + reverseType;
428         assocs.push_back(e);
429     }
430 
431     for (auto& create : Extensions::getCreateFunctions())
432     {
433         try
434         {
435             create(entry.message(), entry.id(), entry.timestamp(),
436                    entry.severity(), entry.additionalData(), assocs, ffdc);
437         }
438         catch (std::exception& e)
439         {
440             lg2::error(
441                 "An extension's create function threw an exception: {ERROR}",
442                 "ERROR", e);
443         }
444     }
445 }
446 
447 void Manager::processMetadata(const std::string& /*errorName*/,
448                               const std::vector<std::string>& additionalData,
449                               AssociationList& objects) const
450 {
451     // additionalData is a list of "metadata=value"
452     constexpr auto separator = '=';
453     for (const auto& entryItem : additionalData)
454     {
455         auto found = entryItem.find(separator);
456         if (std::string::npos != found)
457         {
458             auto metadata = entryItem.substr(0, found);
459             auto iter = meta.find(metadata);
460             if (meta.end() != iter)
461             {
462                 (iter->second)(metadata, additionalData, objects);
463             }
464         }
465     }
466 }
467 
468 void Manager::checkAndRemoveBlockingError(uint32_t entryId)
469 {
470     // First look for blocking object and remove
471     auto it = find_if(
472         blockingErrors.begin(), blockingErrors.end(),
473         [&](std::unique_ptr<Block>& obj) { return obj->entryId == entryId; });
474     if (it != blockingErrors.end())
475     {
476         blockingErrors.erase(it);
477     }
478 
479     // Now remove the callback looking for the error to be resolved
480     auto resolveFind = propChangedEntryCallback.find(entryId);
481     if (resolveFind != propChangedEntryCallback.end())
482     {
483         propChangedEntryCallback.erase(resolveFind);
484     }
485 
486     return;
487 }
488 
489 void Manager::erase(uint32_t entryId)
490 {
491     auto entryFound = entries.find(entryId);
492     if (entries.end() != entryFound)
493     {
494         for (auto& func : Extensions::getDeleteProhibitedFunctions())
495         {
496             try
497             {
498                 bool prohibited = false;
499                 func(entryId, prohibited);
500                 if (prohibited)
501                 {
502                     // Future work remains to throw an error here.
503                     return;
504                 }
505             }
506             catch (std::exception& e)
507             {
508                 lg2::error("An extension's deleteProhibited function threw an "
509                            "exception: {ERROR}",
510                            "ERROR", e);
511             }
512         }
513 
514         // Delete the persistent representation of this error.
515         fs::path errorPath(ERRLOG_PERSIST_PATH);
516         errorPath /= std::to_string(entryId);
517         fs::remove(errorPath);
518 
519         auto removeId = [](std::list<uint32_t>& ids, uint32_t id) {
520             auto it = std::find(ids.begin(), ids.end(), id);
521             if (it != ids.end())
522             {
523                 ids.erase(it);
524             }
525         };
526         if (entryFound->second->severity() >= Entry::sevLowerLimit)
527         {
528             removeId(infoErrors, entryId);
529         }
530         else
531         {
532             removeId(realErrors, entryId);
533         }
534         entries.erase(entryFound);
535 
536         checkAndRemoveBlockingError(entryId);
537 
538         for (auto& remove : Extensions::getDeleteFunctions())
539         {
540             try
541             {
542                 remove(entryId);
543             }
544             catch (std::exception& e)
545             {
546                 lg2::error("An extension's delete function threw an exception: "
547                            "{ERROR}",
548                            "ERROR", e);
549             }
550         }
551     }
552     else
553     {
554         lg2::error("Invalid entry ID ({ID}) to delete", "ID", entryId);
555     }
556 }
557 
558 void Manager::restore()
559 {
560     auto sanity = [](const auto& id, const auto& restoredId) {
561         return id == restoredId;
562     };
563     std::vector<uint32_t> errorIds;
564 
565     fs::path dir(ERRLOG_PERSIST_PATH);
566     if (!fs::exists(dir) || fs::is_empty(dir))
567     {
568         return;
569     }
570 
571     for (auto& file : fs::directory_iterator(dir))
572     {
573         auto id = file.path().filename().c_str();
574         auto idNum = std::stol(id);
575         auto e = std::make_unique<Entry>(
576             busLog, std::string(OBJ_ENTRY) + '/' + id, idNum, *this);
577         if (deserialize(file.path(), *e))
578         {
579             // validate the restored error entry id
580             if (sanity(static_cast<uint32_t>(idNum), e->id()))
581             {
582                 e->path(file.path(), true);
583                 e->emit_object_added();
584                 if (e->severity() >= Entry::sevLowerLimit)
585                 {
586                     infoErrors.push_back(idNum);
587                 }
588                 else
589                 {
590                     realErrors.push_back(idNum);
591                 }
592 
593                 entries.insert(std::make_pair(idNum, std::move(e)));
594                 errorIds.push_back(idNum);
595             }
596             else
597             {
598                 lg2::error(
599                     "Failed in sanity check while restoring error entry. "
600                     "Ignoring error entry {ID_NUM}/{ENTRY_ID}.",
601                     "ID_NUM", idNum, "ENTRY_ID", e->id());
602             }
603         }
604     }
605 
606     if (!errorIds.empty())
607     {
608         entryId = *(std::max_element(errorIds.begin(), errorIds.end()));
609     }
610 }
611 
612 void Manager::journalSync()
613 {
614     bool syncRequested = false;
615     auto fd = -1;
616     auto rc = -1;
617     auto wd = -1;
618     auto bus = sdbusplus::bus::new_default();
619 
620     auto start =
621         duration_cast<microseconds>(steady_clock::now().time_since_epoch())
622             .count();
623 
624     // Each time an error log is committed, a request to sync the journal
625     // must occur and block that error log commit until it completes. A 5sec
626     // block is done to allow sufficient time for the journal to be synced.
627     //
628     // Number of loop iterations = 3 for the following reasons:
629     // Iteration #1: Requests a journal sync by killing the journald service.
630     // Iteration #2: Setup an inotify watch to monitor the synced file that
631     //               journald updates with the timestamp the last time the
632     //               journal was flushed.
633     // Iteration #3: Poll to wait until inotify reports an event which blocks
634     //               the error log from being commited until the sync completes.
635     constexpr auto maxRetry = 3;
636     for (int i = 0; i < maxRetry; i++)
637     {
638         // Read timestamp from synced file
639         constexpr auto syncedPath = "/run/systemd/journal/synced";
640         std::ifstream syncedFile(syncedPath);
641         if (syncedFile.fail())
642         {
643             // If the synced file doesn't exist, a sync request will create it.
644             if (errno != ENOENT)
645             {
646                 lg2::error(
647                     "Failed to open journal synced file {FILENAME}: {ERROR}",
648                     "FILENAME", syncedPath, "ERROR", strerror(errno));
649                 return;
650             }
651         }
652         else
653         {
654             // Only read the synced file if it exists.
655             // See if a sync happened by now
656             std::string timestampStr;
657             std::getline(syncedFile, timestampStr);
658             auto timestamp = std::stoll(timestampStr);
659             if (timestamp >= start)
660             {
661                 break;
662             }
663         }
664 
665         // Let's ask for a sync, but only once
666         if (!syncRequested)
667         {
668             syncRequested = true;
669 
670             constexpr auto JOURNAL_UNIT = "systemd-journald.service";
671             auto signal = SIGRTMIN + 1;
672 
673             auto method = bus.new_method_call(SYSTEMD_BUSNAME, SYSTEMD_PATH,
674                                               SYSTEMD_INTERFACE, "KillUnit");
675             method.append(JOURNAL_UNIT, "main", signal);
676             bus.call(method);
677             if (method.is_method_error())
678             {
679                 lg2::error("Failed to kill journal service");
680                 break;
681             }
682 
683             continue;
684         }
685 
686         // Let's install the inotify watch, if we didn't do that yet. This watch
687         // monitors the syncedFile for when journald updates it with a newer
688         // timestamp. This means the journal has been flushed.
689         if (fd < 0)
690         {
691             fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
692             if (fd < 0)
693             {
694                 lg2::error("Failed to create inotify watch: {ERROR}", "ERROR",
695                            strerror(errno));
696                 return;
697             }
698 
699             constexpr auto JOURNAL_RUN_PATH = "/run/systemd/journal";
700             wd = inotify_add_watch(fd, JOURNAL_RUN_PATH,
701                                    IN_MOVED_TO | IN_DONT_FOLLOW | IN_ONLYDIR);
702             if (wd < 0)
703             {
704                 lg2::error("Failed to watch journal directory: {PATH}: {ERROR}",
705                            "PATH", JOURNAL_RUN_PATH, "ERROR", strerror(errno));
706                 close(fd);
707                 return;
708             }
709             continue;
710         }
711 
712         // Let's wait until inotify reports an event
713         struct pollfd fds = {
714             fd,
715             POLLIN,
716             0,
717         };
718         constexpr auto pollTimeout = 5; // 5 seconds
719         rc = poll(&fds, 1, pollTimeout * 1000);
720         if (rc < 0)
721         {
722             lg2::error("Failed to add event: {ERROR}", "ERROR",
723                        strerror(errno));
724             inotify_rm_watch(fd, wd);
725             close(fd);
726             return;
727         }
728         else if (rc == 0)
729         {
730             lg2::info("Poll timeout ({TIMEOUT}), no new journal synced data",
731                       "TIMEOUT", pollTimeout);
732             break;
733         }
734 
735         // Read from the specified file descriptor until there is no new data,
736         // throwing away everything read since the timestamp will be read at the
737         // beginning of the loop.
738         constexpr auto maxBytes = 64;
739         uint8_t buffer[maxBytes];
740         while (read(fd, buffer, maxBytes) > 0)
741             ;
742     }
743 
744     if (fd != -1)
745     {
746         if (wd != -1)
747         {
748             inotify_rm_watch(fd, wd);
749         }
750         close(fd);
751     }
752 
753     return;
754 }
755 
756 std::string Manager::readFWVersion()
757 {
758     auto version = util::getOSReleaseValue("VERSION_ID");
759 
760     if (!version)
761     {
762         lg2::error("Unable to read BMC firmware version");
763     }
764 
765     return version.value_or("");
766 }
767 
768 void Manager::create(const std::string& message, Entry::Level severity,
769                      const std::map<std::string, std::string>& additionalData)
770 {
771     // Convert the map into a vector of "key=value" strings
772     std::vector<std::string> ad;
773     metadata::associations::combine(additionalData, ad);
774 
775     createEntry(message, severity, ad);
776 }
777 
778 void Manager::createWithFFDC(
779     const std::string& message, Entry::Level severity,
780     const std::map<std::string, std::string>& additionalData,
781     const FFDCEntries& ffdc)
782 {
783     // Convert the map into a vector of "key=value" strings
784     std::vector<std::string> ad;
785     metadata::associations::combine(additionalData, ad);
786 
787     createEntry(message, severity, ad, ffdc);
788 }
789 
790 } // namespace internal
791 } // namespace logging
792 } // namespace phosphor
793