// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright 2019 IBM Corporation

#include "host_notifier.hpp"

#include <phosphor-logging/lg2.hpp>

#include <algorithm>
#include <filesystem>
#include <functional>
#include <memory>
#include <utility>

namespace openpower::pels
{

const auto subscriptionName = "PELHostNotifier";
const size_t maxRetryAttempts = 15;

HostNotifier::HostNotifier(Repository& repo, DataInterfaceBase& dataIface,
                           std::unique_ptr<HostInterface> hostIface) :
    _repo(repo), _dataIface(dataIface), _hostIface(std::move(hostIface)),
    _retryTimer(_hostIface->getEvent(),
                std::bind(std::mem_fn(&HostNotifier::retryTimerExpired), this)),
    _hostFullTimer(
        _hostIface->getEvent(),
        std::bind(std::mem_fn(&HostNotifier::hostFullTimerExpired), this)),
    _hostUpTimer(_hostIface->getEvent(),
                 std::bind(std::mem_fn(&HostNotifier::hostUpTimerExpired),
                           this))
{
    // Subscribe to be told about new PELs.
    _repo.subscribeToAdds(subscriptionName,
                          std::bind(std::mem_fn(&HostNotifier::newLogCallback),
                                    this, std::placeholders::_1));

    // Subscribe to be told about deleted PELs.
    _repo.subscribeToDeletes(
        subscriptionName,
        std::bind(std::mem_fn(&HostNotifier::deleteLogCallback), this,
                  std::placeholders::_1));

    // Add any existing PELs to the queue to send them if necessary.
    _repo.for_each(std::bind(std::mem_fn(&HostNotifier::addPELToQueue), this,
                             std::placeholders::_1));

    // Subscribe to be told about host state changes.
    _dataIface.subscribeToHostStateChange(
        subscriptionName, std::bind(std::mem_fn(&HostNotifier::hostStateChange),
                                    this, std::placeholders::_1));

    // Set the function to call when the async response is received.
    _hostIface->setResponseFunction(
        std::bind(std::mem_fn(&HostNotifier::commandResponse), this,
                  std::placeholders::_1));

    // Start sending logs if the host is running.
    if (!_pelQueue.empty() && _dataIface.isHostUp())
    {
        lg2::debug("Host is already up at startup");
        _hostUpTimer.restartOnce(_hostIface->getHostUpDelay());
    }
}

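// Unsubscribe from the repository and host state change notifications.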
HostNotifier::~HostNotifier()
{
    _repo.unsubscribeFromAdds(subscriptionName);
    _dataIface.unsubscribeFromHostStateChange(subscriptionName);
}

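// The host has been up long enough that it can now be sent PELs.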
void HostNotifier::hostUpTimerExpired()
{
    lg2::debug("Host up timer expired");
    doNewLogNotify();
}

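// Repository::for_each callback used at startup to queue up any
// existing PELs that still need to be sent to the host.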
bool HostNotifier::addPELToQueue(const PEL& pel)
{
    if (enqueueRequired(pel.id()))
    {
        _pelQueue.push_back(pel.id());
    }

    // Return false so that Repo::for_each keeps going.
    return false;
}

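// Says if a PEL needs to be queued for sending to the host, based on
// its host/HMC transmission states and its action flags.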
bool HostNotifier::enqueueRequired(uint32_t id) const
{
    bool required = true;
    Repository::LogID i{Repository::LogID::Pel{id}};

    // Manufacturing testing may turn off sending PELs up to the host.
    if (!_dataIface.getHostPELEnablement())
    {
        return false;
    }

    if (auto attributes = _repo.getPELAttributes(i); attributes)
    {
        auto a = attributes.value().get();

        if ((a.hostState == TransmissionState::acked) ||
            (a.hostState == TransmissionState::badPEL))
        {
            required = false;
        }
        else if (a.actionFlags.test(hiddenFlagBit) &&
                 (a.hmcState == TransmissionState::acked))
        {
            required = false;
        }
        else if (a.actionFlags.test(dontReportToHostFlagBit))
        {
            required = false;
        }
    }
    else
    {
        lg2::error("Host Enqueue: Unable to find PEL ID {ID} in repository",
                   "ID", lg2::hex, id);
        required = false;
    }

    return required;
}

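// Final check made right before sending a PEL, in case its state
// changed while it was sitting in the queue.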
bool HostNotifier::notifyRequired(uint32_t id) const
{
    bool notify = true;
    Repository::LogID i{Repository::LogID::Pel{id}};

    if (auto attributes = _repo.getPELAttributes(i); attributes)
    {
        // If already acked by the host, don't send again.
        // (A safety check as it shouldn't get to this point.)
        auto a = attributes.value().get();
        if (a.hostState == TransmissionState::acked)
        {
            notify = false;
        }
        else if (a.actionFlags.test(hiddenFlagBit))
        {
            // If hidden and acked (or will be acked) by the HMC,
            // also don't send it.  (HMC management can come and
            // go at any time.)
            if ((a.hmcState == TransmissionState::acked) ||
                _dataIface.isHMCManaged())
            {
                notify = false;
            }
        }
    }
    else
    {
        // Must have been deleted since it was put on the queue.
        notify = false;
    }

    return notify;
}

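// Called by the repository each time a new PEL is added.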
void HostNotifier::newLogCallback(const PEL& pel)
{
    if (!enqueueRequired(pel.id()))
    {
        return;
    }

    lg2::debug("New PEL added to queue, PEL ID = {ID}", "ID", lg2::hex,
               pel.id());

    _pelQueue.push_back(pel.id());

    // Don't notify if the host is down, hasn't been up long enough, or
    // is full.
    if (!_dataIface.isHostUp() || _hostFull || _hostUpTimer.isEnabled())
    {
        return;
    }

    // Dispatch a command now if there isn't currently a command
    // in progress and this is the first log in the queue or it
    // previously gave up from a hard failure.
    auto inProgress = (_inProgressPEL != 0) || _hostIface->cmdInProgress() ||
                      _retryTimer.isEnabled();

    auto firstPEL = _pelQueue.size() == 1;
    auto gaveUp = _retryCount >= maxRetryAttempts;

    if (!inProgress && (firstPEL || gaveUp))
    {
        _retryCount = 0;

        // Send a log, but from the event loop, not from here.
        scheduleDispatch();
    }
}

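// Called by the repository each time a PEL is deleted.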
void HostNotifier::deleteLogCallback(uint32_t id)
{
    auto queueIt = std::find(_pelQueue.begin(), _pelQueue.end(), id);
    if (queueIt != _pelQueue.end())
    {
        lg2::debug("Host notifier removing deleted log from queue");
        _pelQueue.erase(queueIt);
    }

    auto sentIt = std::find(_sentPELs.begin(), _sentPELs.end(), id);
    if (sentIt != _sentPELs.end())
    {
        lg2::debug("Host notifier removing deleted log from sent list");
        _sentPELs.erase(sentIt);
    }

    // Nothing we can do about this...
    if (id == _inProgressPEL)
    {
        lg2::warning(
            "A PEL was deleted while its host notification was in progress, PEL ID = {ID}",
            "ID", lg2::hex, id);
    }
}

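// Schedules doNewLogNotify() to run from the event loop instead of
// in the current call stack.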
void HostNotifier::scheduleDispatch()
{
    _dispatcher = std::make_unique<sdeventplus::source::Defer>(
        _hostIface->getEvent(), std::bind(std::mem_fn(&HostNotifier::dispatch),
                                          this, std::placeholders::_1));
}

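// The deferred event callback that kicks off the send.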
void HostNotifier::dispatch(sdeventplus::source::EventBase& /*source*/)
{
    _dispatcher.reset();

    doNewLogNotify();
}

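// Sends the new log command to the host for the next PEL in the
// queue that still needs to be sent.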
void HostNotifier::doNewLogNotify()
{
    if (!_dataIface.isHostUp() || _retryTimer.isEnabled() ||
        _hostFullTimer.isEnabled())
    {
        return;
    }

    if (_retryCount >= maxRetryAttempts)
    {
        // Give up until a new log comes in.
        if (_retryCount == maxRetryAttempts)
        {
            // If this were to really happen, the PLDM interface
            // would be down, and isolating that shouldn't be left to
            // a logging daemon, so just trace.  Also, this will start
            // trying again when the next new log comes in.
            lg2::error(
                "PEL Host notifier hit max retry attempts. Giving up for now. PEL ID = {ID}",
                "ID", lg2::hex, _pelQueue.front());

            // Tell the host interface object to clean itself up, especially to
            // release the PLDM instance ID it's been using.
            _hostIface->cancelCmd();
        }
        return;
    }

    bool doNotify = false;
    uint32_t id = 0;

    // Find the PEL to send
    while (!doNotify && !_pelQueue.empty())
    {
        id = _pelQueue.front();
        _pelQueue.pop_front();

        if (notifyRequired(id))
        {
            doNotify = true;
        }
    }

    if (doNotify)
    {
        // Get the size using the repo attributes
        Repository::LogID i{Repository::LogID::Pel{id}};
        if (auto attributes = _repo.getPELAttributes(i); attributes)
        {
            auto size = static_cast<size_t>(
                std::filesystem::file_size((*attributes).get().path));

            lg2::debug("sendNewLogCmd: ID {ID} size {SIZE}", "ID", lg2::hex, id,
                       "SIZE", size);

            auto rc = _hostIface->sendNewLogCmd(id, size);

            if (rc == CmdStatus::success)
            {
                _inProgressPEL = id;
            }
            else
            {
                // It failed.  Retry
                lg2::error("PLDM send failed, PEL ID = {ID}", "ID", lg2::hex,
                           id);
                _pelQueue.push_front(id);
                _inProgressPEL = 0;
                _retryTimer.restartOnce(_hostIface->getSendRetryDelay());
            }
        }
        else
        {
            lg2::error(
                "PEL ID is not in repository. Cannot notify host. PEL ID = {ID}",
                "ID", lg2::hex, id);
        }
    }
}

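// Called on host state changes.  On a power on, starts the host up
// timer so sending can begin; on a power off, stops any command in
// progress and requeues PELs that were sent but never acked.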
void HostNotifier::hostStateChange(bool hostUp)
{
    _retryCount = 0;
    _hostFull = false;

    if (hostUp && !_pelQueue.empty())
    {
        lg2::debug("Host state change to on");
        _hostUpTimer.restartOnce(_hostIface->getHostUpDelay());
    }
    else if (!hostUp)
    {
        lg2::debug("Host state change to off");

        stopCommand();

        // Reset the state of any PELs that were sent but not acked
        // back to new so they'll get sent again.
        for (auto id : _sentPELs)
        {
            _pelQueue.push_back(id);
            _repo.setPELHostTransState(id, TransmissionState::newPEL);
        }

        _sentPELs.clear();

        if (_hostFullTimer.isEnabled())
        {
            _hostFullTimer.setEnabled(false);
        }

        if (_hostUpTimer.isEnabled())
        {
            _hostUpTimer.setEnabled(false);
        }
    }
}

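// Handles the async response to the new log command.  On success the
// PEL is marked sent; on failure it is requeued and retried later.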
void HostNotifier::commandResponse(ResponseStatus status)
{
    auto id = _inProgressPEL;
    _inProgressPEL = 0;

    if (status == ResponseStatus::success)
    {
        lg2::debug("HostNotifier command response success, PEL ID = {ID}", "ID",
                   lg2::hex, id);
        _retryCount = 0;

        _sentPELs.push_back(id);

        _repo.setPELHostTransState(id, TransmissionState::sent);

        // If the host is full, don't send off the next PEL
        if (!_hostFull && !_pelQueue.empty())
        {
            doNewLogNotify();
        }
    }
    else
    {
        lg2::error("PLDM command response failure, PEL ID = {ID}", "ID",
                   lg2::hex, id);
        // Retry
        _pelQueue.push_front(id);
        _retryTimer.restartOnce(_hostIface->getReceiveRetryDelay());
    }
}

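// Retries the send that previously failed, as long as the host is
// still up.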
void HostNotifier::retryTimerExpired()
{
    if (_dataIface.isHostUp())
    {
        lg2::info("Attempting command retry, PEL ID = {ID}", "ID", lg2::hex,
                  _pelQueue.front());
        _retryCount++;
        doNewLogNotify();
    }
}

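// Called when the host full timer expires to attempt another send.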
void HostNotifier::hostFullTimerExpired()
{
    lg2::debug("Host full timer expired, trying send again");
    doNewLogNotify();
}

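// Stops any command in progress, requeues its PEL, and resets the
// retry state.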
void HostNotifier::stopCommand()
{
    _retryCount = 0;

    if (_inProgressPEL != 0)
    {
        _pelQueue.push_front(_inProgressPEL);
        _inProgressPEL = 0;
    }

    if (_retryTimer.isEnabled())
    {
        _retryTimer.setEnabled(false);
    }

    // Ensure the PLDM instance ID is released
    _hostIface->cancelCmd();
}

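// Called when the host acknowledges a PEL.  Also clears the host
// full condition, since an ack means the host is no longer full.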
void HostNotifier::ackPEL(uint32_t id)
{
    _repo.setPELHostTransState(id, TransmissionState::acked);

    // No longer just 'sent', so remove it from the sent list.
    auto sent = std::find(_sentPELs.begin(), _sentPELs.end(), id);
    if (sent != _sentPELs.end())
    {
        _sentPELs.erase(sent);
    }

    // An ack means the host is no longer full
    if (_hostFullTimer.isEnabled())
    {
        _hostFullTimer.setEnabled(false);
    }

    if (_hostFull)
    {
        _hostFull = false;

        lg2::debug("Host previously full, not anymore after this ack");

        // Start sending PELs again, from the event loop
        if (!_pelQueue.empty())
        {
            scheduleDispatch();
        }
    }
}

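// Called when the host reports it is full and cannot accept the PEL
// just sent.  The PEL is requeued and the host full timer started.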
void HostNotifier::setHostFull(uint32_t id)
{
    lg2::debug("Received Host full indication, PEL ID = {ID}", "ID", lg2::hex,
               id);

    _hostFull = true;

    // This PEL needs to get re-sent
    auto sent = std::find(_sentPELs.begin(), _sentPELs.end(), id);
    if (sent != _sentPELs.end())
    {
        _sentPELs.erase(sent);
        _repo.setPELHostTransState(id, TransmissionState::newPEL);

        if (std::find(_pelQueue.begin(), _pelQueue.end(), id) ==
            _pelQueue.end())
        {
            _pelQueue.push_front(id);
        }
    }

    // The only PELs that will be sent while the host is full
    // are those sent from this timer's callback.
    if (!_hostFullTimer.isEnabled())
    {
        lg2::debug("Starting host full timer");
        _hostFullTimer.restartOnce(_hostIface->getHostFullRetryDelay());
    }
}

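// Called when the host rejects a PEL.  It is marked as bad so it
// won't be sent to the host again.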
void HostNotifier::setBadPEL(uint32_t id)
{
    lg2::error("PEL rejected by the host, PEL ID = {ID}", "ID", lg2::hex, id);

    auto sent = std::find(_sentPELs.begin(), _sentPELs.end(), id);
    if (sent != _sentPELs.end())
    {
        _sentPELs.erase(sent);
    }

    _repo.setPELHostTransState(id, TransmissionState::badPEL);
}

} // namespace openpower::pels