xref: /openbmc/phosphor-logging/extensions/openpower-pels/host_notifier.cpp (revision f2131442a3dd9ccb44aad106aa6f4c14e3c051ba)
1 /**
2  * Copyright © 2019 IBM Corporation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "host_notifier.hpp"
17 
18 #include <phosphor-logging/log.hpp>
19 
20 namespace openpower::pels
21 {
22 
23 const auto subscriptionName = "PELHostNotifier";
24 const size_t maxRetryAttempts = 15;
25 
26 using namespace phosphor::logging;
27 
28 HostNotifier::HostNotifier(Repository& repo, DataInterfaceBase& dataIface,
29                            std::unique_ptr<HostInterface> hostIface) :
30     _repo(repo),
31     _dataIface(dataIface), _hostIface(std::move(hostIface)),
32     _retryTimer(_hostIface->getEvent(),
33                 std::bind(std::mem_fn(&HostNotifier::retryTimerExpired), this)),
34     _hostFullTimer(
35         _hostIface->getEvent(),
36         std::bind(std::mem_fn(&HostNotifier::hostFullTimerExpired), this)),
37     _hostUpTimer(
38         _hostIface->getEvent(),
39         std::bind(std::mem_fn(&HostNotifier::hostUpTimerExpired), this))
40 {
41     // Subscribe to be told about new PELs.
42     _repo.subscribeToAdds(subscriptionName,
43                           std::bind(std::mem_fn(&HostNotifier::newLogCallback),
44                                     this, std::placeholders::_1));
45 
46     // Subscribe to be told about deleted PELs.
47     _repo.subscribeToDeletes(
48         subscriptionName,
49         std::bind(std::mem_fn(&HostNotifier::deleteLogCallback), this,
50                   std::placeholders::_1));
51 
52     // Add any existing PELs to the queue to send them if necessary.
53     _repo.for_each(std::bind(std::mem_fn(&HostNotifier::addPELToQueue), this,
54                              std::placeholders::_1));
55 
56     // Subscribe to be told about host state changes.
57     _dataIface.subscribeToHostStateChange(
58         subscriptionName, std::bind(std::mem_fn(&HostNotifier::hostStateChange),
59                                     this, std::placeholders::_1));
60 
61     // Set the function to call when the async reponse is received.
62     _hostIface->setResponseFunction(
63         std::bind(std::mem_fn(&HostNotifier::commandResponse), this,
64                   std::placeholders::_1));
65 
66     // Start sending logs if the host is running
67     if (!_pelQueue.empty() && _dataIface.isHostUp())
68     {
69         log<level::DEBUG>("Host is already up at startup");
70         _hostUpTimer.restartOnce(_hostIface->getHostUpDelay());
71     }
72 }
73 
74 HostNotifier::~HostNotifier()
75 {
76     _repo.unsubscribeFromAdds(subscriptionName);
77     _dataIface.unsubscribeFromHostStateChange(subscriptionName);
78 }
79 
80 void HostNotifier::hostUpTimerExpired()
81 {
82     log<level::DEBUG>("Host up timer expired");
83     doNewLogNotify();
84 }
85 
86 bool HostNotifier::addPELToQueue(const PEL& pel)
87 {
88     if (enqueueRequired(pel.id()))
89     {
90         _pelQueue.push_back(pel.id());
91     }
92 
93     // Return false so that Repo::for_each keeps going.
94     return false;
95 }
96 
97 bool HostNotifier::enqueueRequired(uint32_t id) const
98 {
99     bool required = true;
100     Repository::LogID i{Repository::LogID::Pel{id}};
101 
102     // Manufacturing testing may turn off sending up PELs
103     if (!_dataIface.getHostPELEnablement())
104     {
105         return false;
106     }
107 
108     if (auto attributes = _repo.getPELAttributes(i); attributes)
109     {
110         auto a = attributes.value().get();
111 
112         if ((a.hostState == TransmissionState::acked) ||
113             (a.hostState == TransmissionState::badPEL))
114         {
115             required = false;
116         }
117         else if (a.actionFlags.test(hiddenFlagBit) &&
118                  (a.hmcState == TransmissionState::acked))
119         {
120             required = false;
121         }
122         else if (a.actionFlags.test(dontReportToHostFlagBit))
123         {
124             required = false;
125         }
126     }
127     else
128     {
129         using namespace phosphor::logging;
130         log<level::ERR>("Host Enqueue: Unable to find PEL ID in repository",
131                         entry("PEL_ID=0x%X", id));
132         required = false;
133     }
134 
135     return required;
136 }
137 
138 bool HostNotifier::notifyRequired(uint32_t id) const
139 {
140     bool notify = true;
141     Repository::LogID i{Repository::LogID::Pel{id}};
142 
143     if (auto attributes = _repo.getPELAttributes(i); attributes)
144     {
145         // If already acked by the host, don't send again.
146         // (A safety check as it shouldn't get to this point.)
147         auto a = attributes.value().get();
148         if (a.hostState == TransmissionState::acked)
149         {
150             notify = false;
151         }
152         else if (a.actionFlags.test(hiddenFlagBit))
153         {
154             // If hidden and acked (or will be) acked by the HMC,
155             // also don't send it. (HMC management can come and
156             // go at any time)
157             if ((a.hmcState == TransmissionState::acked) ||
158                 _dataIface.isHMCManaged())
159             {
160                 notify = false;
161             }
162         }
163     }
164     else
165     {
166         // Must have been deleted since put on the queue.
167         notify = false;
168     }
169 
170     return notify;
171 }
172 
173 void HostNotifier::newLogCallback(const PEL& pel)
174 {
175     if (!enqueueRequired(pel.id()))
176     {
177         return;
178     }
179 
180     log<level::DEBUG>("new PEL added to queue", entry("PEL_ID=0x%X", pel.id()));
181 
182     _pelQueue.push_back(pel.id());
183 
184     // Notify shouldn't happen if host is down, not up long enough, or full
185     if (!_dataIface.isHostUp() || _hostFull || _hostUpTimer.isEnabled())
186     {
187         return;
188     }
189 
190     // Dispatch a command now if there isn't currently a command
191     // in progress and this is the first log in the queue or it
192     // previously gave up from a hard failure.
193     auto inProgress = (_inProgressPEL != 0) || _hostIface->cmdInProgress() ||
194                       _retryTimer.isEnabled();
195 
196     auto firstPEL = _pelQueue.size() == 1;
197     auto gaveUp = _retryCount >= maxRetryAttempts;
198 
199     if (!inProgress && (firstPEL || gaveUp))
200     {
201         _retryCount = 0;
202 
203         // Send a log, but from the event loop, not from here.
204         scheduleDispatch();
205     }
206 }
207 
208 void HostNotifier::deleteLogCallback(uint32_t id)
209 {
210     auto queueIt = std::find(_pelQueue.begin(), _pelQueue.end(), id);
211     if (queueIt != _pelQueue.end())
212     {
213         log<level::DEBUG>("Host notifier removing deleted log from queue");
214         _pelQueue.erase(queueIt);
215     }
216 
217     auto sentIt = std::find(_sentPELs.begin(), _sentPELs.end(), id);
218     if (sentIt != _sentPELs.end())
219     {
220         log<level::DEBUG>("Host notifier removing deleted log from sent list");
221         _sentPELs.erase(sentIt);
222     }
223 
224     // Nothing we can do about this...
225     if (id == _inProgressPEL)
226     {
227         log<level::WARNING>(
228             "A PEL was deleted while its host notification was in progress",
229             entry("PEL_ID=0x%X", id));
230     }
231 }
232 
233 void HostNotifier::scheduleDispatch()
234 {
235     _dispatcher = std::make_unique<sdeventplus::source::Defer>(
236         _hostIface->getEvent(), std::bind(std::mem_fn(&HostNotifier::dispatch),
237                                           this, std::placeholders::_1));
238 }
239 
240 void HostNotifier::dispatch(sdeventplus::source::EventBase& /*source*/)
241 {
242     _dispatcher.reset();
243 
244     doNewLogNotify();
245 }
246 
247 void HostNotifier::doNewLogNotify()
248 {
249     if (!_dataIface.isHostUp() || _retryTimer.isEnabled() ||
250         _hostFullTimer.isEnabled())
251     {
252         return;
253     }
254 
255     if (_retryCount >= maxRetryAttempts)
256     {
257         // Give up until a new log comes in.
258         if (_retryCount == maxRetryAttempts)
259         {
260             // If this were to really happen, the PLDM interface
261             // would be down and isolating that shouldn't left to
262             // a logging daemon, so just trace.  Also, this will start
263             // trying again when the next new log comes in.
264             log<level::ERR>(
265                 "PEL Host notifier hit max retry attempts. Giving up for now.",
266                 entry("PEL_ID=0x%X", _pelQueue.front()));
267 
268             // Tell the host interface object to clean itself up, especially to
269             // release the PLDM instance ID it's been using.
270             _hostIface->cancelCmd();
271         }
272         return;
273     }
274 
275     bool doNotify = false;
276     uint32_t id = 0;
277 
278     // Find the PEL to send
279     while (!doNotify && !_pelQueue.empty())
280     {
281         id = _pelQueue.front();
282         _pelQueue.pop_front();
283 
284         if (notifyRequired(id))
285         {
286             doNotify = true;
287         }
288     }
289 
290     if (doNotify)
291     {
292         // Get the size using the repo attributes
293         Repository::LogID i{Repository::LogID::Pel{id}};
294         if (auto attributes = _repo.getPELAttributes(i); attributes)
295         {
296             auto size = static_cast<size_t>(
297                 std::filesystem::file_size((*attributes).get().path));
298 
299             log<level::DEBUG>("sendNewLogCmd", entry("PEL_ID=0x%X", id),
300                               entry("PEL_SIZE=%d", size));
301 
302             auto rc = _hostIface->sendNewLogCmd(id, size);
303 
304             if (rc == CmdStatus::success)
305             {
306                 _inProgressPEL = id;
307             }
308             else
309             {
310                 // It failed.  Retry
311                 log<level::ERR>("PLDM send failed", entry("PEL_ID=0x%X", id));
312                 _pelQueue.push_front(id);
313                 _inProgressPEL = 0;
314                 _retryTimer.restartOnce(_hostIface->getSendRetryDelay());
315             }
316         }
317         else
318         {
319             log<level::ERR>("PEL ID not in repository.  Cannot notify host",
320                             entry("PEL_ID=0x%X", id));
321         }
322     }
323 }
324 
325 void HostNotifier::hostStateChange(bool hostUp)
326 {
327     _retryCount = 0;
328     _hostFull = false;
329 
330     if (hostUp && !_pelQueue.empty())
331     {
332         log<level::DEBUG>("Host state change to on");
333         _hostUpTimer.restartOnce(_hostIface->getHostUpDelay());
334     }
335     else if (!hostUp)
336     {
337         log<level::DEBUG>("Host state change to off");
338 
339         stopCommand();
340 
341         // Reset the state on any PELs that were sent but not acked back
342         // to new so they'll get sent again.
343         for (auto id : _sentPELs)
344         {
345             _pelQueue.push_back(id);
346             _repo.setPELHostTransState(id, TransmissionState::newPEL);
347         }
348 
349         _sentPELs.clear();
350 
351         if (_hostFullTimer.isEnabled())
352         {
353             _hostFullTimer.setEnabled(false);
354         }
355 
356         if (_hostUpTimer.isEnabled())
357         {
358             _hostUpTimer.setEnabled(false);
359         }
360     }
361 }
362 
363 void HostNotifier::commandResponse(ResponseStatus status)
364 {
365     auto id = _inProgressPEL;
366     _inProgressPEL = 0;
367 
368     if (status == ResponseStatus::success)
369     {
370         log<level::DEBUG>("HostNotifier command response success",
371                           entry("PEL_ID=0x%X", id));
372         _retryCount = 0;
373 
374         _sentPELs.push_back(id);
375 
376         _repo.setPELHostTransState(id, TransmissionState::sent);
377 
378         // If the host is full, don't send off the next PEL
379         if (!_hostFull && !_pelQueue.empty())
380         {
381             doNewLogNotify();
382         }
383     }
384     else
385     {
386         log<level::ERR>("PLDM command response failure",
387                         entry("PEL_ID=0x%X", id));
388         // Retry
389         _pelQueue.push_front(id);
390         _retryTimer.restartOnce(_hostIface->getReceiveRetryDelay());
391     }
392 }
393 
394 void HostNotifier::retryTimerExpired()
395 {
396     if (_dataIface.isHostUp())
397     {
398         log<level::INFO>("Attempting command retry",
399                          entry("PEL_ID=0x%X", _pelQueue.front()));
400         _retryCount++;
401         doNewLogNotify();
402     }
403 }
404 
405 void HostNotifier::hostFullTimerExpired()
406 {
407     log<level::DEBUG>("Host full timer expired, trying send again");
408     doNewLogNotify();
409 }
410 
411 void HostNotifier::stopCommand()
412 {
413     _retryCount = 0;
414 
415     if (_inProgressPEL != 0)
416     {
417         _pelQueue.push_front(_inProgressPEL);
418         _inProgressPEL = 0;
419     }
420 
421     if (_retryTimer.isEnabled())
422     {
423         _retryTimer.setEnabled(false);
424     }
425 
426     // Ensure the PLDM instance ID is released
427     _hostIface->cancelCmd();
428 }
429 
430 void HostNotifier::ackPEL(uint32_t id)
431 {
432     _repo.setPELHostTransState(id, TransmissionState::acked);
433 
434     // No longer just 'sent', so remove it from the sent list.
435     auto sent = std::find(_sentPELs.begin(), _sentPELs.end(), id);
436     if (sent != _sentPELs.end())
437     {
438         _sentPELs.erase(sent);
439     }
440 
441     // An ack means the host is no longer full
442     if (_hostFullTimer.isEnabled())
443     {
444         _hostFullTimer.setEnabled(false);
445     }
446 
447     if (_hostFull)
448     {
449         _hostFull = false;
450 
451         log<level::DEBUG>("Host previously full, not anymore after this ack");
452 
453         // Start sending PELs again, from the event loop
454         if (!_pelQueue.empty())
455         {
456             scheduleDispatch();
457         }
458     }
459 }
460 
461 void HostNotifier::setHostFull(uint32_t id)
462 {
463     log<level::INFO>("Received Host full indication", entry("PEL_ID=0x%X", id));
464 
465     _hostFull = true;
466 
467     // This PEL needs to get re-sent
468     auto sent = std::find(_sentPELs.begin(), _sentPELs.end(), id);
469     if (sent != _sentPELs.end())
470     {
471         _sentPELs.erase(sent);
472         _repo.setPELHostTransState(id, TransmissionState::newPEL);
473 
474         if (std::find(_pelQueue.begin(), _pelQueue.end(), id) ==
475             _pelQueue.end())
476         {
477             _pelQueue.push_front(id);
478         }
479     }
480 
481     // The only PELs that will be sent when the
482     // host is full is from this timer callback.
483     if (!_hostFullTimer.isEnabled())
484     {
485         log<level::DEBUG>("Starting host full timer");
486         _hostFullTimer.restartOnce(_hostIface->getHostFullRetryDelay());
487     }
488 }
489 
490 void HostNotifier::setBadPEL(uint32_t id)
491 {
492     log<level::ERR>("PEL rejected by the host", entry("PEL_ID=0x%X", id));
493 
494     auto sent = std::find(_sentPELs.begin(), _sentPELs.end(), id);
495     if (sent != _sentPELs.end())
496     {
497         _sentPELs.erase(sent);
498     }
499 
500     _repo.setPELHostTransState(id, TransmissionState::badPEL);
501 }
502 
503 } // namespace openpower::pels
504