extensions/openpower-pels/host_notifier.cpp

/**
 * Copyright © 2019 IBM Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "host_notifier.hpp"

#include <phosphor-logging/log.hpp>

namespace openpower::pels
{

const auto subscriptionName = "PELHostNotifier";
const size_t maxRetryAttempts = 15;

using namespace phosphor::logging;

HostNotifier::HostNotifier(Repository& repo, DataInterfaceBase& dataIface,
                           std::unique_ptr<HostInterface> hostIface) :
    _repo(repo),
    _dataIface(dataIface), _hostIface(std::move(hostIface)),
    _retryTimer(_hostIface->getEvent(),
                std::bind(std::mem_fn(&HostNotifier::retryTimerExpired), this)),
    _hostFullTimer(
        _hostIface->getEvent(),
        std::bind(std::mem_fn(&HostNotifier::hostFullTimerExpired), this)),
    _hostUpTimer(
        _hostIface->getEvent(),
        std::bind(std::mem_fn(&HostNotifier::hostUpTimerExpired), this))
{
    // Subscribe to be told about new PELs.
    _repo.subscribeToAdds(subscriptionName,
                          std::bind(std::mem_fn(&HostNotifier::newLogCallback),
                                    this, std::placeholders::_1));

    // Subscribe to be told about deleted PELs.
    _repo.subscribeToDeletes(
        subscriptionName,
        std::bind(std::mem_fn(&HostNotifier::deleteLogCallback), this,
                  std::placeholders::_1));

    // Add any existing PELs to the queue to send them if necessary.
    _repo.for_each(std::bind(std::mem_fn(&HostNotifier::addPELToQueue), this,
                             std::placeholders::_1));

    // Subscribe to be told about host state changes.
    _dataIface.subscribeToHostStateChange(
        subscriptionName, std::bind(std::mem_fn(&HostNotifier::hostStateChange),
                                    this, std::placeholders::_1));

    // Set the function to call when the async reponse is received.
    _hostIface->setResponseFunction(
        std::bind(std::mem_fn(&HostNotifier::commandResponse), this,
                  std::placeholders::_1));

    // Start sending logs if the host is running
    if (!_pelQueue.empty() && _dataIface.isHostUp())
    {
        log<level::DEBUG>("Host is already up at startup");
        _hostUpTimer.restartOnce(_hostIface->getHostUpDelay());
    }
}

HostNotifier::~HostNotifier()
{
    _repo.unsubscribeFromAdds(subscriptionName);
    _dataIface.unsubscribeFromHostStateChange(subscriptionName);
}

void HostNotifier::hostUpTimerExpired()
{
    log<level::DEBUG>("Host up timer expired");
    doNewLogNotify();
}

bool HostNotifier::addPELToQueue(const PEL& pel)
{
    if (enqueueRequired(pel.id()))
    {
        _pelQueue.push_back(pel.id());
    }

    // Return false so that Repo::for_each keeps going.
    return false;
}

bool HostNotifier::enqueueRequired(uint32_t id) const
{
    bool required = true;
    Repository::LogID i{Repository::LogID::Pel{id}};

    // Manufacturing testing may turn off sending up PELs
    if (!_dataIface.getHostPELEnablement())
    {
        return false;
    }

    if (auto attributes = _repo.getPELAttributes(i); attributes)
    {
        auto a = attributes.value().get();

        if ((a.hostState == TransmissionState::acked) ||
            (a.hostState == TransmissionState::badPEL))
        {
            required = false;
        }
        else if (a.actionFlags.test(hiddenFlagBit) &&
                 (a.hmcState == TransmissionState::acked))
        {
            required = false;
        }
        else if (a.actionFlags.test(dontReportToHostFlagBit))
        {
            required = false;
        }
    }
    else
    {
        using namespace phosphor::logging;
        log<level::ERR>("Host Enqueue: Unable to find PEL ID in repository",
                        entry("PEL_ID=0x%X", id));
        required = false;
    }

    return required;
}

bool HostNotifier::notifyRequired(uint32_t id) const
{
    bool notify = true;
    Repository::LogID i{Repository::LogID::Pel{id}};

    if (auto attributes = _repo.getPELAttributes(i); attributes)
    {
        // If already acked by the host, don't send again.
        // (A safety check as it shouldn't get to this point.)
        auto a = attributes.value().get();
        if (a.hostState == TransmissionState::acked)
        {
            notify = false;
        }
        else if (a.actionFlags.test(hiddenFlagBit))
        {
            // If hidden and acked (or will be) acked by the HMC,
            // also don't send it. (HMC management can come and
            // go at any time)
            if ((a.hmcState == TransmissionState::acked) ||
                _dataIface.isHMCManaged())
            {
                notify = false;
            }
        }
    }
    else
    {
        // Must have been deleted since put on the queue.
        notify = false;
    }

    return notify;
}

void HostNotifier::newLogCallback(const PEL& pel)
{
    if (!enqueueRequired(pel.id()))
    {
        return;
    }

    log<level::DEBUG>("new PEL added to queue", entry("PEL_ID=0x%X", pel.id()));

    _pelQueue.push_back(pel.id());

    // Notify shouldn't happen if host is down, not up long enough, or full
    if (!_dataIface.isHostUp() || _hostFull || _hostUpTimer.isEnabled())
    {
        return;
    }

    // Dispatch a command now if there isn't currently a command
    // in progress and this is the first log in the queue or it
    // previously gave up from a hard failure.
    auto inProgress = (_inProgressPEL != 0) || _hostIface->cmdInProgress() ||
                      _retryTimer.isEnabled();

    auto firstPEL = _pelQueue.size() == 1;
    auto gaveUp = _retryCount >= maxRetryAttempts;

    if (!inProgress && (firstPEL || gaveUp))
    {
        _retryCount = 0;

        // Send a log, but from the event loop, not from here.
        scheduleDispatch();
    }
}

void HostNotifier::deleteLogCallback(uint32_t id)
{
    auto queueIt = std::find(_pelQueue.begin(), _pelQueue.end(), id);
    if (queueIt != _pelQueue.end())
    {
        log<level::DEBUG>("Host notifier removing deleted log from queue");
        _pelQueue.erase(queueIt);
    }

    auto sentIt = std::find(_sentPELs.begin(), _sentPELs.end(), id);
    if (sentIt != _sentPELs.end())
    {
        log<level::DEBUG>("Host notifier removing deleted log from sent list");
        _sentPELs.erase(sentIt);
    }

    // Nothing we can do about this...
    if (id == _inProgressPEL)
    {
        log<level::WARNING>(
            "A PEL was deleted while its host notification was in progress",
            entry("PEL_ID=0x%X", id));
    }
}

void HostNotifier::scheduleDispatch()
{
    _dispatcher = std::make_unique<sdeventplus::source::Defer>(
        _hostIface->getEvent(), std::bind(std::mem_fn(&HostNotifier::dispatch),
                                          this, std::placeholders::_1));
}

void HostNotifier::dispatch(sdeventplus::source::EventBase& /*source*/)
{
    _dispatcher.reset();

    doNewLogNotify();
}

void HostNotifier::doNewLogNotify()
{
    if (!_dataIface.isHostUp() || _retryTimer.isEnabled() ||
        _hostFullTimer.isEnabled())
    {
        return;
    }

    if (_retryCount >= maxRetryAttempts)
    {
        // Give up until a new log comes in.
        if (_retryCount == maxRetryAttempts)
        {
            // If this were to really happen, the PLDM interface
            // would be down and isolating that shouldn't left to
            // a logging daemon, so just trace.  Also, this will start
            // trying again when the next new log comes in.
            log<level::ERR>(
                "PEL Host notifier hit max retry attempts. Giving up for now.",
                entry("PEL_ID=0x%X", _pelQueue.front()));

            // Tell the host interface object to clean itself up, especially to
            // release the PLDM instance ID it's been using.
            _hostIface->cancelCmd();
        }
        return;
    }

    bool doNotify = false;
    uint32_t id = 0;

    // Find the PEL to send
    while (!doNotify && !_pelQueue.empty())
    {
        id = _pelQueue.front();
        _pelQueue.pop_front();

        if (notifyRequired(id))
        {
            doNotify = true;
        }
    }

    if (doNotify)
    {
        // Get the size using the repo attributes
        Repository::LogID i{Repository::LogID::Pel{id}};
        if (auto attributes = _repo.getPELAttributes(i); attributes)
        {
            auto size = static_cast<size_t>(
                std::filesystem::file_size((*attributes).get().path));

            log<level::DEBUG>("sendNewLogCmd", entry("PEL_ID=0x%X", id),
                              entry("PEL_SIZE=%d", size));

            auto rc = _hostIface->sendNewLogCmd(id, size);

            if (rc == CmdStatus::success)
            {
                _inProgressPEL = id;
            }
            else
            {
                // It failed.  Retry
                log<level::ERR>("PLDM send failed", entry("PEL_ID=0x%X", id));
                _pelQueue.push_front(id);
                _inProgressPEL = 0;
                _retryTimer.restartOnce(_hostIface->getSendRetryDelay());
            }
        }
        else
        {
            log<level::ERR>("PEL ID not in repository.  Cannot notify host",
                            entry("PEL_ID=0x%X", id));
        }
    }
}

void HostNotifier::hostStateChange(bool hostUp)
{
    _retryCount = 0;
    _hostFull = false;

    if (hostUp && !_pelQueue.empty())
    {
        log<level::DEBUG>("Host state change to on");
        _hostUpTimer.restartOnce(_hostIface->getHostUpDelay());
    }
    else if (!hostUp)
    {
        log<level::DEBUG>("Host state change to off");

        stopCommand();

        // Reset the state on any PELs that were sent but not acked back
        // to new so they'll get sent again.
        for (auto id : _sentPELs)
        {
            _pelQueue.push_back(id);
            _repo.setPELHostTransState(id, TransmissionState::newPEL);
        }

        _sentPELs.clear();

        if (_hostFullTimer.isEnabled())
        {
            _hostFullTimer.setEnabled(false);
        }

        if (_hostUpTimer.isEnabled())
        {
            _hostUpTimer.setEnabled(false);
        }
    }
}

void HostNotifier::commandResponse(ResponseStatus status)
{
    auto id = _inProgressPEL;
    _inProgressPEL = 0;

    if (status == ResponseStatus::success)
    {
        log<level::DEBUG>("HostNotifier command response success",
                          entry("PEL_ID=0x%X", id));
        _retryCount = 0;

        _sentPELs.push_back(id);

        _repo.setPELHostTransState(id, TransmissionState::sent);

        // If the host is full, don't send off the next PEL
        if (!_hostFull && !_pelQueue.empty())
        {
            doNewLogNotify();
        }
    }
    else
    {
        log<level::ERR>("PLDM command response failure",
                        entry("PEL_ID=0x%X", id));
        // Retry
        _pelQueue.push_front(id);
        _retryTimer.restartOnce(_hostIface->getReceiveRetryDelay());
    }
}

void HostNotifier::retryTimerExpired()
{
    if (_dataIface.isHostUp())
    {
        log<level::INFO>("Attempting command retry",
                         entry("PEL_ID=0x%X", _pelQueue.front()));
        _retryCount++;
        doNewLogNotify();
    }
}

void HostNotifier::hostFullTimerExpired()
{
    log<level::DEBUG>("Host full timer expired, trying send again");
    doNewLogNotify();
}

void HostNotifier::stopCommand()
{
    _retryCount = 0;

    if (_inProgressPEL != 0)
    {
        _pelQueue.push_front(_inProgressPEL);
        _inProgressPEL = 0;
    }

    if (_retryTimer.isEnabled())
    {
        _retryTimer.setEnabled(false);
    }

    // Ensure the PLDM instance ID is released
    _hostIface->cancelCmd();
}

void HostNotifier::ackPEL(uint32_t id)
{
    _repo.setPELHostTransState(id, TransmissionState::acked);

    // No longer just 'sent', so remove it from the sent list.
    auto sent = std::find(_sentPELs.begin(), _sentPELs.end(), id);
    if (sent != _sentPELs.end())
    {
        _sentPELs.erase(sent);
    }

    // An ack means the host is no longer full
    if (_hostFullTimer.isEnabled())
    {
        _hostFullTimer.setEnabled(false);
    }

    if (_hostFull)
    {
        _hostFull = false;

        log<level::DEBUG>("Host previously full, not anymore after this ack");

        // Start sending PELs again, from the event loop
        if (!_pelQueue.empty())
        {
            scheduleDispatch();
        }
    }
}

void HostNotifier::setHostFull(uint32_t id)
{
    log<level::INFO>("Received Host full indication", entry("PEL_ID=0x%X", id));

    _hostFull = true;

    // This PEL needs to get re-sent
    auto sent = std::find(_sentPELs.begin(), _sentPELs.end(), id);
    if (sent != _sentPELs.end())
    {
        _sentPELs.erase(sent);
        _repo.setPELHostTransState(id, TransmissionState::newPEL);

        if (std::find(_pelQueue.begin(), _pelQueue.end(), id) ==
            _pelQueue.end())
        {
            _pelQueue.push_front(id);
        }
    }

    // The only PELs that will be sent when the
    // host is full is from this timer callback.
    if (!_hostFullTimer.isEnabled())
    {
        log<level::DEBUG>("Starting host full timer");
        _hostFullTimer.restartOnce(_hostIface->getHostFullRetryDelay());
    }
}

void HostNotifier::setBadPEL(uint32_t id)
{
    log<level::ERR>("PEL rejected by the host", entry("PEL_ID=0x%X", id));

    auto sent = std::find(_sentPELs.begin(), _sentPELs.end(), id);
    if (sent != _sentPELs.end())
    {
        _sentPELs.erase(sent);
    }

    _repo.setPELHostTransState(id, TransmissionState::badPEL);
}

} // namespace openpower::pels