xref: /openbmc/openpower-proc-control/procedures/phal/check_host_running.cpp (revision 211f8a9a8ceac7e0d369c9b76c4ff542ccfb8ff0)
1 extern "C"
2 {
3 #include "libpdbg.h"
4 }
5 
6 #include "extensions/phal/common_utils.hpp"
7 #include "extensions/phal/create_pel.hpp"
8 #include "extensions/phal/pdbg_utils.hpp"
9 #include "p10_cfam.hpp"
10 #include "registration.hpp"
11 
12 #include <phosphor-logging/log.hpp>
13 #include <sdbusplus/bus.hpp>
14 
15 #include <cstdio>
16 #include <fstream>
17 #include <memory>
18 
19 namespace openpower
20 {
21 namespace phal
22 {
23 
24 using namespace openpower::cfam::p10;
25 using namespace phosphor::logging;
26 
27 /** Best effort function to create a BMC dump */
createBmcDump()28 void createBmcDump()
29 {
30     auto bus = sdbusplus::bus::new_default();
31 
32     auto method = bus.new_method_call(
33         "xyz.openbmc_project.Dump.Manager", "/xyz/openbmc_project/dump/bmc",
34         "xyz.openbmc_project.Dump.Create", "CreateDump");
35     method.append(
36         std::vector<
37             std::pair<std::string, std::variant<std::string, uint64_t>>>());
38     try
39     {
40         bus.call_noreply(method);
41     }
42     catch (const sdbusplus::exception_t& e)
43     {
44         log<level::ERR>("Exception raised creating BMC dump",
45                         entry("EXCEPTION=%s", e.what()));
46         // just continue, failing to collect a dump should not cause further
47         // issues in this path
48     }
49 }
50 
51 /**
52  * This is the backup plan to ensuring the host is not running before the
53  * BMC issues a power off to the system. Prior to this procedure being called,
54  * the BMC has tried all other communication mechanisms to talk with the host
55  * and they have failed. The design is that the host firmware will write the
56  * value 0xA5000001 to Mailbox scratch register 12 when they are up and running
57  * to a point where communication to the BMC is no longer required to function.
58  * On a power off or shutdown this register is cleared by the host and BMC
59  * firmware. If the BMC sees the 0xA5000001 pattern in the scratch register
60  * then it assumes the host is running and will leave power on to the system.
61  */
checkHostRunning()62 void checkHostRunning()
63 {
64     struct pdbg_target* procTarget;
65 
66     try
67     {
68         phal_init();
69     }
70     catch (const std::exception& ex)
71     {
72         // This should "never" happen so just throw the exception and let
73         // our systemd error handling process this
74         log<level::ERR>("Exception raised during init PHAL",
75                         entry("EXCEPTION=%s", ex.what()));
76         throw std::runtime_error("PHAL initialization failed");
77     }
78 
79     pdbg_for_each_class_target("proc", procTarget)
80     {
81         // Only check the primary proc
82         if (!isPrimaryProc(procTarget))
83         {
84             continue;
85         }
86 
87         uint32_t val = 0;
88         constexpr uint32_t HOST_RUNNING_INDICATION = 0xA5000001;
89         auto rc = getCFAM(procTarget, P10_SCRATCH_REG_12, val);
90         if ((rc == 0) && (val != HOST_RUNNING_INDICATION))
91         {
92             log<level::INFO>("CFAM read indicates host is not running",
93                              entry("CFAM=0x%X", val));
94             return;
95         }
96 
97         if (rc != 0)
98         {
99             // On error, we have to assume host is up so just fall through
100             // to code below
101             log<level::ERR>("CFAM read error, assume host is running");
102         }
103         else if (val == HOST_RUNNING_INDICATION)
104         {
105             // This is not good. Normal communication path to host did not work
106             // but CFAM indicates host is running.
107             log<level::ERR>("CFAM read indicates host is running");
108         }
109 
110         // Create an error so user knows system is in a bad state
111         openpower::pel::createPEL("org.open_power.PHAL.Error.HostRunning");
112 
113         // Create file for host instance and create in filesystem to
114         // indicate to services that host is running.
115         // This file is cleared by the phosphor-state-manager once the host
116         // start target completes.
117         constexpr auto HOST_RUNNING_FILE = "/run/openbmc/host@%d-on";
118         auto size = std::snprintf(nullptr, 0, HOST_RUNNING_FILE, 0);
119         size++; // null
120         std::unique_ptr<char[]> buf(new char[size]);
121         std::snprintf(buf.get(), size, HOST_RUNNING_FILE, 0);
122         std::ofstream outfile(buf.get());
123         outfile.close();
124 
125         // Try to create BMC dump for further debug
126         createBmcDump();
127 
128         return;
129     }
130 
131     // We should "never" make it here. If we did it implies no primary processor
132     // was found. Once again, rely on systemd recovery if this happens
133     log<level::ERR>("No primary processor found in checkHostRunning");
134     throw std::runtime_error("No primary processor found in checkHostRunning");
135 }
136 
137 /**
138  * The BMC is to make a best effort to clear the CFAM register used by PHYP
139  * to indicate it is running when the host is stopped. This procedure will do
140  * that.
141  */
clearHostRunning()142 void clearHostRunning()
143 {
144     struct pdbg_target* procTarget;
145     log<level::INFO>("Entering clearHostRunning");
146 
147     try
148     {
149         phal_init();
150     }
151     catch (const std::exception& ex)
152     {
153         // This should "never" happen so just throw the exception and let
154         // our systemd error handling process this
155         log<level::ERR>("Exception raised during init PHAL",
156                         entry("EXCEPTION=%s", ex.what()));
157         throw std::runtime_error("PHAL initialization failed");
158     }
159 
160     pdbg_for_each_class_target("proc", procTarget)
161     {
162         // Only check the primary proc
163         if (!isPrimaryProc(procTarget))
164         {
165             continue;
166         }
167 
168         constexpr uint32_t HOST_NOT_RUNNING_INDICATION = 0;
169         auto rc = putCFAM(procTarget, P10_SCRATCH_REG_12,
170                           HOST_NOT_RUNNING_INDICATION);
171         if (rc != 0)
172         {
173             log<level::ERR>("CFAM write to clear host running status failed");
174         }
175 
176         // It's best effort, so just return either way
177         return;
178     }
179     log<level::ERR>("No primary processor found in clearHostRunning");
180 }
181 
// Register the two procedures defined in this file under the string names
// given as the first argument.
// NOTE(review): REGISTER_PROCEDURE is declared in registration.hpp (not shown
// here); presumably it makes each function invocable by that name -- confirm
// against that header.
182 REGISTER_PROCEDURE("checkHostRunning", checkHostRunning)
183 REGISTER_PROCEDURE("clearHostRunning", clearHostRunning)
184 
185 } // namespace phal
186 } // namespace openpower
187