xref: /openbmc/openpower-occ-control/occ_status.cpp (revision 7bd6d148e4f204266fe8ffd3df8aa93cf0284cd0)
1 #include "occ_status.hpp"
2 
3 #include "occ_manager.hpp"
4 #include "occ_sensor.hpp"
5 #include "powermode.hpp"
6 #include "utils.hpp"
7 
8 #include <fmt/core.h>
9 
10 #ifdef POWER10
11 #include <com/ibm/Host/Target/server.hpp>
12 #endif
13 #include <phosphor-logging/log.hpp>
14 
15 #include <filesystem>
16 
17 namespace open_power
18 {
19 namespace occ
20 {
21 
22 using namespace phosphor::logging;
23 
24 // Handles updates to occActive property
25 bool Status::occActive(bool value)
26 {
27     if (value != this->occActive())
28     {
29         log<level::INFO>(fmt::format("Status::occActive OCC{} changed to {}",
30                                      instance, value)
31                              .c_str());
32         if (value)
33         {
34             // Set the device active
35             device.setActive(true);
36 
37             // Update the OCC active sensor
38             Base::Status::occActive(value);
39 
40             // Start watching for errors
41             addErrorWatch();
42 
43             // Reset last OCC state
44             lastState = 0;
45 
46             if (device.master())
47             {
48                 // Update powercap bounds from OCC
49                 manager.updatePcapBounds();
50             }
51 
52             // Call into Manager to let know that we have bound
53             if (this->managerCallBack)
54             {
55                 this->managerCallBack(instance, value);
56             }
57         }
58         else
59         {
60 #ifdef POWER10
61             if (pmode && device.master())
62             {
63                 // Prevent mode changes
64                 pmode->setMasterActive(false);
65             }
66             if (safeStateDelayTimer.isEnabled())
67             {
68                 // stop safe delay timer
69                 safeStateDelayTimer.setEnabled(false);
70             }
71 #endif
72             // Call into Manager to let know that we will unbind.
73             if (this->managerCallBack)
74             {
75                 this->managerCallBack(instance, value);
76             }
77 
78             // Stop watching for errors
79             removeErrorWatch();
80 
81             // Set the device inactive
82             device.setActive(false);
83         }
84     }
85     else if (value && !device.active())
86     {
87         // Existing error watch is on a dead file descriptor.
88         removeErrorWatch();
89 
90         /*
91          * In it's constructor, Status checks Device::bound() to see if OCC is
92          * active or not.
93          * Device::bound() checks for occX-dev0 directory.
94          * We will lose occX-dev0 directories during FSI rescan.
95          * So, if we start this application (and construct Status), and then
96          * later do FSI rescan, we will end up with occActive = true and device
97          * NOT bound. Lets correct that situation here.
98          */
99         device.setActive(true);
100 
101         // Add error watch again
102         addErrorWatch();
103     }
104     else if (!value && device.active())
105     {
106         removeErrorWatch();
107 
108         // In the event that the application never receives the active signal
109         // even though the OCC is active (this can occur if the BMC is rebooted
110         // with the host on, since the initial OCC driver probe will discover
111         // the OCCs), this application needs to be able to unbind the device
112         // when we get the OCC inactive signal.
113         device.setActive(false);
114     }
115     return Base::Status::occActive(value);
116 }
117 
118 // Callback handler when a device error is reported.
119 void Status::deviceError(Error::Descriptor d)
120 {
121 #ifdef POWER10
122     if (pmode && device.master())
123     {
124         // Prevent mode changes
125         pmode->setMasterActive(false);
126     }
127 #endif
128 
129     if (d.log)
130     {
131         FFDC::createOCCResetPEL(instance, d.path, d.err, d.callout);
132     }
133 
134     // This would deem OCC inactive
135     this->occActive(false);
136 
137     // Reset the OCC
138     this->resetOCC();
139 }
140 
141 // Sends message to host control command handler to reset OCC
142 void Status::resetOCC()
143 {
144     log<level::INFO>(
145         fmt::format(">>Status::resetOCC() - requesting reset for OCC{}",
146                     instance)
147             .c_str());
148 #ifdef PLDM
149     if (resetCallBack)
150     {
151         this->resetCallBack(instance);
152     }
153 #else
154     constexpr auto CONTROL_HOST_PATH = "/org/open_power/control/host0";
155     constexpr auto CONTROL_HOST_INTF = "org.open_power.Control.Host";
156 
157     // This will throw exception on failure
158     auto service = utils::getService(CONTROL_HOST_PATH, CONTROL_HOST_INTF);
159 
160     auto& bus = utils::getBus();
161     auto method = bus.new_method_call(service.c_str(), CONTROL_HOST_PATH,
162                                       CONTROL_HOST_INTF, "Execute");
163     // OCC Reset control command
164     method.append(convertForMessage(Control::Host::Command::OCCReset).c_str());
165 
166     // OCC Sensor ID for callout reasons
167     method.append(std::variant<uint8_t>(std::get<0>(sensorMap.at(instance))));
168     bus.call_noreply(method);
169     return;
170 #endif
171 }
172 
173 // Handler called by Host control command handler to convey the
174 // status of the executed command
175 void Status::hostControlEvent(sdbusplus::message_t& msg)
176 {
177     std::string cmdCompleted{};
178     std::string cmdStatus{};
179 
180     msg.read(cmdCompleted, cmdStatus);
181 
182     log<level::DEBUG>("Host control signal values",
183                       entry("COMMAND=%s", cmdCompleted.c_str()),
184                       entry("STATUS=%s", cmdStatus.c_str()));
185 
186     if (Control::Host::convertResultFromString(cmdStatus) !=
187         Control::Host::Result::Success)
188     {
189         if (Control::Host::convertCommandFromString(cmdCompleted) ==
190             Control::Host::Command::OCCReset)
191         {
192             // Must be a Timeout. Log an Error trace
193             log<level::ERR>(
194                 "Error resetting the OCC.", entry("PATH=%s", path.c_str()),
195                 entry("SENSORID=0x%X", std::get<0>(sensorMap.at(instance))));
196         }
197     }
198     return;
199 }
200 
201 // Called from Manager::pollerTimerExpired() in preperation to POLL OCC.
202 void Status::readOccState()
203 {
204     currentOccReadRetriesCount = occReadRetries;
205     occReadStateNow();
206 }
207 
208 #ifdef POWER10
209 // Special processing that needs to happen once the OCCs change to ACTIVE state
210 void Status::occsWentActive()
211 {
212     CmdStatus status = CmdStatus::SUCCESS;
213 
214     status = pmode->sendModeChange();
215     if (status != CmdStatus::SUCCESS)
216     {
217         log<level::ERR>(
218             fmt::format(
219                 "Status::occsWentActive: OCC mode change failed with status {}",
220                 status)
221                 .c_str());
222 
223         // Disable and reset to try recovering
224         deviceError();
225     }
226 
227     status = pmode->sendIpsData();
228     if (status != CmdStatus::SUCCESS)
229     {
230         log<level::ERR>(
231             fmt::format(
232                 "Status::occsWentActive: Sending Idle Power Save Config data failed with status {}",
233                 status)
234                 .c_str());
235 
236         if (status == CmdStatus::COMM_FAILURE)
237         {
238             // Disable and reset to try recovering
239             deviceError();
240         }
241     }
242 }
243 
244 // Send Ambient and Altitude to the OCC
245 CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude)
246 {
247     CmdStatus status = CmdStatus::FAILURE;
248     bool ambientValid = true;
249     uint8_t ambientTemp = inTemp;
250     uint16_t altitude = inAltitude;
251 
252     if (ambientTemp == 0xFF)
253     {
254         // Get latest readings from manager
255         manager.getAmbientData(ambientValid, ambientTemp, altitude);
256         log<level::DEBUG>(
257             fmt::format("sendAmbient: valid: {}, Ambient: {}C, altitude: {}m",
258                         ambientValid, ambientTemp, altitude)
259                 .c_str());
260     }
261 
262     std::vector<std::uint8_t> cmd, rsp;
263     cmd.reserve(11);
264     cmd.push_back(uint8_t(CmdType::SEND_AMBIENT));
265     cmd.push_back(0x00);                    // Data Length (2 bytes)
266     cmd.push_back(0x08);                    //
267     cmd.push_back(0x00);                    // Version
268     cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status
269     cmd.push_back(ambientTemp);             // Ambient Temperature
270     cmd.push_back(altitude >> 8);           // Altitude in meters (2 bytes)
271     cmd.push_back(altitude & 0xFF);         //
272     cmd.push_back(0x00);                    // Reserved (3 bytes)
273     cmd.push_back(0x00);
274     cmd.push_back(0x00);
275     log<level::DEBUG>(fmt::format("sendAmbient: SEND_AMBIENT "
276                                   "command to OCC{} ({} bytes)",
277                                   instance, cmd.size())
278                           .c_str());
279     status = occCmd.send(cmd, rsp);
280     if (status == CmdStatus::SUCCESS)
281     {
282         if (rsp.size() == 5)
283         {
284             if (RspStatus::SUCCESS != RspStatus(rsp[2]))
285             {
286                 log<level::ERR>(
287                     fmt::format(
288                         "sendAmbient: SEND_AMBIENT failed with rspStatus 0x{:02X}",
289                         rsp[2])
290                         .c_str());
291                 dump_hex(rsp);
292                 status = CmdStatus::FAILURE;
293             }
294         }
295         else
296         {
297             log<level::ERR>(
298                 fmt::format(
299                     "sendAmbient: INVALID SEND_AMBIENT response length:{}",
300                     rsp.size())
301                     .c_str());
302             dump_hex(rsp);
303             status = CmdStatus::FAILURE;
304         }
305     }
306     else
307     {
308         log<level::ERR>(
309             fmt::format(
310                 "sendAmbient: SEND_AMBIENT FAILED! with status 0x{:02X}",
311                 status)
312                 .c_str());
313 
314         if (status == CmdStatus::COMM_FAILURE)
315         {
316             // Disable and reset to try recovering
317             deviceError();
318         }
319     }
320 
321     return status;
322 }
323 
324 // Called when safe timer expires to determine if OCCs need to be reset
325 void Status::safeStateDelayExpired()
326 {
327     if (this->occActive())
328     {
329         log<level::INFO>(
330             fmt::format(
331                 "safeStateDelayExpired: OCC{} is in SAFE state, requesting reset",
332                 instance)
333                 .c_str());
334         // Disable and reset to try recovering
335         deviceError(Error::Descriptor(SAFE_ERROR_PATH));
336     }
337 }
338 #endif // POWER10
339 
340 fs::path Status::getHwmonPath()
341 {
342     using namespace std::literals::string_literals;
343 
344     if (!fs::exists(hwmonPath))
345     {
346         static bool tracedFail[8] = {0};
347 
348         if (!hwmonPath.empty())
349         {
350             log<level::ERR>(
351                 fmt::format("Status::getHwmonPath(): path no longer exists: {}",
352                             hwmonPath.c_str())
353                     .c_str());
354             hwmonPath.clear();
355         }
356 
357         // Build the base HWMON path
358         fs::path prefixPath =
359             fs::path{OCC_HWMON_PATH + "occ-hwmon."s +
360                      std::to_string(instance + 1) + "/hwmon/"s};
361 
362         // Get the hwmonXX directory name
363         try
364         {
365             // there should only be one directory
366             const int numDirs = std::distance(
367                 fs::directory_iterator(prefixPath), fs::directory_iterator{});
368             if (numDirs == 1)
369             {
370                 hwmonPath = *fs::directory_iterator(prefixPath);
371                 tracedFail[instance] = false;
372             }
373             else
374             {
375                 if (!tracedFail[instance])
376                 {
377                     log<level::ERR>(
378                         fmt::format(
379                             "Status::getHwmonPath(): Found multiple ({}) hwmon paths!",
380                             numDirs)
381                             .c_str());
382                     tracedFail[instance] = true;
383                 }
384             }
385         }
386         catch (const fs::filesystem_error& e)
387         {
388             if (!tracedFail[instance])
389             {
390                 log<level::ERR>(
391                     fmt::format(
392                         "Status::getHwmonPath(): error accessing {}: {}",
393                         prefixPath.c_str(), e.what())
394                         .c_str());
395                 tracedFail[instance] = true;
396             }
397         }
398     }
399 
400     return hwmonPath;
401 }
402 
403 // Called to read state and upon failure to read after occReadStateFailTimer.
404 void Status::occReadStateNow()
405 {
406     unsigned int state;
407     const fs::path filename =
408         fs::path(DEV_PATH) /
409         fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
410 
411     std::ifstream file;
412     bool goodFile = false;
413 
414     // open file.
415     file.open(filename, std::ios::in);
416     const int openErrno = errno;
417 
418     // File is open and state can be used.
419     if (file.is_open() && file.good())
420     {
421         goodFile = true;
422         file >> state;
423 
424         if (state != lastState)
425         {
426             // Trace OCC state changes
427             log<level::INFO>(
428                 fmt::format(
429                     "Status::readOccState: OCC{} state 0x{:02X} (lastState: 0x{:02X})",
430                     instance, state, lastState)
431                     .c_str());
432             lastState = state;
433 #ifdef POWER10
434             if (OccState(state) == OccState::ACTIVE)
435             {
436                 if (pmode && device.master())
437                 {
438                     // Set the master OCC on the PowerMode object
439                     pmode->setMasterOcc(path);
440                     // Enable mode changes
441                     pmode->setMasterActive();
442 
443                     // Special processing by master OCC when it goes active
444                     occsWentActive();
445                 }
446 
447                 CmdStatus status = sendAmbient();
448                 if (status != CmdStatus::SUCCESS)
449                 {
450                     log<level::ERR>(
451                         fmt::format(
452                             "readOccState: Sending Ambient failed with status {}",
453                             status)
454                             .c_str());
455                 }
456             }
457 
458             // If OCC in known Good State.
459             if ((OccState(state) == OccState::ACTIVE) ||
460                 (OccState(state) == OccState::CHARACTERIZATION) ||
461                 (OccState(state) == OccState::OBSERVATION))
462             {
463                 // Good OCC State then sensors valid again
464                 stateValid = true;
465 
466                 if (safeStateDelayTimer.isEnabled())
467                 {
468                     // stop safe delay timer (no longer in SAFE state)
469                     safeStateDelayTimer.setEnabled(false);
470                 }
471             }
472             // Else not Valid state We would be in SAFE mode.
473             // This captures both SAFE mode, and 0x00, or other invalid
474             // state values.
475             else
476             {
477                 if (!safeStateDelayTimer.isEnabled())
478                 {
479                     // start safe delay timer (before requesting reset)
480                     using namespace std::literals::chrono_literals;
481                     safeStateDelayTimer.restartOnce(60s);
482                 }
483                 // Not valid state, update sensors to Nan & not functional.
484                 stateValid = false;
485             }
486 #else
487             // Before P10 state not checked, only used good file open.
488             stateValid = true;
489 #endif
490         }
491     }
492     file.close();
493 
494     // if failed to Read a state or not a valid state -> Attempt retry
495     // after 1 Second delay if allowed.
496     if ((!goodFile) || (!stateValid))
497     {
498         if (!goodFile)
499         {
500             // If not able to read, OCC may be offline
501             log<level::ERR>(
502                 fmt::format("Status::readOccState: open failed (errno={})",
503                             openErrno)
504                     .c_str());
505         }
506         else
507         {
508             // else this failed due to state not valid.
509             if (state != lastState)
510             {
511                 log<level::ERR>(
512                     fmt::format(
513                         "Status::readOccState: OCC{} Invalid state 0x{:02X} (last state: 0x{:02X})",
514                         instance, state, lastState)
515                         .c_str());
516             }
517         }
518 
519 #ifdef READ_OCC_SENSORS
520         manager.setSensorValueToNaN(instance);
521 #endif
522 
523         // See occReadRetries for number of retry attempts.
524         if (currentOccReadRetriesCount > 0)
525         {
526             --currentOccReadRetriesCount;
527 #ifdef POWER10
528             using namespace std::chrono_literals;
529             occReadStateFailTimer.restartOnce(1s);
530 #endif
531         }
532         else
533         {
534 #ifdef POWER10
535             if (!stateValid && occActive())
536             {
537                 if (!safeStateDelayTimer.isEnabled())
538                 {
539                     log<level::ERR>(
540                         "Starting 60 sec delay timer before requesting a reset");
541                     // start safe delay timer (before requesting reset)
542                     using namespace std::literals::chrono_literals;
543                     safeStateDelayTimer.restartOnce(60s);
544                 }
545             }
546 #else
547             // State could not be determined, set it to NO State.
548             lastState = 0;
549 
550             // Disable the ability to send Failed actions until OCC is
551             // Active again.
552             stateValid = false;
553 
554             // Disable and reset to try recovering
555             deviceError();
556 #endif
557         }
558     }
559 }
560 
561 } // namespace occ
562 } // namespace open_power
563