1 #include "occ_status.hpp"
2 
3 #include "occ_manager.hpp"
4 #include "occ_sensor.hpp"
5 #include "powermode.hpp"
6 #include "utils.hpp"
7 
8 #include <fmt/core.h>
9 
10 #include <phosphor-logging/log.hpp>
11 
12 #include <filesystem>
13 
14 namespace open_power
15 {
16 namespace occ
17 {
18 
19 using namespace phosphor::logging;
20 
21 // Handles updates to occActive property
22 bool Status::occActive(bool value)
23 {
24     if (value != this->occActive())
25     {
26         log<level::INFO>(fmt::format("Status::occActive OCC{} changed to {}",
27                                      instance, value)
28                              .c_str());
29         if (value)
30         {
31             // Set the device active
32             device.setActive(true);
33 
34             // Update the OCC active sensor
35             Base::Status::occActive(value);
36 
37             // Start watching for errors
38             addErrorWatch();
39 
40             // Reset last OCC state
41             lastState = 0;
42 
43             if (device.master())
44             {
45                 // Update powercap bounds from OCC
46                 manager.updatePcapBounds();
47             }
48 
49             // Call into Manager to let know that we have bound
50             if (this->managerCallBack)
51             {
52                 this->managerCallBack(instance, value);
53             }
54         }
55         else
56         {
57 #ifdef POWER10
58             if (pmode && device.master())
59             {
60                 // Prevent mode changes
61                 pmode->setMasterActive(false);
62             }
63             if (safeStateDelayTimer.isEnabled())
64             {
65                 // stop safe delay timer
66                 safeStateDelayTimer.setEnabled(false);
67             }
68 #endif
69             // Call into Manager to let know that we will unbind.
70             if (this->managerCallBack)
71             {
72                 this->managerCallBack(instance, value);
73             }
74 
75             // Stop watching for errors
76             removeErrorWatch();
77 
78             // Set the device inactive
79             device.setActive(false);
80         }
81     }
82     else if (value && !device.active())
83     {
84         // Existing error watch is on a dead file descriptor.
85         removeErrorWatch();
86 
87         /*
88          * In it's constructor, Status checks Device::bound() to see if OCC is
89          * active or not.
90          * Device::bound() checks for occX-dev0 directory.
91          * We will lose occX-dev0 directories during FSI rescan.
92          * So, if we start this application (and construct Status), and then
93          * later do FSI rescan, we will end up with occActive = true and device
94          * NOT bound. Lets correct that situation here.
95          */
96         device.setActive(true);
97 
98         // Add error watch again
99         addErrorWatch();
100     }
101     else if (!value && device.active())
102     {
103         removeErrorWatch();
104 
105         // In the event that the application never receives the active signal
106         // even though the OCC is active (this can occur if the BMC is rebooted
107         // with the host on, since the initial OCC driver probe will discover
108         // the OCCs), this application needs to be able to unbind the device
109         // when we get the OCC inactive signal.
110         device.setActive(false);
111     }
112     return Base::Status::occActive(value);
113 }
114 
115 // Callback handler when a device error is reported.
116 void Status::deviceError(Error::Descriptor d)
117 {
118 #ifdef POWER10
119     if (pmode && device.master())
120     {
121         // Prevent mode changes
122         pmode->setMasterActive(false);
123     }
124 #endif
125 
126     if (d.log)
127     {
128         FFDC::createOCCResetPEL(instance, d.path, d.err, d.callout);
129     }
130 
131     // This would deem OCC inactive
132     this->occActive(false);
133 
134     // Reset the OCC
135     this->resetOCC();
136 }
137 
138 // Sends message to host control command handler to reset OCC
139 void Status::resetOCC()
140 {
141     log<level::INFO>(
142         fmt::format(">>Status::resetOCC() - requesting reset for OCC{}",
143                     instance)
144             .c_str());
145 #ifdef PLDM
146     if (resetCallBack)
147     {
148         this->resetCallBack(instance);
149     }
150 #else
151     constexpr auto CONTROL_HOST_PATH = "/org/open_power/control/host0";
152     constexpr auto CONTROL_HOST_INTF = "org.open_power.Control.Host";
153 
154     // This will throw exception on failure
155     auto service = utils::getService(CONTROL_HOST_PATH, CONTROL_HOST_INTF);
156 
157     auto& bus = utils::getBus();
158     auto method = bus.new_method_call(service.c_str(), CONTROL_HOST_PATH,
159                                       CONTROL_HOST_INTF, "Execute");
160     // OCC Reset control command
161     method.append(convertForMessage(Control::Host::Command::OCCReset).c_str());
162 
163     // OCC Sensor ID for callout reasons
164     method.append(std::variant<uint8_t>(std::get<0>(sensorMap.at(instance))));
165     bus.call_noreply(method);
166     return;
167 #endif
168 }
169 
170 // Handler called by Host control command handler to convey the
171 // status of the executed command
172 void Status::hostControlEvent(sdbusplus::message_t& msg)
173 {
174     std::string cmdCompleted{};
175     std::string cmdStatus{};
176 
177     msg.read(cmdCompleted, cmdStatus);
178 
179     log<level::DEBUG>("Host control signal values",
180                       entry("COMMAND=%s", cmdCompleted.c_str()),
181                       entry("STATUS=%s", cmdStatus.c_str()));
182 
183     if (Control::Host::convertResultFromString(cmdStatus) !=
184         Control::Host::Result::Success)
185     {
186         if (Control::Host::convertCommandFromString(cmdCompleted) ==
187             Control::Host::Command::OCCReset)
188         {
189             // Must be a Timeout. Log an Error trace
190             log<level::ERR>(
191                 "Error resetting the OCC.", entry("PATH=%s", path.c_str()),
192                 entry("SENSORID=0x%X", std::get<0>(sensorMap.at(instance))));
193         }
194     }
195     return;
196 }
197 
198 // Called from Manager::pollerTimerExpired() in preperation to POLL OCC.
199 void Status::readOccState()
200 {
201     currentOccReadRetriesCount = occReadRetries;
202     occReadStateNow();
203 }
204 
205 #ifdef POWER10
206 // Special processing that needs to happen once the OCCs change to ACTIVE state
207 void Status::occsWentActive()
208 {
209     CmdStatus status = CmdStatus::SUCCESS;
210 
211     status = pmode->sendModeChange();
212     if (status != CmdStatus::SUCCESS)
213     {
214         log<level::ERR>(
215             fmt::format(
216                 "Status::occsWentActive: OCC mode change failed with status {}",
217                 status)
218                 .c_str());
219 
220         // Disable and reset to try recovering
221         deviceError();
222     }
223 
224     status = pmode->sendIpsData();
225     if (status != CmdStatus::SUCCESS)
226     {
227         log<level::ERR>(
228             fmt::format(
229                 "Status::occsWentActive: Sending Idle Power Save Config data failed with status {}",
230                 status)
231                 .c_str());
232 
233         if (status == CmdStatus::COMM_FAILURE)
234         {
235             // Disable and reset to try recovering
236             deviceError();
237         }
238     }
239 }
240 
241 // Send Ambient and Altitude to the OCC
242 CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude)
243 {
244     CmdStatus status = CmdStatus::FAILURE;
245     bool ambientValid = true;
246     uint8_t ambientTemp = inTemp;
247     uint16_t altitude = inAltitude;
248 
249     if (ambientTemp == 0xFF)
250     {
251         // Get latest readings from manager
252         manager.getAmbientData(ambientValid, ambientTemp, altitude);
253         log<level::DEBUG>(
254             fmt::format("sendAmbient: valid: {}, Ambient: {}C, altitude: {}m",
255                         ambientValid, ambientTemp, altitude)
256                 .c_str());
257     }
258 
259     std::vector<std::uint8_t> cmd, rsp;
260     cmd.reserve(11);
261     cmd.push_back(uint8_t(CmdType::SEND_AMBIENT));
262     cmd.push_back(0x00);                    // Data Length (2 bytes)
263     cmd.push_back(0x08);                    //
264     cmd.push_back(0x00);                    // Version
265     cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status
266     cmd.push_back(ambientTemp);             // Ambient Temperature
267     cmd.push_back(altitude >> 8);           // Altitude in meters (2 bytes)
268     cmd.push_back(altitude & 0xFF);         //
269     cmd.push_back(0x00);                    // Reserved (3 bytes)
270     cmd.push_back(0x00);
271     cmd.push_back(0x00);
272     log<level::DEBUG>(fmt::format("sendAmbient: SEND_AMBIENT "
273                                   "command to OCC{} ({} bytes)",
274                                   instance, cmd.size())
275                           .c_str());
276     status = occCmd.send(cmd, rsp);
277     if (status == CmdStatus::SUCCESS)
278     {
279         if (rsp.size() == 5)
280         {
281             if (RspStatus::SUCCESS != RspStatus(rsp[2]))
282             {
283                 log<level::ERR>(
284                     fmt::format(
285                         "sendAmbient: SEND_AMBIENT failed with rspStatus 0x{:02X}",
286                         rsp[2])
287                         .c_str());
288                 dump_hex(rsp);
289                 status = CmdStatus::FAILURE;
290             }
291         }
292         else
293         {
294             log<level::ERR>(
295                 fmt::format(
296                     "sendAmbient: INVALID SEND_AMBIENT response length:{}",
297                     rsp.size())
298                     .c_str());
299             dump_hex(rsp);
300             status = CmdStatus::FAILURE;
301         }
302     }
303     else
304     {
305         log<level::ERR>(
306             fmt::format(
307                 "sendAmbient: SEND_AMBIENT FAILED! with status 0x{:02X}",
308                 status)
309                 .c_str());
310 
311         if (status == CmdStatus::COMM_FAILURE)
312         {
313             // Disable and reset to try recovering
314             deviceError();
315         }
316     }
317 
318     return status;
319 }
320 
321 // Called when safe timer expires to determine if OCCs need to be reset
322 void Status::safeStateDelayExpired()
323 {
324     if (this->occActive())
325     {
326         log<level::INFO>(
327             fmt::format(
328                 "safeStateDelayExpired: OCC{} is in SAFE state, requesting reset",
329                 instance)
330                 .c_str());
331         // Disable and reset to try recovering
332         deviceError(Error::Descriptor(SAFE_ERROR_PATH));
333     }
334 }
335 #endif // POWER10
336 
337 fs::path Status::getHwmonPath()
338 {
339     using namespace std::literals::string_literals;
340 
341     if (!fs::exists(hwmonPath))
342     {
343         static bool tracedFail[8] = {0};
344 
345         if (!hwmonPath.empty())
346         {
347             log<level::ERR>(
348                 fmt::format("Status::getHwmonPath(): path no longer exists: {}",
349                             hwmonPath.c_str())
350                     .c_str());
351             hwmonPath.clear();
352         }
353 
354         // Build the base HWMON path
355         fs::path prefixPath = fs::path{OCC_HWMON_PATH + "occ-hwmon."s +
356                                        std::to_string(instance + 1) +
357                                        "/hwmon/"s};
358 
359         // Get the hwmonXX directory name
360         try
361         {
362             // there should only be one directory
363             const int numDirs = std::distance(
364                 fs::directory_iterator(prefixPath), fs::directory_iterator{});
365             if (numDirs == 1)
366             {
367                 hwmonPath = *fs::directory_iterator(prefixPath);
368                 tracedFail[instance] = false;
369             }
370             else
371             {
372                 if (!tracedFail[instance])
373                 {
374                     log<level::ERR>(
375                         fmt::format(
376                             "Status::getHwmonPath(): Found multiple ({}) hwmon paths!",
377                             numDirs)
378                             .c_str());
379                     tracedFail[instance] = true;
380                 }
381             }
382         }
383         catch (const fs::filesystem_error& e)
384         {
385             if (!tracedFail[instance])
386             {
387                 log<level::ERR>(
388                     fmt::format(
389                         "Status::getHwmonPath(): error accessing {}: {}",
390                         prefixPath.c_str(), e.what())
391                         .c_str());
392                 tracedFail[instance] = true;
393             }
394         }
395     }
396 
397     return hwmonPath;
398 }
399 
400 // Called to read state and upon failure to read after occReadStateFailTimer.
401 void Status::occReadStateNow()
402 {
403     unsigned int state;
404     const fs::path filename =
405         fs::path(DEV_PATH) /
406         fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
407 
408     std::ifstream file;
409     bool goodFile = false;
410 
411     // open file.
412     file.open(filename, std::ios::in);
413     const int openErrno = errno;
414 
415     // File is open and state can be used.
416     if (file.is_open() && file.good())
417     {
418         goodFile = true;
419         file >> state;
420 
421         if (state != lastState)
422         {
423             // Trace OCC state changes
424             log<level::INFO>(
425                 fmt::format(
426                     "Status::readOccState: OCC{} state 0x{:02X} (lastState: 0x{:02X})",
427                     instance, state, lastState)
428                     .c_str());
429             lastState = state;
430 #ifdef POWER10
431             if (OccState(state) == OccState::ACTIVE)
432             {
433                 if (pmode && device.master())
434                 {
435                     // Set the master OCC on the PowerMode object
436                     pmode->setMasterOcc(path);
437                     // Enable mode changes
438                     pmode->setMasterActive();
439 
440                     // Special processing by master OCC when it goes active
441                     occsWentActive();
442                 }
443 
444                 CmdStatus status = sendAmbient();
445                 if (status != CmdStatus::SUCCESS)
446                 {
447                     log<level::ERR>(
448                         fmt::format(
449                             "readOccState: Sending Ambient failed with status {}",
450                             status)
451                             .c_str());
452                 }
453             }
454 
455             // If OCC in known Good State.
456             if ((OccState(state) == OccState::ACTIVE) ||
457                 (OccState(state) == OccState::CHARACTERIZATION) ||
458                 (OccState(state) == OccState::OBSERVATION))
459             {
460                 // Good OCC State then sensors valid again
461                 stateValid = true;
462 
463                 if (safeStateDelayTimer.isEnabled())
464                 {
465                     // stop safe delay timer (no longer in SAFE state)
466                     safeStateDelayTimer.setEnabled(false);
467                 }
468             }
469             // Else not Valid state We would be in SAFE mode.
470             // This captures both SAFE mode, and 0x00, or other invalid
471             // state values.
472             else
473             {
474                 if (!safeStateDelayTimer.isEnabled())
475                 {
476                     // start safe delay timer (before requesting reset)
477                     using namespace std::literals::chrono_literals;
478                     safeStateDelayTimer.restartOnce(60s);
479                 }
480                 // Not valid state, update sensors to Nan & not functional.
481                 stateValid = false;
482             }
483 #else
484             // Before P10 state not checked, only used good file open.
485             stateValid = true;
486 #endif
487         }
488     }
489     file.close();
490 
491     // if failed to Read a state or not a valid state -> Attempt retry
492     // after 1 Second delay if allowed.
493     if ((!goodFile) || (!stateValid))
494     {
495         if (!goodFile)
496         {
497             // If not able to read, OCC may be offline
498             log<level::ERR>(
499                 fmt::format("Status::readOccState: open failed (errno={})",
500                             openErrno)
501                     .c_str());
502         }
503         else
504         {
505             // else this failed due to state not valid.
506             if (state != lastState)
507             {
508                 log<level::ERR>(
509                     fmt::format(
510                         "Status::readOccState: OCC{} Invalid state 0x{:02X} (last state: 0x{:02X})",
511                         instance, state, lastState)
512                         .c_str());
513             }
514         }
515 
516 #ifdef READ_OCC_SENSORS
517         manager.setSensorValueToNaN(instance);
518 #endif
519 
520         // See occReadRetries for number of retry attempts.
521         if (currentOccReadRetriesCount > 0)
522         {
523             --currentOccReadRetriesCount;
524 #ifdef POWER10
525             using namespace std::chrono_literals;
526             occReadStateFailTimer.restartOnce(1s);
527 #endif
528         }
529         else
530         {
531 #ifdef POWER10
532             if (!stateValid && occActive())
533             {
534                 if (!safeStateDelayTimer.isEnabled())
535                 {
536                     log<level::ERR>(
537                         "Starting 60 sec delay timer before requesting a reset");
538                     // start safe delay timer (before requesting reset)
539                     using namespace std::literals::chrono_literals;
540                     safeStateDelayTimer.restartOnce(60s);
541                 }
542             }
543 #else
544             // State could not be determined, set it to NO State.
545             lastState = 0;
546 
547             // Disable the ability to send Failed actions until OCC is
548             // Active again.
549             stateValid = false;
550 
551             // Disable and reset to try recovering
552             deviceError();
553 #endif
554         }
555     }
556 }
557 
558 } // namespace occ
559 } // namespace open_power
560