xref: /openbmc/openpower-occ-control/occ_status.cpp (revision 16a5adb204d261be727c67c4ea3b64a0965303e0)
1 #include "occ_status.hpp"
2 
3 #include "occ_manager.hpp"
4 #include "occ_sensor.hpp"
5 #include "powermode.hpp"
6 #include "utils.hpp"
7 
8 #include <phosphor-logging/lg2.hpp>
9 
10 #include <filesystem>
11 
12 namespace open_power
13 {
14 namespace occ
15 {
16 
17 using namespace phosphor::logging;
18 
19 using ThrottleObj =
20     sdbusplus::xyz::openbmc_project::Control::Power::server::Throttle;
21 
22 // Handles updates to occActive property
occActive(bool value)23 bool Status::occActive(bool value)
24 {
25     if (value != this->occActive())
26     {
27         lg2::info("Status::occActive OCC{INST} changed to {STATE}", "INST",
28                   instance, "STATE", value);
29         if (value)
30         {
31             // OCC is active
32             // Clear prior throttle reason (before setting device active)
33             updateThrottle(false, THROTTLED_ALL);
34 
35             // Set the device active
36             device.setActive(true);
37 
38             // Reset last OCC state
39             lastState = 0;
40 
41             // Start watching for errors (throttles, etc)
42             try
43             {
44                 addErrorWatch();
45             }
46             catch (const OpenFailure& e)
47             {
48                 // Failed to add watch for throttle events, request reset to try
49                 // to recover comm
50                 lg2::error(
51                     "Status::occActive: Unable to add error watch(s) for OCC{INST} watch: {ERROR}",
52                     "INST", instance, "ERROR", e.what());
53                 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
54                 return Base::Status::occActive(false);
55             }
56 
57             // Update the OCC active sensor
58             Base::Status::occActive(value);
59 
60             if (device.master())
61             {
62                 // Update powercap bounds from OCC
63                 manager.updatePcapBounds();
64             }
65 
66             // Call into Manager to let know that we have bound
67             if (this->managerCallBack)
68             {
69                 this->managerCallBack(instance, value);
70             }
71         }
72         else
73         {
74             // OCC is no longer active
75             if (sensorsValid)
76             {
77                 sensorsValid = false;
78                 // Sensors not supported (update to NaN and not functional)
79                 manager.setSensorValueToNaN(instance);
80             }
81 
82             if (pmode && device.master())
83             {
84                 // Prevent mode changes
85                 pmode->setMasterActive(false);
86             }
87             if (safeStateDelayTimer.isEnabled())
88             {
89                 // stop safe delay timer
90                 safeStateDelayTimer.setEnabled(false);
91             }
92 
93             // Call into Manager to let know that we will unbind.
94             if (this->managerCallBack)
95             {
96                 this->managerCallBack(instance, value);
97             }
98 
99             // Stop watching for errors
100             removeErrorWatch();
101 
102             // Set the device inactive
103             device.setActive(false);
104 
105             // Clear throttles (OCC not active after disabling device)
106             updateThrottle(false, THROTTLED_ALL);
107         }
108     }
109     else if (value && !device.active())
110     {
111         // Existing error watch is on a dead file descriptor.
112         removeErrorWatch();
113 
114         /*
115          * In it's constructor, Status checks Device::bound() to see if OCC is
116          * active or not.
117          * Device::bound() checks for occX-dev0 directory.
118          * We will lose occX-dev0 directories during FSI rescan.
119          * So, if we start this application (and construct Status), and then
120          * later do FSI rescan, we will end up with occActive = true and device
121          * NOT bound. Lets correct that situation here.
122          */
123         device.setActive(true);
124 
125         // Add error watch again
126         try
127         {
128             addErrorWatch();
129         }
130         catch (const OpenFailure& e)
131         {
132             // Failed to add watch for throttle events, request reset to try to
133             // recover comm
134             lg2::error(
135                 "Status::occActive: Unable to add error watch(s) again for OCC{INST} watch: {ERROR}",
136                 "INST", instance, "ERROR", e.what());
137             deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
138             return Base::Status::occActive(false);
139         }
140     }
141     else if (!value && device.active())
142     {
143         removeErrorWatch();
144 
145         // In the event that the application never receives the active signal
146         // even though the OCC is active (this can occur if the BMC is rebooted
147         // with the host on, since the initial OCC driver probe will discover
148         // the OCCs), this application needs to be able to unbind the device
149         // when we get the OCC inactive signal.
150         device.setActive(false);
151     }
152     return Base::Status::occActive(value);
153 }
154 
155 // Callback handler when a device error is reported.
deviceError(Error::Descriptor d)156 void Status::deviceError(Error::Descriptor d)
157 {
158     if (pmode && device.master())
159     {
160         // Prevent mode changes
161         pmode->setMasterActive(false);
162     }
163 
164     if (d.log)
165     {
166         FFDC::createOCCResetPEL(instance, d.path, d.err, d.callout,
167                                 d.isInventoryCallout);
168     }
169 
170     // This would deem OCC inactive
171     this->occActive(false);
172 
173     // Reset the OCC
174     this->resetOCC();
175 }
176 
177 // Sends message to host control command handler to reset OCC
resetOCC()178 void Status::resetOCC()
179 {
180     lg2::info(">>Status::resetOCC() - requesting reset for OCC{INST}", "INST",
181               instance);
182     this->occActive(false);
183     if (resetCallBack)
184     {
185         this->resetCallBack(instance);
186     }
187 }
188 
189 // Handler called by Host control command handler to convey the
190 // status of the executed command
hostControlEvent(sdbusplus::message_t & msg)191 void Status::hostControlEvent(sdbusplus::message_t& msg)
192 {
193     std::string cmdCompleted{};
194     std::string cmdStatus{};
195 
196     msg.read(cmdCompleted, cmdStatus);
197 
198     lg2::debug("Host control signal values: command={CMD}, status={STATUS}",
199                "CMD", cmdCompleted, "STATUS", cmdStatus);
200 
201     if (Control::Host::convertResultFromString(cmdStatus) !=
202         Control::Host::Result::Success)
203     {
204         if (Control::Host::convertCommandFromString(cmdCompleted) ==
205             Control::Host::Command::OCCReset)
206         {
207             // Must be a Timeout. Log an Error trace
208             lg2::error(
209                 "Error resetting the OCC: path={PATH}, sensorid={SENSOR}",
210                 "PATH", path, "SENSOR", std::get<0>(sensorMap.at(instance)));
211         }
212     }
213     return;
214 }
215 
216 // Called from Manager::pollerTimerExpired() in preperation to POLL OCC.
readOccState()217 void Status::readOccState()
218 {
219     if (stateValid)
220     {
221         // Reset retry count (since state is good)
222         currentOccReadRetriesCount = occReadRetries;
223     }
224     occReadStateNow();
225 }
226 
227 // Special processing that needs to happen once the OCCs change to ACTIVE state
occsWentActive()228 void Status::occsWentActive()
229 {
230     CmdStatus status = CmdStatus::SUCCESS;
231 
232     // IPS data will get sent automatically after a mode change if the mode
233     // supports it.
234     pmode->needToSendIPS();
235 
236     status = pmode->sendModeChange();
237     if (status != CmdStatus::SUCCESS)
238     {
239         lg2::error(
240             "Status::occsWentActive: OCC mode change failed with status {STATUS}",
241             "STATUS", status);
242 
243         // Disable and reset to try recovering
244         deviceError();
245     }
246 }
247 
248 // Send Ambient and Altitude to the OCC
sendAmbient(const uint8_t inTemp,const uint16_t inAltitude)249 CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude)
250 {
251     CmdStatus status = CmdStatus::FAILURE;
252     bool ambientValid = true;
253     uint8_t ambientTemp = inTemp;
254     uint16_t altitude = inAltitude;
255 
256     if (ambientTemp == 0xFF)
257     {
258         // Get latest readings from manager
259         manager.getAmbientData(ambientValid, ambientTemp, altitude);
260         lg2::debug(
261             "sendAmbient: valid: {VALID}, Ambient: {TEMP}C, altitude: {ALT}m",
262             "VALID", ambientValid, "TEMP", ambientTemp, "ALT", altitude);
263     }
264 
265     std::vector<std::uint8_t> cmd, rsp;
266     cmd.reserve(11);
267     cmd.push_back(uint8_t(CmdType::SEND_AMBIENT));
268     cmd.push_back(0x00);                    // Data Length (2 bytes)
269     cmd.push_back(0x08);                    //
270     cmd.push_back(0x00);                    // Version
271     cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status
272     cmd.push_back(ambientTemp);             // Ambient Temperature
273     cmd.push_back(altitude >> 8);           // Altitude in meters (2 bytes)
274     cmd.push_back(altitude & 0xFF);         //
275     cmd.push_back(0x00);                    // Reserved (3 bytes)
276     cmd.push_back(0x00);
277     cmd.push_back(0x00);
278     lg2::debug("sendAmbient: SEND_AMBIENT "
279                "command to OCC{INST} ({SIZE} bytes)",
280                "INST", instance, "SIZE", cmd.size());
281     status = occCmd.send(cmd, rsp);
282     if (status == CmdStatus::SUCCESS)
283     {
284         if (rsp.size() == 5)
285         {
286             if (RspStatus::SUCCESS != RspStatus(rsp[2]))
287             {
288                 lg2::error(
289                     "sendAmbient: SEND_AMBIENT failed with rspStatus {STATUS}",
290                     "STATUS", lg2::hex, rsp[2]);
291                 dump_hex(rsp);
292                 status = CmdStatus::FAILURE;
293             }
294         }
295         else
296         {
297             lg2::error(
298                 "sendAmbient: INVALID SEND_AMBIENT response length:{SIZE}",
299                 "SIZE", rsp.size());
300             dump_hex(rsp);
301             status = CmdStatus::FAILURE;
302         }
303     }
304     else
305     {
306         lg2::error("sendAmbient: SEND_AMBIENT FAILED! with status {STATUS}",
307                    "STATUS", lg2::hex, uint8_t(status));
308 
309         if (status == CmdStatus::COMM_FAILURE)
310         {
311             // Disable due to OCC comm failure and reset to try recovering
312             deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
313         }
314     }
315 
316     return status;
317 }
318 
319 // Called when safe timer expires to determine if OCCs need to be reset
safeStateDelayExpired()320 void Status::safeStateDelayExpired()
321 {
322     if (this->occActive())
323     {
324         lg2::info(
325             "safeStateDelayExpired: OCC{INST} state missing or not valid, requesting reset",
326             "INST", instance);
327         // Disable and reset to try recovering
328         deviceError(Error::Descriptor(SAFE_ERROR_PATH));
329     }
330 }
331 
getHwmonPath()332 fs::path Status::getHwmonPath()
333 {
334     using namespace std::literals::string_literals;
335 
336     if (!fs::exists(hwmonPath))
337     {
338         static bool tracedFail[8] = {0};
339 
340         if (!hwmonPath.empty())
341         {
342             lg2::warning(
343                 "Status::getHwmonPath(): path no longer exists: {PATH}", "PATH",
344                 hwmonPath);
345             hwmonPath.clear();
346         }
347 
348         // Build the base HWMON path
349         fs::path prefixPath =
350             fs::path{OCC_HWMON_PATH + "occ-hwmon."s +
351                      std::to_string(instance + 1) + "/hwmon/"s};
352 
353         // Get the hwmonXX directory name
354         try
355         {
356             // there should only be one directory
357             const int numDirs = std::distance(
358                 fs::directory_iterator(prefixPath), fs::directory_iterator{});
359             if (numDirs == 1)
360             {
361                 hwmonPath = *fs::directory_iterator(prefixPath);
362                 tracedFail[instance] = false;
363             }
364             else
365             {
366                 if (!tracedFail[instance])
367                 {
368                     lg2::error(
369                         "Status::getHwmonPath(): Found multiple ({NUM}) hwmon paths!",
370                         "NUM", numDirs);
371                     tracedFail[instance] = true;
372                 }
373             }
374         }
375         catch (const fs::filesystem_error& e)
376         {
377             if (!tracedFail[instance])
378             {
379                 lg2::error(
380                     "Status::getHwmonPath(): error accessing {PATH}: {ERROR}",
381                     "PATH", prefixPath, "ERROR", e.what());
382                 tracedFail[instance] = true;
383             }
384         }
385     }
386 
387     return hwmonPath;
388 }
389 
390 // Called to read state and handle any errors
occReadStateNow()391 void Status::occReadStateNow()
392 {
393     unsigned int state;
394     const fs::path filename =
395         fs::path(DEV_PATH) /
396         fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
397 
398     std::ifstream file;
399     bool stateWasRead = false;
400 
401     // open file.
402     file.open(filename, std::ios::in);
403     const int openErrno = errno;
404 
405     // File is open and state can be used.
406     if (file.is_open() && file.good())
407     {
408         stateWasRead = true;
409         file >> state;
410         // Read the error code (if any) to check status of the read
411         std::ios_base::iostate readState = file.rdstate();
412         if (readState)
413         {
414             // There was a failure reading the file
415             if (lastOccReadStatus != -1)
416             {
417                 // Trace error bits
418                 std::string errorBits = "";
419                 if (readState & std::ios_base::eofbit)
420                 {
421                     errorBits += " EOF";
422                 }
423                 if (readState & std::ios_base::failbit)
424                 {
425                     errorBits += " failbit";
426                 }
427                 if (readState & std::ios_base::badbit)
428                 {
429                     errorBits += " badbit";
430                 }
431                 lg2::error(
432                     "readOccState: Failed to read OCC{INST} state: Read error on I/O operation - {ERROR}",
433                     "INST", instance, "ERROR", errorBits);
434                 lastOccReadStatus = -1;
435             }
436             stateWasRead = false;
437         }
438 
439         if (stateWasRead && (state != lastState))
440         {
441             // Trace OCC state changes
442             lg2::info(
443                 "Status::readOccState: OCC{INST} state {STATE} (lastState: {PRIOR})",
444                 "INST", instance, "STATE", lg2::hex, state, "PRIOR", lg2::hex,
445                 lastState);
446             lastState = state;
447 
448             if (OccState(state) == OccState::ACTIVE)
449             {
450                 if (pmode && device.master())
451                 {
452                     // Set the master OCC on the PowerMode object
453                     pmode->setMasterOcc(path);
454                     // Enable mode changes
455                     pmode->setMasterActive();
456 
457                     // Special processing by master OCC when it goes active
458                     occsWentActive();
459                 }
460 
461                 CmdStatus status = sendAmbient();
462                 if (status != CmdStatus::SUCCESS)
463                 {
464                     lg2::error(
465                         "readOccState: Sending Ambient failed with status {STATUS}",
466                         "STATUS", status);
467                 }
468             }
469 
470             // If OCC in known Good State.
471             if ((OccState(state) == OccState::ACTIVE) ||
472                 (OccState(state) == OccState::CHARACTERIZATION) ||
473                 (OccState(state) == OccState::OBSERVATION))
474             {
475                 // Good OCC State then sensors valid again
476                 stateValid = true;
477                 sensorsValid = true;
478 
479                 if (safeStateDelayTimer.isEnabled())
480                 {
481                     // stop safe delay timer (no longer in SAFE state)
482                     safeStateDelayTimer.setEnabled(false);
483                 }
484             }
485             else
486             {
487                 // OCC is in SAFE or some other unsupported state
488                 if (!safeStateDelayTimer.isEnabled())
489                 {
490                     lg2::error(
491                         "readOccState: Invalid OCC{INST} state of {STATE} (last state: {PRIOR}), starting safe state delay timer",
492                         "INST", instance, "STATE", lg2::hex, state, "PRIOR",
493                         lg2::hex, lastState);
494                     // start safe delay timer (before requesting reset)
495                     using namespace std::literals::chrono_literals;
496                     safeStateDelayTimer.restartOnce(60s);
497                 }
498 
499                 if (sensorsValid)
500                 {
501                     sensorsValid = false;
502                     // Sensors not supported (update to NaN and not functional)
503                     manager.setSensorValueToNaN(instance);
504                 }
505             }
506         }
507     }
508     else
509     {
510         // Unable to read state
511         stateValid = false;
512     }
513 
514     file.close();
515 
516     // if failed to read the OCC state -> Attempt retry
517     if (!stateWasRead)
518     {
519         if (sensorsValid)
520         {
521             sensorsValid = false;
522             manager.setSensorValueToNaN(instance);
523         }
524 
525         // If not able to read, OCC may be offline
526         if (openErrno != lastOccReadStatus)
527         {
528             lg2::error(
529                 "Status::readOccState: open/read failed trying to read OCC{INST} state (open errno={ERROR})",
530                 "INST", instance, "ERROR", openErrno);
531             lastOccReadStatus = openErrno;
532         }
533 
534         // See occReadRetries for number of retry attempts.
535         if (currentOccReadRetriesCount > 0)
536         {
537             --currentOccReadRetriesCount;
538         }
539         else
540         {
541             lg2::error(
542                 "readOccState: failed to read OCC{INST} state! (last state: {PRIOR})",
543                 "INST", instance, "PRIOR", lg2::hex, lastState);
544 
545             // State could not be determined, set it to NO State.
546             lastState = 0;
547 
548             // Disable the ability to send Failed actions until OCC is
549             // Active again.
550             stateValid = false;
551 
552             // Disable due to OCC comm failure and reset to try recovering
553             // (processor callout will be added)
554             deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH, ECOMM,
555                                           procPath.c_str(), true));
556 
557             // Reset retry count (for next attempt after recovery)
558             currentOccReadRetriesCount = occReadRetries;
559         }
560     }
561     else if (lastOccReadStatus != 0)
562     {
563         lg2::info("readOccState: successfully read OCC{INST} state: {STATE}",
564                   "INST", instance, "STATE", state);
565         lastOccReadStatus = 0; // no error
566     }
567 }
568 
569 // Update processor throttle status on dbus
updateThrottle(const bool isThrottled,const uint8_t newReason)570 void Status::updateThrottle(const bool isThrottled, const uint8_t newReason)
571 {
572     if (!throttleHandle)
573     {
574         return;
575     }
576 
577     uint8_t newThrottleCause = throttleCause;
578 
579     if (isThrottled) // throttled due to newReason
580     {
581         if ((newReason & throttleCause) == 0)
582         {
583             // set the bit(s) for passed in reason
584             newThrottleCause |= newReason;
585         }
586         // else no change
587     }
588     else // no longer throttled due to newReason
589     {
590         if ((newReason & throttleCause) != 0)
591         {
592             // clear the bit(s) for passed in reason
593             newThrottleCause &= ~newReason;
594         }
595         // else no change
596     }
597 
598     if (newThrottleCause != throttleCause)
599     {
600         if (newThrottleCause == THROTTLED_NONE)
601         {
602             lg2::debug(
603                 "updateThrottle: OCC{INST} no longer throttled (prior reason: {REASON})",
604                 "INST", instance, "REASON", throttleCause);
605             throttleCause = THROTTLED_NONE;
606             throttleHandle->throttled(false);
607             throttleHandle->throttleCauses({});
608         }
609         else
610         {
611             lg2::debug(
612                 "updateThrottle: OCC{INST} is throttled with reason {REASON} (prior reason: {PRIOR})",
613                 "INST", instance, "REASON", newThrottleCause, "PRIOR",
614                 throttleCause);
615             throttleCause = newThrottleCause;
616 
617             std::vector<ThrottleObj::ThrottleReasons> updatedCauses;
618             if (throttleCause & THROTTLED_POWER)
619             {
620                 updatedCauses.push_back(
621                     throttleHandle->ThrottleReasons::PowerLimit);
622             }
623             if (throttleCause & THROTTLED_THERMAL)
624             {
625                 updatedCauses.push_back(
626                     throttleHandle->ThrottleReasons::ThermalLimit);
627             }
628             if (throttleCause & THROTTLED_SAFE)
629             {
630                 updatedCauses.push_back(
631                     throttleHandle->ThrottleReasons::ManagementDetectedFault);
632             }
633             throttleHandle->throttleCauses(updatedCauses);
634             throttleHandle->throttled(true);
635         }
636     }
637     // else no change to throttle status
638 }
639 
640 // Get processor path associated with this OCC
readProcAssociation()641 void Status::readProcAssociation()
642 {
643     std::string managingPath = path + "/power_managing";
644     lg2::debug("readProcAssociation: getting endpoints for {MANAGE} ({PATH})",
645                "MANAGE", managingPath, "PATH", path);
646     try
647     {
648         utils::PropertyValue procPathProperty{};
649         procPathProperty = utils::getProperty(
650             managingPath, "xyz.openbmc_project.Association", "endpoints");
651         auto result = std::get<std::vector<std::string>>(procPathProperty);
652         if (result.size() > 0)
653         {
654             procPath = result[0];
655             lg2::info("readProcAssociation: OCC{INST} has proc={PATH}", "INST",
656                       instance, "PATH", procPath);
657         }
658         else
659         {
660             lg2::error(
661                 "readProcAssociation: No processor associated with OCC{INST} / {PATH}",
662                 "INST", instance, "PATH", path);
663         }
664     }
665     catch (const sdbusplus::exception_t& e)
666     {
667         lg2::error(
668             "readProcAssociation: Unable to get proc assocated with {PATH} - {ERROR}",
669             "PATH", path, "ERROR", e.what());
670         procPath = {};
671     }
672 }
673 
674 } // namespace occ
675 } // namespace open_power
676