xref: /openbmc/openpower-occ-control/occ_status.cpp (revision 3d6cc9d24b09f1975ac6a103aa94ef99ccce4ad1)
1 #include "occ_status.hpp"
2 
3 #include "occ_manager.hpp"
4 #include "occ_sensor.hpp"
5 #include "powermode.hpp"
6 #include "utils.hpp"
7 
8 #include <phosphor-logging/lg2.hpp>
9 #include <xyz/openbmc_project/Control/Power/Throttle/server.hpp>
10 
11 #include <filesystem>
12 
13 namespace open_power
14 {
15 namespace occ
16 {
17 
18 using namespace phosphor::logging;
19 
20 using ThrottleObj =
21     sdbusplus::xyz::openbmc_project::Control::Power::server::Throttle;
22 using ThrottleReason = ThrottleObj::ThrottleReasons;
23 
24 // Handles updates to occActive property
occActive(bool value)25 bool Status::occActive(bool value)
26 {
27     if (value != this->occActive())
28     {
29         lg2::info("Status::occActive OCC{INST} changed to {STATE}", "INST",
30                   instance, "STATE", value);
31         if (value)
32         {
33             // OCC is active
34             // Clear prior throttle reason (before setting device active)
35             updateThrottle(false, THROTTLED_ALL);
36 
37             // Set the device active
38             device.setActive(true);
39 
40             // Reset last OCC state
41             lastState = 0;
42 
43             // Start watching for errors (throttles, etc)
44             try
45             {
46                 addErrorWatch();
47             }
48             catch (const OpenFailure& e)
49             {
50                 // Failed to add watch for throttle events, request reset to try
51                 // to recover comm
52                 lg2::error(
53                     "Status::occActive: Unable to add error watch(s) for OCC{INST} watch: {ERROR}",
54                     "INST", instance, "ERROR", e.what());
55                 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
56                 return Base::Status::occActive(false);
57             }
58 
59             // Update the OCC active sensor
60             Base::Status::occActive(value);
61 
62             if (device.master())
63             {
64                 // Update powercap bounds from OCC
65                 manager.updatePcapBounds();
66             }
67 
68             // Call into Manager to let know that we have bound
69             if (this->managerCallBack)
70             {
71                 this->managerCallBack(instance, value);
72             }
73         }
74         else
75         {
76             // OCC is no longer active
77             if (sensorsValid)
78             {
79                 sensorsValid = false;
80                 // Sensors not supported (update to NaN and not functional)
81                 manager.setSensorValueToNaN(instance);
82             }
83 
84             if (pmode && device.master())
85             {
86                 // Prevent mode changes
87                 pmode->setMasterActive(false);
88             }
89             if (safeStateDelayTimer.isEnabled())
90             {
91                 // stop safe delay timer
92                 safeStateDelayTimer.setEnabled(false);
93             }
94 
95             // Call into Manager to let know that we will unbind.
96             if (this->managerCallBack)
97             {
98                 this->managerCallBack(instance, value);
99             }
100 
101             // Stop watching for errors
102             removeErrorWatch();
103 
104             // Set the device inactive
105             device.setActive(false);
106 
107             // Clear throttles (OCC not active after disabling device)
108             updateThrottle(false, THROTTLED_ALL);
109         }
110     }
111     else if (value && !device.active())
112     {
113         // Existing error watch is on a dead file descriptor.
114         removeErrorWatch();
115 
116         /*
117          * In it's constructor, Status checks Device::bound() to see if OCC is
118          * active or not.
119          * Device::bound() checks for occX-dev0 directory.
120          * We will lose occX-dev0 directories during FSI rescan.
121          * So, if we start this application (and construct Status), and then
122          * later do FSI rescan, we will end up with occActive = true and device
123          * NOT bound. Lets correct that situation here.
124          */
125         device.setActive(true);
126 
127         // Add error watch again
128         try
129         {
130             addErrorWatch();
131         }
132         catch (const OpenFailure& e)
133         {
134             // Failed to add watch for throttle events, request reset to try to
135             // recover comm
136             lg2::error(
137                 "Status::occActive: Unable to add error watch(s) again for OCC{INST} watch: {ERROR}",
138                 "INST", instance, "ERROR", e.what());
139             deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
140             return Base::Status::occActive(false);
141         }
142     }
143     else if (!value && device.active())
144     {
145         removeErrorWatch();
146 
147         // In the event that the application never receives the active signal
148         // even though the OCC is active (this can occur if the BMC is rebooted
149         // with the host on, since the initial OCC driver probe will discover
150         // the OCCs), this application needs to be able to unbind the device
151         // when we get the OCC inactive signal.
152         device.setActive(false);
153     }
154     return Base::Status::occActive(value);
155 }
156 
157 // Callback handler when a device error is reported.
deviceError(Error::Descriptor d)158 void Status::deviceError(Error::Descriptor d)
159 {
160     if (pmode && device.master())
161     {
162         // Prevent mode changes
163         pmode->setMasterActive(false);
164     }
165 
166     if (d.log)
167     {
168         FFDC::createOCCResetPEL(instance, d.path, d.err, d.callout,
169                                 d.isInventoryCallout);
170     }
171 
172     // This would deem OCC inactive
173     this->occActive(false);
174 
175     // Reset the OCC
176     this->resetOCC();
177 }
178 
179 // Sends message to host control command handler to reset OCC
resetOCC()180 void Status::resetOCC()
181 {
182     lg2::info(">>Status::resetOCC() - requesting reset for OCC{INST}", "INST",
183               instance);
184     this->occActive(false);
185     if (resetCallBack)
186     {
187         this->resetCallBack(instance);
188     }
189 }
190 
191 // Handler called by Host control command handler to convey the
192 // status of the executed command
hostControlEvent(sdbusplus::message_t & msg)193 void Status::hostControlEvent(sdbusplus::message_t& msg)
194 {
195     std::string cmdCompleted{};
196     std::string cmdStatus{};
197 
198     msg.read(cmdCompleted, cmdStatus);
199 
200     lg2::debug("Host control signal values: command={CMD}, status={STATUS}",
201                "CMD", cmdCompleted, "STATUS", cmdStatus);
202 
203     if (Control::Host::convertResultFromString(cmdStatus) !=
204         Control::Host::Result::Success)
205     {
206         if (Control::Host::convertCommandFromString(cmdCompleted) ==
207             Control::Host::Command::OCCReset)
208         {
209             // Must be a Timeout. Log an Error trace
210             lg2::error(
211                 "Error resetting the OCC: path={PATH}, sensorid={SENSOR}",
212                 "PATH", path, "SENSOR", std::get<0>(sensorMap.at(instance)));
213         }
214     }
215     return;
216 }
217 
218 // Called from Manager::pollerTimerExpired() in preperation to POLL OCC.
readOccState()219 void Status::readOccState()
220 {
221     if (stateValid)
222     {
223         // Reset retry count (since state is good)
224         currentOccReadRetriesCount = occReadRetries;
225     }
226     occReadStateNow();
227 }
228 
229 // Special processing that needs to happen once the OCCs change to ACTIVE state
occsWentActive()230 void Status::occsWentActive()
231 {
232     CmdStatus status = CmdStatus::SUCCESS;
233 
234     // IPS data will get sent automatically after a mode change if the mode
235     // supports it.
236     pmode->needToSendIPS();
237 
238     status = pmode->sendModeChange();
239     if (status != CmdStatus::SUCCESS)
240     {
241         lg2::error(
242             "Status::occsWentActive: OCC mode change failed with status {STATUS}",
243             "STATUS", status);
244 
245         // Disable and reset to try recovering
246         deviceError();
247     }
248 }
249 
250 // Send Ambient and Altitude to the OCC
sendAmbient(const uint8_t inTemp,const uint16_t inAltitude)251 CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude)
252 {
253     CmdStatus status = CmdStatus::FAILURE;
254     bool ambientValid = true;
255     uint8_t ambientTemp = inTemp;
256     uint16_t altitude = inAltitude;
257 
258     if (ambientTemp == 0xFF)
259     {
260         // Get latest readings from manager
261         manager.getAmbientData(ambientValid, ambientTemp, altitude);
262         lg2::debug(
263             "sendAmbient: valid: {VALID}, Ambient: {TEMP}C, altitude: {ALT}m",
264             "VALID", ambientValid, "TEMP", ambientTemp, "ALT", altitude);
265     }
266 
267     std::vector<std::uint8_t> cmd, rsp;
268     cmd.reserve(11);
269     cmd.push_back(uint8_t(CmdType::SEND_AMBIENT));
270     cmd.push_back(0x00);                    // Data Length (2 bytes)
271     cmd.push_back(0x08);                    //
272     cmd.push_back(0x00);                    // Version
273     cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status
274     cmd.push_back(ambientTemp);             // Ambient Temperature
275     cmd.push_back(altitude >> 8);           // Altitude in meters (2 bytes)
276     cmd.push_back(altitude & 0xFF);         //
277     cmd.push_back(0x00);                    // Reserved (3 bytes)
278     cmd.push_back(0x00);
279     cmd.push_back(0x00);
280     lg2::debug("sendAmbient: SEND_AMBIENT "
281                "command to OCC{INST} ({SIZE} bytes)",
282                "INST", instance, "SIZE", cmd.size());
283     status = occCmd.send(cmd, rsp);
284     if (status == CmdStatus::SUCCESS)
285     {
286         if (rsp.size() == 5)
287         {
288             if (RspStatus::SUCCESS != RspStatus(rsp[2]))
289             {
290                 lg2::error(
291                     "sendAmbient: SEND_AMBIENT failed with rspStatus {STATUS}",
292                     "STATUS", lg2::hex, rsp[2]);
293                 dump_hex(rsp);
294                 status = CmdStatus::FAILURE;
295             }
296         }
297         else
298         {
299             lg2::error(
300                 "sendAmbient: INVALID SEND_AMBIENT response length:{SIZE}",
301                 "SIZE", rsp.size());
302             dump_hex(rsp);
303             status = CmdStatus::FAILURE;
304         }
305     }
306     else
307     {
308         lg2::error("sendAmbient: SEND_AMBIENT FAILED! with status {STATUS}",
309                    "STATUS", lg2::hex, uint8_t(status));
310 
311         if (status == CmdStatus::COMM_FAILURE)
312         {
313             // Disable due to OCC comm failure and reset to try recovering
314             deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
315         }
316     }
317 
318     return status;
319 }
320 
321 // Called when safe timer expires to determine if OCCs need to be reset
safeStateDelayExpired()322 void Status::safeStateDelayExpired()
323 {
324     if (this->occActive())
325     {
326         lg2::info(
327             "safeStateDelayExpired: OCC{INST} state missing or not valid, requesting reset",
328             "INST", instance);
329         // Disable and reset to try recovering
330         deviceError(Error::Descriptor(SAFE_ERROR_PATH));
331     }
332 }
333 
getHwmonPath()334 fs::path Status::getHwmonPath()
335 {
336     using namespace std::literals::string_literals;
337 
338     if (!fs::exists(hwmonPath))
339     {
340         static bool tracedFail[8] = {0};
341 
342         if (!hwmonPath.empty())
343         {
344             lg2::warning(
345                 "Status::getHwmonPath(): path no longer exists: {PATH}", "PATH",
346                 hwmonPath);
347             hwmonPath.clear();
348         }
349 
350         // Build the base HWMON path
351         fs::path prefixPath =
352             fs::path{OCC_HWMON_PATH + "occ-hwmon."s +
353                      std::to_string(instance + 1) + "/hwmon/"s};
354 
355         // Get the hwmonXX directory name
356         try
357         {
358             // there should only be one directory
359             const int numDirs = std::distance(
360                 fs::directory_iterator(prefixPath), fs::directory_iterator{});
361             if (numDirs == 1)
362             {
363                 hwmonPath = *fs::directory_iterator(prefixPath);
364                 tracedFail[instance] = false;
365             }
366             else
367             {
368                 if (!tracedFail[instance])
369                 {
370                     lg2::error(
371                         "Status::getHwmonPath(): Found multiple ({NUM}) hwmon paths!",
372                         "NUM", numDirs);
373                     tracedFail[instance] = true;
374                 }
375             }
376         }
377         catch (const fs::filesystem_error& e)
378         {
379             if (!tracedFail[instance])
380             {
381                 lg2::error(
382                     "Status::getHwmonPath(): error accessing {PATH}: {ERROR}",
383                     "PATH", prefixPath, "ERROR", e.what());
384                 tracedFail[instance] = true;
385             }
386         }
387     }
388 
389     return hwmonPath;
390 }
391 
392 // Called to read state and handle any errors
occReadStateNow()393 void Status::occReadStateNow()
394 {
395     unsigned int state;
396     const fs::path filename =
397         fs::path(DEV_PATH) /
398         fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
399 
400     std::ifstream file;
401     bool stateWasRead = false;
402 
403     // open file.
404     file.open(filename, std::ios::in);
405     const int openErrno = errno;
406 
407     // File is open and state can be used.
408     if (file.is_open() && file.good())
409     {
410         stateWasRead = true;
411         file >> state;
412         // Read the error code (if any) to check status of the read
413         std::ios_base::iostate readState = file.rdstate();
414         if (readState)
415         {
416             // There was a failure reading the file
417             if (lastOccReadStatus != -1)
418             {
419                 // Trace error bits
420                 std::string errorBits = "";
421                 if (readState & std::ios_base::eofbit)
422                 {
423                     errorBits += " EOF";
424                 }
425                 if (readState & std::ios_base::failbit)
426                 {
427                     errorBits += " failbit";
428                 }
429                 if (readState & std::ios_base::badbit)
430                 {
431                     errorBits += " badbit";
432                 }
433                 lg2::error(
434                     "readOccState: Failed to read OCC{INST} state: Read error on I/O operation - {ERROR}",
435                     "INST", instance, "ERROR", errorBits);
436                 lastOccReadStatus = -1;
437             }
438             stateWasRead = false;
439         }
440 
441         if (stateWasRead && (state != lastState))
442         {
443             // Trace OCC state changes
444             lg2::info(
445                 "Status::readOccState: OCC{INST} state {STATE} (lastState: {PRIOR})",
446                 "INST", instance, "STATE", lg2::hex, state, "PRIOR", lg2::hex,
447                 lastState);
448             lastState = state;
449 
450             if (OccState(state) == OccState::ACTIVE)
451             {
452                 if (pmode && device.master())
453                 {
454                     // Set the master OCC on the PowerMode object
455                     pmode->setMasterOcc(path);
456                     // Enable mode changes
457                     pmode->setMasterActive();
458 
459                     // Special processing by master OCC when it goes active
460                     occsWentActive();
461                 }
462 
463                 CmdStatus status = sendAmbient();
464                 if (status != CmdStatus::SUCCESS)
465                 {
466                     lg2::error(
467                         "readOccState: Sending Ambient failed with status {STATUS}",
468                         "STATUS", status);
469                 }
470             }
471 
472             // If OCC in known Good State.
473             if ((OccState(state) == OccState::ACTIVE) ||
474                 (OccState(state) == OccState::CHARACTERIZATION) ||
475                 (OccState(state) == OccState::OBSERVATION))
476             {
477                 // Good OCC State then sensors valid again
478                 stateValid = true;
479                 sensorsValid = true;
480 
481                 if (safeStateDelayTimer.isEnabled())
482                 {
483                     // stop safe delay timer (no longer in SAFE state)
484                     safeStateDelayTimer.setEnabled(false);
485                 }
486             }
487             else
488             {
489                 // OCC is in SAFE or some other unsupported state
490                 if (!safeStateDelayTimer.isEnabled())
491                 {
492                     lg2::error(
493                         "readOccState: Invalid OCC{INST} state of {STATE} (last state: {PRIOR}), starting safe state delay timer",
494                         "INST", instance, "STATE", lg2::hex, state, "PRIOR",
495                         lg2::hex, lastState);
496                     // start safe delay timer (before requesting reset)
497                     using namespace std::literals::chrono_literals;
498                     safeStateDelayTimer.restartOnce(60s);
499                 }
500 
501                 if (sensorsValid)
502                 {
503                     sensorsValid = false;
504                     // Sensors not supported (update to NaN and not functional)
505                     manager.setSensorValueToNaN(instance);
506                 }
507             }
508         }
509     }
510     else
511     {
512         // Unable to read state
513         stateValid = false;
514     }
515 
516     file.close();
517 
518     // if failed to read the OCC state -> Attempt retry
519     if (!stateWasRead)
520     {
521         if (sensorsValid)
522         {
523             sensorsValid = false;
524             manager.setSensorValueToNaN(instance);
525         }
526 
527         // If not able to read, OCC may be offline
528         if (openErrno != lastOccReadStatus)
529         {
530             lg2::error(
531                 "Status::readOccState: open/read failed trying to read OCC{INST} state (open errno={ERROR})",
532                 "INST", instance, "ERROR", openErrno);
533             lastOccReadStatus = openErrno;
534         }
535 
536         // See occReadRetries for number of retry attempts.
537         if (currentOccReadRetriesCount > 0)
538         {
539             --currentOccReadRetriesCount;
540         }
541         else
542         {
543             lg2::error(
544                 "readOccState: failed to read OCC{INST} state! (last state: {PRIOR})",
545                 "INST", instance, "PRIOR", lg2::hex, lastState);
546 
547             // State could not be determined, set it to NO State.
548             lastState = 0;
549 
550             // Disable the ability to send Failed actions until OCC is
551             // Active again.
552             stateValid = false;
553 
554             // Disable due to OCC comm failure and reset to try recovering
555             // (processor callout will be added)
556             deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH, ECOMM,
557                                           procPath.c_str(), true));
558 
559             // Reset retry count (for next attempt after recovery)
560             currentOccReadRetriesCount = occReadRetries;
561         }
562     }
563     else if (lastOccReadStatus != 0)
564     {
565         lg2::info("readOccState: successfully read OCC{INST} state: {STATE}",
566                   "INST", instance, "STATE", state);
567         lastOccReadStatus = 0; // no error
568     }
569 }
570 
571 // Update processor throttle status on dbus
updateThrottle(const bool isThrottled,const uint8_t newReason)572 void Status::updateThrottle(const bool isThrottled, const uint8_t newReason)
573 {
574     if (!throttleHandle)
575     {
576         return;
577     }
578 
579     uint8_t newThrottleCause = throttleCause;
580 
581     if (isThrottled) // throttled due to newReason
582     {
583         if ((newReason & throttleCause) == 0)
584         {
585             // set the bit(s) for passed in reason
586             newThrottleCause |= newReason;
587         }
588         // else no change
589     }
590     else // no longer throttled due to newReason
591     {
592         if ((newReason & throttleCause) != 0)
593         {
594             // clear the bit(s) for passed in reason
595             newThrottleCause &= ~newReason;
596         }
597         // else no change
598     }
599 
600     if (newThrottleCause != throttleCause)
601     {
602         if (newThrottleCause == THROTTLED_NONE)
603         {
604             lg2::debug(
605                 "updateThrottle: OCC{INST} no longer throttled (prior reason: {REASON})",
606                 "INST", instance, "REASON", throttleCause);
607             throttleCause = THROTTLED_NONE;
608             throttleHandle->throttled(false);
609             throttleHandle->throttleCauses({});
610         }
611         else
612         {
613             lg2::debug(
614                 "updateThrottle: OCC{INST} is throttled with reason {REASON} (prior reason: {PRIOR})",
615                 "INST", instance, "REASON", newThrottleCause, "PRIOR",
616                 throttleCause);
617             throttleCause = newThrottleCause;
618 
619             std::vector<ThrottleObj::ThrottleReasons> updatedCauses;
620             if (throttleCause & THROTTLED_POWER)
621             {
622                 updatedCauses.push_back(ThrottleReason::PowerLimit);
623             }
624             if (throttleCause & THROTTLED_THERMAL)
625             {
626                 updatedCauses.push_back(ThrottleReason::ThermalLimit);
627             }
628             if (throttleCause & THROTTLED_SAFE)
629             {
630                 updatedCauses.push_back(
631                     ThrottleReason::ManagementDetectedFault);
632             }
633             throttleHandle->throttleCauses(updatedCauses);
634             throttleHandle->throttled(true);
635         }
636     }
637     // else no change to throttle status
638 }
639 
640 // Get processor path associated with this OCC
readProcAssociation()641 void Status::readProcAssociation()
642 {
643     std::string managingPath = path + "/power_managing";
644     lg2::debug("readProcAssociation: getting endpoints for {MANAGE} ({PATH})",
645                "MANAGE", managingPath, "PATH", path);
646     try
647     {
648         utils::PropertyValue procPathProperty{};
649         procPathProperty = utils::getProperty(
650             managingPath, "xyz.openbmc_project.Association", "endpoints");
651         auto result = std::get<std::vector<std::string>>(procPathProperty);
652         if (result.size() > 0)
653         {
654             procPath = result[0];
655             lg2::info("readProcAssociation: OCC{INST} has proc={PATH}", "INST",
656                       instance, "PATH", procPath);
657         }
658         else
659         {
660             lg2::error(
661                 "readProcAssociation: No processor associated with OCC{INST} / {PATH}",
662                 "INST", instance, "PATH", path);
663         }
664     }
665     catch (const sdbusplus::exception_t& e)
666     {
667         lg2::error(
668             "readProcAssociation: Unable to get proc assocated with {PATH} - {ERROR}",
669             "PATH", path, "ERROR", e.what());
670         procPath = {};
671     }
672 }
673 
674 } // namespace occ
675 } // namespace open_power
676