xref: /openbmc/openpower-occ-control/occ_status.cpp (revision f788150656a182cb75c0b72af9f84530531773ef)
1 #include "occ_status.hpp"
2 
3 #include "occ_manager.hpp"
4 #include "occ_sensor.hpp"
5 #include "powermode.hpp"
6 #include "utils.hpp"
7 
8 #include <phosphor-logging/lg2.hpp>
9 
10 #include <filesystem>
11 
12 namespace open_power
13 {
14 namespace occ
15 {
16 
17 using namespace phosphor::logging;
18 
19 using ThrottleObj =
20     sdbusplus::xyz::openbmc_project::Control::Power::server::Throttle;
21 
22 // Handles updates to occActive property
occActive(bool value)23 bool Status::occActive(bool value)
24 {
25     if (value != this->occActive())
26     {
27         lg2::info("Status::occActive OCC{INST} changed to {STATE}", "INST",
28                   instance, "STATE", value);
29         if (value)
30         {
31             // OCC is active
32             // Clear prior throttle reason (before setting device active)
33             updateThrottle(false, THROTTLED_ALL);
34 
35             // Set the device active
36             device.setActive(true);
37 
38             // Reset last OCC state
39             lastState = 0;
40 
41             // Start watching for errors (throttles, etc)
42             try
43             {
44                 addErrorWatch();
45             }
46             catch (const OpenFailure& e)
47             {
48                 // Failed to add watch for throttle events, request reset to try
49                 // to recover comm
50                 lg2::error(
51                     "Status::occActive: Unable to add error watch(s) for OCC{INST} watch: {ERROR}",
52                     "INST", instance, "ERROR", e.what());
53                 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
54                 return Base::Status::occActive(false);
55             }
56 
57             // Update the OCC active sensor
58             Base::Status::occActive(value);
59 
60             if (device.master())
61             {
62                 // Update powercap bounds from OCC
63                 manager.updatePcapBounds();
64             }
65 
66             // Call into Manager to let know that we have bound
67             if (this->managerCallBack)
68             {
69                 this->managerCallBack(instance, value);
70             }
71         }
72         else
73         {
74             // OCC is no longer active
75 #ifdef POWER10
76             if (sensorsValid)
77             {
78                 sensorsValid = false;
79                 // Sensors not supported (update to NaN and not functional)
80                 manager.setSensorValueToNaN(instance);
81             }
82 
83             if (pmode && device.master())
84             {
85                 // Prevent mode changes
86                 pmode->setMasterActive(false);
87             }
88             if (safeStateDelayTimer.isEnabled())
89             {
90                 // stop safe delay timer
91                 safeStateDelayTimer.setEnabled(false);
92             }
93 #endif
94             // Call into Manager to let know that we will unbind.
95             if (this->managerCallBack)
96             {
97                 this->managerCallBack(instance, value);
98             }
99 
100             // Stop watching for errors
101             removeErrorWatch();
102 
103             // Set the device inactive
104             device.setActive(false);
105 
106             // Clear throttles (OCC not active after disabling device)
107             updateThrottle(false, THROTTLED_ALL);
108         }
109     }
110     else if (value && !device.active())
111     {
112         // Existing error watch is on a dead file descriptor.
113         removeErrorWatch();
114 
115         /*
116          * In it's constructor, Status checks Device::bound() to see if OCC is
117          * active or not.
118          * Device::bound() checks for occX-dev0 directory.
119          * We will lose occX-dev0 directories during FSI rescan.
120          * So, if we start this application (and construct Status), and then
121          * later do FSI rescan, we will end up with occActive = true and device
122          * NOT bound. Lets correct that situation here.
123          */
124         device.setActive(true);
125 
126         // Add error watch again
127         try
128         {
129             addErrorWatch();
130         }
131         catch (const OpenFailure& e)
132         {
133             // Failed to add watch for throttle events, request reset to try to
134             // recover comm
135             lg2::error(
136                 "Status::occActive: Unable to add error watch(s) again for OCC{INST} watch: {ERROR}",
137                 "INST", instance, "ERROR", e.what());
138             deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
139             return Base::Status::occActive(false);
140         }
141     }
142     else if (!value && device.active())
143     {
144         removeErrorWatch();
145 
146         // In the event that the application never receives the active signal
147         // even though the OCC is active (this can occur if the BMC is rebooted
148         // with the host on, since the initial OCC driver probe will discover
149         // the OCCs), this application needs to be able to unbind the device
150         // when we get the OCC inactive signal.
151         device.setActive(false);
152     }
153     return Base::Status::occActive(value);
154 }
155 
156 // Callback handler when a device error is reported.
deviceError(Error::Descriptor d)157 void Status::deviceError(Error::Descriptor d)
158 {
159 #ifdef POWER10
160     if (pmode && device.master())
161     {
162         // Prevent mode changes
163         pmode->setMasterActive(false);
164     }
165 #endif
166 
167     if (d.log)
168     {
169         FFDC::createOCCResetPEL(instance, d.path, d.err, d.callout,
170                                 d.isInventoryCallout);
171     }
172 
173     // This would deem OCC inactive
174     this->occActive(false);
175 
176     // Reset the OCC
177     this->resetOCC();
178 }
179 
180 // Sends message to host control command handler to reset OCC
resetOCC()181 void Status::resetOCC()
182 {
183     lg2::info(">>Status::resetOCC() - requesting reset for OCC{INST}", "INST",
184               instance);
185     this->occActive(false);
186 #ifdef PLDM
187     if (resetCallBack)
188     {
189         this->resetCallBack(instance);
190     }
191 #else
192     constexpr auto CONTROL_HOST_PATH = "/org/open_power/control/host0";
193     constexpr auto CONTROL_HOST_INTF = "org.open_power.Control.Host";
194 
195     // This will throw exception on failure
196     auto service = utils::getService(CONTROL_HOST_PATH, CONTROL_HOST_INTF);
197 
198     auto& bus = utils::getBus();
199     auto method = bus.new_method_call(service.c_str(), CONTROL_HOST_PATH,
200                                       CONTROL_HOST_INTF, "Execute");
201     // OCC Reset control command
202     method.append(convertForMessage(Control::Host::Command::OCCReset).c_str());
203 
204     // OCC Sensor ID for callout reasons
205     method.append(std::variant<uint8_t>(std::get<0>(sensorMap.at(instance))));
206     bus.call_noreply(method);
207     return;
208 #endif
209 }
210 
211 // Handler called by Host control command handler to convey the
212 // status of the executed command
hostControlEvent(sdbusplus::message_t & msg)213 void Status::hostControlEvent(sdbusplus::message_t& msg)
214 {
215     std::string cmdCompleted{};
216     std::string cmdStatus{};
217 
218     msg.read(cmdCompleted, cmdStatus);
219 
220     lg2::debug("Host control signal values: command={CMD}, status={STATUS}",
221                "CMD", cmdCompleted, "STATUS", cmdStatus);
222 
223     if (Control::Host::convertResultFromString(cmdStatus) !=
224         Control::Host::Result::Success)
225     {
226         if (Control::Host::convertCommandFromString(cmdCompleted) ==
227             Control::Host::Command::OCCReset)
228         {
229             // Must be a Timeout. Log an Error trace
230             lg2::error(
231                 "Error resetting the OCC: path={PATH}, sensorid={SENSOR}",
232                 "PATH", path, "SENSOR", std::get<0>(sensorMap.at(instance)));
233         }
234     }
235     return;
236 }
237 
238 // Called from Manager::pollerTimerExpired() in preperation to POLL OCC.
readOccState()239 void Status::readOccState()
240 {
241     if (stateValid)
242     {
243         // Reset retry count (since state is good)
244         currentOccReadRetriesCount = occReadRetries;
245     }
246     occReadStateNow();
247 }
248 
249 #ifdef POWER10
250 // Special processing that needs to happen once the OCCs change to ACTIVE state
occsWentActive()251 void Status::occsWentActive()
252 {
253     CmdStatus status = CmdStatus::SUCCESS;
254 
255     // IPS data will get sent automatically after a mode change if the mode
256     // supports it.
257     pmode->needToSendIPS();
258 
259     status = pmode->sendModeChange();
260     if (status != CmdStatus::SUCCESS)
261     {
262         lg2::error(
263             "Status::occsWentActive: OCC mode change failed with status {STATUS}",
264             "STATUS", status);
265 
266         // Disable and reset to try recovering
267         deviceError();
268     }
269 }
270 
271 // Send Ambient and Altitude to the OCC
sendAmbient(const uint8_t inTemp,const uint16_t inAltitude)272 CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude)
273 {
274     CmdStatus status = CmdStatus::FAILURE;
275     bool ambientValid = true;
276     uint8_t ambientTemp = inTemp;
277     uint16_t altitude = inAltitude;
278 
279     if (ambientTemp == 0xFF)
280     {
281         // Get latest readings from manager
282         manager.getAmbientData(ambientValid, ambientTemp, altitude);
283         lg2::debug(
284             "sendAmbient: valid: {VALID}, Ambient: {TEMP}C, altitude: {ALT}m",
285             "VALID", ambientValid, "TEMP", ambientTemp, "ALT", altitude);
286     }
287 
288     std::vector<std::uint8_t> cmd, rsp;
289     cmd.reserve(11);
290     cmd.push_back(uint8_t(CmdType::SEND_AMBIENT));
291     cmd.push_back(0x00);                    // Data Length (2 bytes)
292     cmd.push_back(0x08);                    //
293     cmd.push_back(0x00);                    // Version
294     cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status
295     cmd.push_back(ambientTemp);             // Ambient Temperature
296     cmd.push_back(altitude >> 8);           // Altitude in meters (2 bytes)
297     cmd.push_back(altitude & 0xFF);         //
298     cmd.push_back(0x00);                    // Reserved (3 bytes)
299     cmd.push_back(0x00);
300     cmd.push_back(0x00);
301     lg2::debug("sendAmbient: SEND_AMBIENT "
302                "command to OCC{INST} ({SIZE} bytes)",
303                "INST", instance, "SIZE", cmd.size());
304     status = occCmd.send(cmd, rsp);
305     if (status == CmdStatus::SUCCESS)
306     {
307         if (rsp.size() == 5)
308         {
309             if (RspStatus::SUCCESS != RspStatus(rsp[2]))
310             {
311                 lg2::error(
312                     "sendAmbient: SEND_AMBIENT failed with rspStatus {STATUS}",
313                     "STATUS", lg2::hex, rsp[2]);
314                 dump_hex(rsp);
315                 status = CmdStatus::FAILURE;
316             }
317         }
318         else
319         {
320             lg2::error(
321                 "sendAmbient: INVALID SEND_AMBIENT response length:{SIZE}",
322                 "SIZE", rsp.size());
323             dump_hex(rsp);
324             status = CmdStatus::FAILURE;
325         }
326     }
327     else
328     {
329         lg2::error("sendAmbient: SEND_AMBIENT FAILED! with status {STATUS}",
330                    "STATUS", lg2::hex, uint8_t(status));
331 
332         if (status == CmdStatus::COMM_FAILURE)
333         {
334             // Disable due to OCC comm failure and reset to try recovering
335             deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
336         }
337     }
338 
339     return status;
340 }
341 
342 // Called when safe timer expires to determine if OCCs need to be reset
safeStateDelayExpired()343 void Status::safeStateDelayExpired()
344 {
345     if (this->occActive())
346     {
347         lg2::info(
348             "safeStateDelayExpired: OCC{INST} state missing or not valid, requesting reset",
349             "INST", instance);
350         // Disable and reset to try recovering
351         deviceError(Error::Descriptor(SAFE_ERROR_PATH));
352     }
353 }
354 #endif // POWER10
355 
getHwmonPath()356 fs::path Status::getHwmonPath()
357 {
358     using namespace std::literals::string_literals;
359 
360     if (!fs::exists(hwmonPath))
361     {
362         static bool tracedFail[8] = {0};
363 
364         if (!hwmonPath.empty())
365         {
366             lg2::warning(
367                 "Status::getHwmonPath(): path no longer exists: {PATH}", "PATH",
368                 hwmonPath);
369             hwmonPath.clear();
370         }
371 
372         // Build the base HWMON path
373         fs::path prefixPath =
374             fs::path{OCC_HWMON_PATH + "occ-hwmon."s +
375                      std::to_string(instance + 1) + "/hwmon/"s};
376 
377         // Get the hwmonXX directory name
378         try
379         {
380             // there should only be one directory
381             const int numDirs = std::distance(
382                 fs::directory_iterator(prefixPath), fs::directory_iterator{});
383             if (numDirs == 1)
384             {
385                 hwmonPath = *fs::directory_iterator(prefixPath);
386                 tracedFail[instance] = false;
387             }
388             else
389             {
390                 if (!tracedFail[instance])
391                 {
392                     lg2::error(
393                         "Status::getHwmonPath(): Found multiple ({NUM}) hwmon paths!",
394                         "NUM", numDirs);
395                     tracedFail[instance] = true;
396                 }
397             }
398         }
399         catch (const fs::filesystem_error& e)
400         {
401             if (!tracedFail[instance])
402             {
403                 lg2::error(
404                     "Status::getHwmonPath(): error accessing {PATH}: {ERROR}",
405                     "PATH", prefixPath, "ERROR", e.what());
406                 tracedFail[instance] = true;
407             }
408         }
409     }
410 
411     return hwmonPath;
412 }
413 
414 // Called to read state and handle any errors
occReadStateNow()415 void Status::occReadStateNow()
416 {
417     unsigned int state;
418     const fs::path filename =
419         fs::path(DEV_PATH) /
420         fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
421 
422     std::ifstream file;
423     bool stateWasRead = false;
424 
425     // open file.
426     file.open(filename, std::ios::in);
427     const int openErrno = errno;
428 
429     // File is open and state can be used.
430     if (file.is_open() && file.good())
431     {
432         stateWasRead = true;
433         file >> state;
434         // Read the error code (if any) to check status of the read
435         std::ios_base::iostate readState = file.rdstate();
436         if (readState)
437         {
438             // There was a failure reading the file
439             if (lastOccReadStatus != -1)
440             {
441                 // Trace error bits
442                 std::string errorBits = "";
443                 if (readState & std::ios_base::eofbit)
444                 {
445                     errorBits += " EOF";
446                 }
447                 if (readState & std::ios_base::failbit)
448                 {
449                     errorBits += " failbit";
450                 }
451                 if (readState & std::ios_base::badbit)
452                 {
453                     errorBits += " badbit";
454                 }
455                 lg2::error(
456                     "readOccState: Failed to read OCC{INST} state: Read error on I/O operation - {ERROR}",
457                     "INST", instance, "ERROR", errorBits);
458                 lastOccReadStatus = -1;
459             }
460             stateWasRead = false;
461         }
462 
463         if (stateWasRead && (state != lastState))
464         {
465             // Trace OCC state changes
466             lg2::info(
467                 "Status::readOccState: OCC{INST} state {STATE} (lastState: {PRIOR})",
468                 "INST", instance, "STATE", lg2::hex, state, "PRIOR", lg2::hex,
469                 lastState);
470             lastState = state;
471 #ifdef POWER10
472             if (OccState(state) == OccState::ACTIVE)
473             {
474                 if (pmode && device.master())
475                 {
476                     // Set the master OCC on the PowerMode object
477                     pmode->setMasterOcc(path);
478                     // Enable mode changes
479                     pmode->setMasterActive();
480 
481                     // Special processing by master OCC when it goes active
482                     occsWentActive();
483                 }
484 
485                 CmdStatus status = sendAmbient();
486                 if (status != CmdStatus::SUCCESS)
487                 {
488                     lg2::error(
489                         "readOccState: Sending Ambient failed with status {STATUS}",
490                         "STATUS", status);
491                 }
492             }
493 
494             // If OCC in known Good State.
495             if ((OccState(state) == OccState::ACTIVE) ||
496                 (OccState(state) == OccState::CHARACTERIZATION) ||
497                 (OccState(state) == OccState::OBSERVATION))
498             {
499                 // Good OCC State then sensors valid again
500                 stateValid = true;
501                 sensorsValid = true;
502 
503                 if (safeStateDelayTimer.isEnabled())
504                 {
505                     // stop safe delay timer (no longer in SAFE state)
506                     safeStateDelayTimer.setEnabled(false);
507                 }
508             }
509             else
510             {
511                 // OCC is in SAFE or some other unsupported state
512                 if (!safeStateDelayTimer.isEnabled())
513                 {
514                     lg2::error(
515                         "readOccState: Invalid OCC{INST} state of {STATE} (last state: {PRIOR}), starting safe state delay timer",
516                         "INST", instance, "STATE", lg2::hex, state, "PRIOR",
517                         lg2::hex, lastState);
518                     // start safe delay timer (before requesting reset)
519                     using namespace std::literals::chrono_literals;
520                     safeStateDelayTimer.restartOnce(60s);
521                 }
522 
523                 if (sensorsValid)
524                 {
525                     sensorsValid = false;
526                     // Sensors not supported (update to NaN and not functional)
527                     manager.setSensorValueToNaN(instance);
528                 }
529             }
530 #else
531             // Before P10 state not checked, only used good file open.
532             stateValid = true;
533 #endif
534         }
535     }
536 #ifdef POWER10
537     else
538     {
539         // Unable to read state
540         stateValid = false;
541     }
542 #endif
543     file.close();
544 
545     // if failed to read the OCC state -> Attempt retry
546     if (!stateWasRead)
547     {
548 #ifdef READ_OCC_SENSORS
549         if (sensorsValid)
550         {
551             sensorsValid = false;
552             manager.setSensorValueToNaN(instance);
553         }
554 #endif
555 
556         // If not able to read, OCC may be offline
557         if (openErrno != lastOccReadStatus)
558         {
559             lg2::error(
560                 "Status::readOccState: open/read failed trying to read OCC{INST} state (open errno={ERROR})",
561                 "INST", instance, "ERROR", openErrno);
562             lastOccReadStatus = openErrno;
563         }
564 
565         // See occReadRetries for number of retry attempts.
566         if (currentOccReadRetriesCount > 0)
567         {
568             --currentOccReadRetriesCount;
569         }
570         else
571         {
572             lg2::error(
573                 "readOccState: failed to read OCC{INST} state! (last state: {PRIOR})",
574                 "INST", instance, "PRIOR", lg2::hex, lastState);
575 
576             // State could not be determined, set it to NO State.
577             lastState = 0;
578 
579             // Disable the ability to send Failed actions until OCC is
580             // Active again.
581             stateValid = false;
582 
583             // Disable due to OCC comm failure and reset to try recovering
584             // (processor callout will be added)
585             deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH, ECOMM,
586                                           procPath.c_str(), true));
587 
588             // Reset retry count (for next attempt after recovery)
589             currentOccReadRetriesCount = occReadRetries;
590         }
591     }
592     else if (lastOccReadStatus != 0)
593     {
594         lg2::info("readOccState: successfully read OCC{INST} state: {STATE}",
595                   "INST", instance, "STATE", state);
596         lastOccReadStatus = 0; // no error
597     }
598 }
599 
600 // Update processor throttle status on dbus
updateThrottle(const bool isThrottled,const uint8_t newReason)601 void Status::updateThrottle(const bool isThrottled, const uint8_t newReason)
602 {
603     if (!throttleHandle)
604     {
605         return;
606     }
607 
608     uint8_t newThrottleCause = throttleCause;
609 
610     if (isThrottled) // throttled due to newReason
611     {
612         if ((newReason & throttleCause) == 0)
613         {
614             // set the bit(s) for passed in reason
615             newThrottleCause |= newReason;
616         }
617         // else no change
618     }
619     else // no longer throttled due to newReason
620     {
621         if ((newReason & throttleCause) != 0)
622         {
623             // clear the bit(s) for passed in reason
624             newThrottleCause &= ~newReason;
625         }
626         // else no change
627     }
628 
629     if (newThrottleCause != throttleCause)
630     {
631         if (newThrottleCause == THROTTLED_NONE)
632         {
633             lg2::debug(
634                 "updateThrottle: OCC{INST} no longer throttled (prior reason: {REASON})",
635                 "INST", instance, "REASON", throttleCause);
636             throttleCause = THROTTLED_NONE;
637             throttleHandle->throttled(false);
638             throttleHandle->throttleCauses({});
639         }
640         else
641         {
642             lg2::debug(
643                 "updateThrottle: OCC{INST} is throttled with reason {REASON} (prior reason: {PRIOR})",
644                 "INST", instance, "REASON", newThrottleCause, "PRIOR",
645                 throttleCause);
646             throttleCause = newThrottleCause;
647 
648             std::vector<ThrottleObj::ThrottleReasons> updatedCauses;
649             if (throttleCause & THROTTLED_POWER)
650             {
651                 updatedCauses.push_back(
652                     throttleHandle->ThrottleReasons::PowerLimit);
653             }
654             if (throttleCause & THROTTLED_THERMAL)
655             {
656                 updatedCauses.push_back(
657                     throttleHandle->ThrottleReasons::ThermalLimit);
658             }
659             if (throttleCause & THROTTLED_SAFE)
660             {
661                 updatedCauses.push_back(
662                     throttleHandle->ThrottleReasons::ManagementDetectedFault);
663             }
664             throttleHandle->throttleCauses(updatedCauses);
665             throttleHandle->throttled(true);
666         }
667     }
668     // else no change to throttle status
669 }
670 
671 // Get processor path associated with this OCC
readProcAssociation()672 void Status::readProcAssociation()
673 {
674     std::string managingPath = path + "/power_managing";
675     lg2::debug("readProcAssociation: getting endpoints for {MANAGE} ({PATH})",
676                "MANAGE", managingPath, "PATH", path);
677     try
678     {
679         utils::PropertyValue procPathProperty{};
680         procPathProperty = utils::getProperty(
681             managingPath, "xyz.openbmc_project.Association", "endpoints");
682         auto result = std::get<std::vector<std::string>>(procPathProperty);
683         if (result.size() > 0)
684         {
685             procPath = result[0];
686             lg2::info("readProcAssociation: OCC{INST} has proc={PATH}", "INST",
687                       instance, "PATH", procPath);
688         }
689         else
690         {
691             lg2::error(
692                 "readProcAssociation: No processor associated with OCC{INST} / {PATH}",
693                 "INST", instance, "PATH", path);
694         }
695     }
696     catch (const sdbusplus::exception_t& e)
697     {
698         lg2::error(
699             "readProcAssociation: Unable to get proc assocated with {PATH} - {ERROR}",
700             "PATH", path, "ERROR", e.what());
701         procPath = {};
702     }
703 }
704 
705 } // namespace occ
706 } // namespace open_power
707