xref: /openbmc/openpower-occ-control/occ_status.cpp (revision 3ece5b99026970e1cf9bc7e797aff4c1be81b2f3)
1 #include "occ_status.hpp"
2 
3 #include "occ_manager.hpp"
4 #include "occ_sensor.hpp"
5 #include "powermode.hpp"
6 #include "utils.hpp"
7 
8 #include <phosphor-logging/lg2.hpp>
9 
10 #include <filesystem>
11 
12 namespace open_power
13 {
14 namespace occ
15 {
16 
17 using namespace phosphor::logging;
18 
19 using ThrottleObj =
20     sdbusplus::xyz::openbmc_project::Control::Power::server::Throttle;
21 
22 // Handles updates to occActive property
occActive(bool value)23 bool Status::occActive(bool value)
24 {
25     if (value != this->occActive())
26     {
27         lg2::info("Status::occActive OCC{INST} changed to {STATE}", "INST",
28                   instance, "STATE", value);
29         if (value)
30         {
31             // Clear prior throttle reason (before setting device active)
32             updateThrottle(false, THROTTLED_ALL);
33 
34             // Set the device active
35             device.setActive(true);
36 
37             // Reset last OCC state
38             lastState = 0;
39 
40             // Start watching for errors (throttles, etc)
41             try
42             {
43                 addErrorWatch();
44             }
45             catch (const OpenFailure& e)
46             {
47                 // Failed to add watch for throttle events, request reset to try
48                 // to recover comm
49                 lg2::error(
50                     "Status::occActive: Unable to add error watch(s) for OCC{INST} watch: {ERROR}",
51                     "INST", instance, "ERROR", e.what());
52                 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
53                 return Base::Status::occActive(false);
54             }
55 
56             // Update the OCC active sensor
57             Base::Status::occActive(value);
58 
59             if (device.master())
60             {
61                 // Update powercap bounds from OCC
62                 manager.updatePcapBounds();
63             }
64 
65             // Call into Manager to let know that we have bound
66             if (this->managerCallBack)
67             {
68                 this->managerCallBack(instance, value);
69             }
70         }
71         else
72         {
73 #ifdef POWER10
74             if (pmode && device.master())
75             {
76                 // Prevent mode changes
77                 pmode->setMasterActive(false);
78             }
79             if (safeStateDelayTimer.isEnabled())
80             {
81                 // stop safe delay timer
82                 safeStateDelayTimer.setEnabled(false);
83             }
84 #endif
85             // Call into Manager to let know that we will unbind.
86             if (this->managerCallBack)
87             {
88                 this->managerCallBack(instance, value);
89             }
90 
91             // Stop watching for errors
92             removeErrorWatch();
93 
94             // Set the device inactive
95             device.setActive(false);
96 
97             // Clear throttles (OCC not active after disabling device)
98             updateThrottle(false, THROTTLED_ALL);
99         }
100     }
101     else if (value && !device.active())
102     {
103         // Existing error watch is on a dead file descriptor.
104         removeErrorWatch();
105 
106         /*
107          * In it's constructor, Status checks Device::bound() to see if OCC is
108          * active or not.
109          * Device::bound() checks for occX-dev0 directory.
110          * We will lose occX-dev0 directories during FSI rescan.
111          * So, if we start this application (and construct Status), and then
112          * later do FSI rescan, we will end up with occActive = true and device
113          * NOT bound. Lets correct that situation here.
114          */
115         device.setActive(true);
116 
117         // Add error watch again
118         try
119         {
120             addErrorWatch();
121         }
122         catch (const OpenFailure& e)
123         {
124             // Failed to add watch for throttle events, request reset to try to
125             // recover comm
126             lg2::error(
127                 "Status::occActive: Unable to add error watch(s) again for OCC{INST} watch: {ERROR}",
128                 "INST", instance, "ERROR", e.what());
129             deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
130             return Base::Status::occActive(false);
131         }
132     }
133     else if (!value && device.active())
134     {
135         removeErrorWatch();
136 
137         // In the event that the application never receives the active signal
138         // even though the OCC is active (this can occur if the BMC is rebooted
139         // with the host on, since the initial OCC driver probe will discover
140         // the OCCs), this application needs to be able to unbind the device
141         // when we get the OCC inactive signal.
142         device.setActive(false);
143     }
144     return Base::Status::occActive(value);
145 }
146 
147 // Callback handler when a device error is reported.
deviceError(Error::Descriptor d)148 void Status::deviceError(Error::Descriptor d)
149 {
150 #ifdef POWER10
151     if (pmode && device.master())
152     {
153         // Prevent mode changes
154         pmode->setMasterActive(false);
155     }
156 #endif
157 
158     if (d.log)
159     {
160         FFDC::createOCCResetPEL(instance, d.path, d.err, d.callout,
161                                 d.isInventoryCallout);
162     }
163 
164     // This would deem OCC inactive
165     this->occActive(false);
166 
167     // Reset the OCC
168     this->resetOCC();
169 }
170 
171 // Sends message to host control command handler to reset OCC
resetOCC()172 void Status::resetOCC()
173 {
174     lg2::info(">>Status::resetOCC() - requesting reset for OCC{INST}", "INST",
175               instance);
176     this->occActive(false);
177 #ifdef PLDM
178     if (resetCallBack)
179     {
180         this->resetCallBack(instance);
181     }
182 #else
183     constexpr auto CONTROL_HOST_PATH = "/org/open_power/control/host0";
184     constexpr auto CONTROL_HOST_INTF = "org.open_power.Control.Host";
185 
186     // This will throw exception on failure
187     auto service = utils::getService(CONTROL_HOST_PATH, CONTROL_HOST_INTF);
188 
189     auto& bus = utils::getBus();
190     auto method = bus.new_method_call(service.c_str(), CONTROL_HOST_PATH,
191                                       CONTROL_HOST_INTF, "Execute");
192     // OCC Reset control command
193     method.append(convertForMessage(Control::Host::Command::OCCReset).c_str());
194 
195     // OCC Sensor ID for callout reasons
196     method.append(std::variant<uint8_t>(std::get<0>(sensorMap.at(instance))));
197     bus.call_noreply(method);
198     return;
199 #endif
200 }
201 
202 // Handler called by Host control command handler to convey the
203 // status of the executed command
hostControlEvent(sdbusplus::message_t & msg)204 void Status::hostControlEvent(sdbusplus::message_t& msg)
205 {
206     std::string cmdCompleted{};
207     std::string cmdStatus{};
208 
209     msg.read(cmdCompleted, cmdStatus);
210 
211     lg2::debug("Host control signal values: command={CMD}, status={STATUS}",
212                "CMD", cmdCompleted, "STATUS", cmdStatus);
213 
214     if (Control::Host::convertResultFromString(cmdStatus) !=
215         Control::Host::Result::Success)
216     {
217         if (Control::Host::convertCommandFromString(cmdCompleted) ==
218             Control::Host::Command::OCCReset)
219         {
220             // Must be a Timeout. Log an Error trace
221             lg2::error(
222                 "Error resetting the OCC: path={PATH}, sensorid={SENSOR}",
223                 "PATH", path, "SENSOR", std::get<0>(sensorMap.at(instance)));
224         }
225     }
226     return;
227 }
228 
229 // Called from Manager::pollerTimerExpired() in preperation to POLL OCC.
readOccState()230 void Status::readOccState()
231 {
232     if (stateValid)
233     {
234         // Reset retry count (since state is good)
235         currentOccReadRetriesCount = occReadRetries;
236     }
237     occReadStateNow();
238 }
239 
240 #ifdef POWER10
241 // Special processing that needs to happen once the OCCs change to ACTIVE state
occsWentActive()242 void Status::occsWentActive()
243 {
244     CmdStatus status = CmdStatus::SUCCESS;
245 
246     // IPS data will get sent automatically after a mode change if the mode
247     // supports it.
248     pmode->needToSendIPS();
249 
250     status = pmode->sendModeChange();
251     if (status != CmdStatus::SUCCESS)
252     {
253         lg2::error(
254             "Status::occsWentActive: OCC mode change failed with status {STATUS}",
255             "STATUS", status);
256 
257         // Disable and reset to try recovering
258         deviceError();
259     }
260 }
261 
262 // Send Ambient and Altitude to the OCC
sendAmbient(const uint8_t inTemp,const uint16_t inAltitude)263 CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude)
264 {
265     CmdStatus status = CmdStatus::FAILURE;
266     bool ambientValid = true;
267     uint8_t ambientTemp = inTemp;
268     uint16_t altitude = inAltitude;
269 
270     if (ambientTemp == 0xFF)
271     {
272         // Get latest readings from manager
273         manager.getAmbientData(ambientValid, ambientTemp, altitude);
274         lg2::debug(
275             "sendAmbient: valid: {VALID}, Ambient: {TEMP}C, altitude: {ALT}m",
276             "VALID", ambientValid, "TEMP", ambientTemp, "ALT", altitude);
277     }
278 
279     std::vector<std::uint8_t> cmd, rsp;
280     cmd.reserve(11);
281     cmd.push_back(uint8_t(CmdType::SEND_AMBIENT));
282     cmd.push_back(0x00);                    // Data Length (2 bytes)
283     cmd.push_back(0x08);                    //
284     cmd.push_back(0x00);                    // Version
285     cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status
286     cmd.push_back(ambientTemp);             // Ambient Temperature
287     cmd.push_back(altitude >> 8);           // Altitude in meters (2 bytes)
288     cmd.push_back(altitude & 0xFF);         //
289     cmd.push_back(0x00);                    // Reserved (3 bytes)
290     cmd.push_back(0x00);
291     cmd.push_back(0x00);
292     lg2::debug("sendAmbient: SEND_AMBIENT "
293                "command to OCC{INST} ({SIZE} bytes)",
294                "INST", instance, "SIZE", cmd.size());
295     status = occCmd.send(cmd, rsp);
296     if (status == CmdStatus::SUCCESS)
297     {
298         if (rsp.size() == 5)
299         {
300             if (RspStatus::SUCCESS != RspStatus(rsp[2]))
301             {
302                 lg2::error(
303                     "sendAmbient: SEND_AMBIENT failed with rspStatus {STATUS}",
304                     "STATUS", lg2::hex, rsp[2]);
305                 dump_hex(rsp);
306                 status = CmdStatus::FAILURE;
307             }
308         }
309         else
310         {
311             lg2::error(
312                 "sendAmbient: INVALID SEND_AMBIENT response length:{SIZE}",
313                 "SIZE", rsp.size());
314             dump_hex(rsp);
315             status = CmdStatus::FAILURE;
316         }
317     }
318     else
319     {
320         lg2::error("sendAmbient: SEND_AMBIENT FAILED! with status {STATUS}",
321                    "STATUS", lg2::hex, uint8_t(status));
322 
323         if (status == CmdStatus::COMM_FAILURE)
324         {
325             // Disable due to OCC comm failure and reset to try recovering
326             deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
327         }
328     }
329 
330     return status;
331 }
332 
333 // Called when safe timer expires to determine if OCCs need to be reset
safeStateDelayExpired()334 void Status::safeStateDelayExpired()
335 {
336     if (this->occActive())
337     {
338         lg2::info(
339             "safeStateDelayExpired: OCC{INST} state missing or not valid, requesting reset",
340             "INST", instance);
341         // Disable and reset to try recovering
342         deviceError(Error::Descriptor(SAFE_ERROR_PATH));
343     }
344 }
345 #endif // POWER10
346 
getHwmonPath()347 fs::path Status::getHwmonPath()
348 {
349     using namespace std::literals::string_literals;
350 
351     if (!fs::exists(hwmonPath))
352     {
353         static bool tracedFail[8] = {0};
354 
355         if (!hwmonPath.empty())
356         {
357             lg2::warning(
358                 "Status::getHwmonPath(): path no longer exists: {PATH}", "PATH",
359                 hwmonPath);
360             hwmonPath.clear();
361         }
362 
363         // Build the base HWMON path
364         fs::path prefixPath =
365             fs::path{OCC_HWMON_PATH + "occ-hwmon."s +
366                      std::to_string(instance + 1) + "/hwmon/"s};
367 
368         // Get the hwmonXX directory name
369         try
370         {
371             // there should only be one directory
372             const int numDirs = std::distance(
373                 fs::directory_iterator(prefixPath), fs::directory_iterator{});
374             if (numDirs == 1)
375             {
376                 hwmonPath = *fs::directory_iterator(prefixPath);
377                 tracedFail[instance] = false;
378             }
379             else
380             {
381                 if (!tracedFail[instance])
382                 {
383                     lg2::error(
384                         "Status::getHwmonPath(): Found multiple ({NUM}) hwmon paths!",
385                         "NUM", numDirs);
386                     tracedFail[instance] = true;
387                 }
388             }
389         }
390         catch (const fs::filesystem_error& e)
391         {
392             if (!tracedFail[instance])
393             {
394                 lg2::error(
395                     "Status::getHwmonPath(): error accessing {PATH}: {ERROR}",
396                     "PATH", prefixPath, "ERROR", e.what());
397                 tracedFail[instance] = true;
398             }
399         }
400     }
401 
402     return hwmonPath;
403 }
404 
405 // Called to read state and handle any errors
occReadStateNow()406 void Status::occReadStateNow()
407 {
408     unsigned int state;
409     const fs::path filename =
410         fs::path(DEV_PATH) /
411         fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
412 
413     std::ifstream file;
414     bool goodFile = false;
415 
416     // open file.
417     file.open(filename, std::ios::in);
418     const int openErrno = errno;
419 
420     // File is open and state can be used.
421     if (file.is_open() && file.good())
422     {
423         goodFile = true;
424         file >> state;
425         // Read the error code (if any) to check status of the read
426         std::ios_base::iostate readState = file.rdstate();
427         if (readState)
428         {
429             // There was a failure reading the file
430             if (lastOccReadStatus != -1)
431             {
432                 // Trace error bits
433                 std::string errorBits = "";
434                 if (readState & std::ios_base::eofbit)
435                 {
436                     errorBits += " EOF";
437                 }
438                 if (readState & std::ios_base::failbit)
439                 {
440                     errorBits += " failbit";
441                 }
442                 if (readState & std::ios_base::badbit)
443                 {
444                     errorBits += " badbit";
445                 }
446                 lg2::error(
447                     "readOccState: Failed to read OCC{INST} state: Read error on I/O operation - {ERROR}",
448                     "INST", instance, "ERROR", errorBits);
449                 lastOccReadStatus = -1;
450             }
451             goodFile = false;
452         }
453 
454         if (goodFile && (state != lastState))
455         {
456             // Trace OCC state changes
457             lg2::info(
458                 "Status::readOccState: OCC{INST} state {STATE} (lastState: {PRIOR})",
459                 "INST", instance, "STATE", lg2::hex, state, "PRIOR", lg2::hex,
460                 lastState);
461             lastState = state;
462 #ifdef POWER10
463             if (OccState(state) == OccState::ACTIVE)
464             {
465                 if (pmode && device.master())
466                 {
467                     // Set the master OCC on the PowerMode object
468                     pmode->setMasterOcc(path);
469                     // Enable mode changes
470                     pmode->setMasterActive();
471 
472                     // Special processing by master OCC when it goes active
473                     occsWentActive();
474                 }
475 
476                 CmdStatus status = sendAmbient();
477                 if (status != CmdStatus::SUCCESS)
478                 {
479                     lg2::error(
480                         "readOccState: Sending Ambient failed with status {STATUS}",
481                         "STATUS", status);
482                 }
483             }
484 
485             // If OCC in known Good State.
486             if ((OccState(state) == OccState::ACTIVE) ||
487                 (OccState(state) == OccState::CHARACTERIZATION) ||
488                 (OccState(state) == OccState::OBSERVATION))
489             {
490                 // Good OCC State then sensors valid again
491                 stateValid = true;
492 
493                 if (safeStateDelayTimer.isEnabled())
494                 {
495                     // stop safe delay timer (no longer in SAFE state)
496                     safeStateDelayTimer.setEnabled(false);
497                 }
498             }
499             else
500             {
501                 // OCC is in SAFE or some other unsupported state
502                 if (!safeStateDelayTimer.isEnabled())
503                 {
504                     lg2::error(
505                         "readOccState: Invalid OCC{INST} state of {STATE}, starting safe state delay timer",
506                         "INST", instance, "STATE", state);
507                     // start safe delay timer (before requesting reset)
508                     using namespace std::literals::chrono_literals;
509                     safeStateDelayTimer.restartOnce(60s);
510                 }
511                 // Not a supported state (update sensors to NaN and not
512                 // functional)
513                 stateValid = false;
514             }
515 #else
516             // Before P10 state not checked, only used good file open.
517             stateValid = true;
518 #endif
519         }
520     }
521 #ifdef POWER10
522     else
523     {
524         // Unable to read state
525         stateValid = false;
526     }
527 #endif
528     file.close();
529 
530     // if failed to Read a state or not a valid state -> Attempt retry
531     // after 1 Second delay if allowed.
532     if ((!goodFile) || (!stateValid))
533     {
534         if (!goodFile)
535         {
536             // If not able to read, OCC may be offline
537             if (openErrno != lastOccReadStatus)
538             {
539                 lg2::error(
540                     "Status::readOccState: open/read failed trying to read OCC{INST} state (open errno={ERROR})",
541                     "INST", instance, "ERROR", openErrno);
542                 lastOccReadStatus = openErrno;
543             }
544         }
545         else
546         {
547             // else this failed due to state not valid.
548             if (state != lastState)
549             {
550                 lg2::error(
551                     "Status::readOccState: OCC{INST} Invalid state {STATE} (last state: {PRIOR})",
552                     "INST", instance, "STATE", lg2::hex, state, "PRIOR",
553                     lg2::hex, lastState);
554             }
555         }
556 
557 #ifdef READ_OCC_SENSORS
558         manager.setSensorValueToNaN(instance);
559 #endif
560 
561         // See occReadRetries for number of retry attempts.
562         if (currentOccReadRetriesCount > 0)
563         {
564             --currentOccReadRetriesCount;
565         }
566         else
567         {
568             lg2::error("readOccState: failed to read OCC{INST} state!", "INST",
569                        instance);
570 
571             // State could not be determined, set it to NO State.
572             lastState = 0;
573 
574             // Disable the ability to send Failed actions until OCC is
575             // Active again.
576             stateValid = false;
577 
578             // Disable due to OCC comm failure and reset to try recovering
579             // (processor callout will be added)
580             deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH, ECOMM,
581                                           procPath.c_str(), true));
582 
583             // Reset retry count (for next attempt after recovery)
584             currentOccReadRetriesCount = occReadRetries;
585         }
586     }
587     else
588     {
589         if (lastOccReadStatus != 0)
590         {
591             lg2::info(
592                 "Status::readOccState: successfully read OCC{INST} state: {STATE}",
593                 "INST", instance, "STATE", state);
594             lastOccReadStatus = 0; // no error
595         }
596     }
597 }
598 
599 // Update processor throttle status on dbus
updateThrottle(const bool isThrottled,const uint8_t newReason)600 void Status::updateThrottle(const bool isThrottled, const uint8_t newReason)
601 {
602     if (!throttleHandle)
603     {
604         return;
605     }
606 
607     uint8_t newThrottleCause = throttleCause;
608 
609     if (isThrottled) // throttled due to newReason
610     {
611         if ((newReason & throttleCause) == 0)
612         {
613             // set the bit(s) for passed in reason
614             newThrottleCause |= newReason;
615         }
616         // else no change
617     }
618     else // no longer throttled due to newReason
619     {
620         if ((newReason & throttleCause) != 0)
621         {
622             // clear the bit(s) for passed in reason
623             newThrottleCause &= ~newReason;
624         }
625         // else no change
626     }
627 
628     if (newThrottleCause != throttleCause)
629     {
630         if (newThrottleCause == THROTTLED_NONE)
631         {
632             lg2::debug(
633                 "updateThrottle: OCC{INST} no longer throttled (prior reason: {REASON})",
634                 "INST", instance, "REASON", throttleCause);
635             throttleCause = THROTTLED_NONE;
636             throttleHandle->throttled(false);
637             throttleHandle->throttleCauses({});
638         }
639         else
640         {
641             lg2::debug(
642                 "updateThrottle: OCC{INST} is throttled with reason {REASON} (prior reason: {PRIOR})",
643                 "INST", instance, "REASON", newThrottleCause, "PRIOR",
644                 throttleCause);
645             throttleCause = newThrottleCause;
646 
647             std::vector<ThrottleObj::ThrottleReasons> updatedCauses;
648             if (throttleCause & THROTTLED_POWER)
649             {
650                 updatedCauses.push_back(
651                     throttleHandle->ThrottleReasons::PowerLimit);
652             }
653             if (throttleCause & THROTTLED_THERMAL)
654             {
655                 updatedCauses.push_back(
656                     throttleHandle->ThrottleReasons::ThermalLimit);
657             }
658             if (throttleCause & THROTTLED_SAFE)
659             {
660                 updatedCauses.push_back(
661                     throttleHandle->ThrottleReasons::ManagementDetectedFault);
662             }
663             throttleHandle->throttleCauses(updatedCauses);
664             throttleHandle->throttled(true);
665         }
666     }
667     // else no change to throttle status
668 }
669 
670 // Get processor path associated with this OCC
readProcAssociation()671 void Status::readProcAssociation()
672 {
673     std::string managingPath = path + "/power_managing";
674     lg2::debug("readProcAssociation: getting endpoints for {MANAGE} ({PATH})",
675                "MANAGE", managingPath, "PATH", path);
676     try
677     {
678         utils::PropertyValue procPathProperty{};
679         procPathProperty = utils::getProperty(
680             managingPath, "xyz.openbmc_project.Association", "endpoints");
681         auto result = std::get<std::vector<std::string>>(procPathProperty);
682         if (result.size() > 0)
683         {
684             procPath = result[0];
685             lg2::info("readProcAssociation: OCC{INST} has proc={PATH}", "INST",
686                       instance, "PATH", procPath);
687         }
688         else
689         {
690             lg2::error(
691                 "readProcAssociation: No processor associated with OCC{INST} / {PATH}",
692                 "INST", instance, "PATH", path);
693         }
694     }
695     catch (const sdbusplus::exception_t& e)
696     {
697         lg2::error(
698             "readProcAssociation: Unable to get proc assocated with {PATH} - {ERROR}",
699             "PATH", path, "ERROR", e.what());
700         procPath = {};
701     }
702 }
703 
704 } // namespace occ
705 } // namespace open_power
706