xref: /openbmc/openpower-occ-control/occ_status.cpp (revision 37abe9be91df2bb173c9642a2740a425904d7921)
1 #include "occ_status.hpp"
2 
3 #include "occ_manager.hpp"
4 #include "occ_sensor.hpp"
5 #include "powermode.hpp"
6 #include "utils.hpp"
7 
8 #include <phosphor-logging/lg2.hpp>
9 
10 #include <filesystem>
11 
12 namespace open_power
13 {
14 namespace occ
15 {
16 
17 using namespace phosphor::logging;
18 
19 using ThrottleObj =
20     sdbusplus::xyz::openbmc_project::Control::Power::server::Throttle;
21 
22 // Handles updates to occActive property
occActive(bool value)23 bool Status::occActive(bool value)
24 {
25     if (value != this->occActive())
26     {
27         lg2::info("Status::occActive OCC{INST} changed to {STATE}", "INST",
28                   instance, "STATE", value);
29         if (value)
30         {
31             // Clear prior throttle reason (before setting device active)
32             updateThrottle(false, THROTTLED_ALL);
33 
34             // Set the device active
35             device.setActive(true);
36 
37             // Reset last OCC state
38             lastState = 0;
39 
40             // Start watching for errors (throttles, etc)
41             try
42             {
43                 addErrorWatch();
44             }
45             catch (const OpenFailure& e)
46             {
47                 // Failed to add watch for throttle events, request reset to try
48                 // to recover comm
49                 lg2::error(
50                     "Status::occActive: Unable to add error watch(s) for OCC{INST} watch: {ERROR}",
51                     "INST", instance, "ERROR", e.what());
52                 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
53                 return Base::Status::occActive(false);
54             }
55 
56             // Update the OCC active sensor
57             Base::Status::occActive(value);
58 
59             if (device.master())
60             {
61                 // Update powercap bounds from OCC
62                 manager.updatePcapBounds();
63             }
64 
65             // Call into Manager to let know that we have bound
66             if (this->managerCallBack)
67             {
68                 this->managerCallBack(instance, value);
69             }
70         }
71         else
72         {
73 #ifdef POWER10
74             if (pmode && device.master())
75             {
76                 // Prevent mode changes
77                 pmode->setMasterActive(false);
78             }
79             if (safeStateDelayTimer.isEnabled())
80             {
81                 // stop safe delay timer
82                 safeStateDelayTimer.setEnabled(false);
83             }
84 #endif
85             // Call into Manager to let know that we will unbind.
86             if (this->managerCallBack)
87             {
88                 this->managerCallBack(instance, value);
89             }
90 
91             // Stop watching for errors
92             removeErrorWatch();
93 
94             // Set the device inactive
95             device.setActive(false);
96 
97             // Clear throttles (OCC not active after disabling device)
98             updateThrottle(false, THROTTLED_ALL);
99         }
100     }
101     else if (value && !device.active())
102     {
103         // Existing error watch is on a dead file descriptor.
104         removeErrorWatch();
105 
106         /*
107          * In it's constructor, Status checks Device::bound() to see if OCC is
108          * active or not.
109          * Device::bound() checks for occX-dev0 directory.
110          * We will lose occX-dev0 directories during FSI rescan.
111          * So, if we start this application (and construct Status), and then
112          * later do FSI rescan, we will end up with occActive = true and device
113          * NOT bound. Lets correct that situation here.
114          */
115         device.setActive(true);
116 
117         // Add error watch again
118         try
119         {
120             addErrorWatch();
121         }
122         catch (const OpenFailure& e)
123         {
124             // Failed to add watch for throttle events, request reset to try to
125             // recover comm
126             lg2::error(
127                 "Status::occActive: Unable to add error watch(s) again for OCC{INST} watch: {ERROR}",
128                 "INST", instance, "ERROR", e.what());
129             deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
130             return Base::Status::occActive(false);
131         }
132     }
133     else if (!value && device.active())
134     {
135         removeErrorWatch();
136 
137         // In the event that the application never receives the active signal
138         // even though the OCC is active (this can occur if the BMC is rebooted
139         // with the host on, since the initial OCC driver probe will discover
140         // the OCCs), this application needs to be able to unbind the device
141         // when we get the OCC inactive signal.
142         device.setActive(false);
143     }
144     return Base::Status::occActive(value);
145 }
146 
147 // Callback handler when a device error is reported.
deviceError(Error::Descriptor d)148 void Status::deviceError(Error::Descriptor d)
149 {
150 #ifdef POWER10
151     if (pmode && device.master())
152     {
153         // Prevent mode changes
154         pmode->setMasterActive(false);
155     }
156 #endif
157 
158     if (d.log)
159     {
160         FFDC::createOCCResetPEL(instance, d.path, d.err, d.callout);
161     }
162 
163     // This would deem OCC inactive
164     this->occActive(false);
165 
166     // Reset the OCC
167     this->resetOCC();
168 }
169 
170 // Sends message to host control command handler to reset OCC
resetOCC()171 void Status::resetOCC()
172 {
173     lg2::info(">>Status::resetOCC() - requesting reset for OCC{INST}", "INST",
174               instance);
175     this->occActive(false);
176 #ifdef PLDM
177     if (resetCallBack)
178     {
179         this->resetCallBack(instance);
180     }
181 #else
182     constexpr auto CONTROL_HOST_PATH = "/org/open_power/control/host0";
183     constexpr auto CONTROL_HOST_INTF = "org.open_power.Control.Host";
184 
185     // This will throw exception on failure
186     auto service = utils::getService(CONTROL_HOST_PATH, CONTROL_HOST_INTF);
187 
188     auto& bus = utils::getBus();
189     auto method = bus.new_method_call(service.c_str(), CONTROL_HOST_PATH,
190                                       CONTROL_HOST_INTF, "Execute");
191     // OCC Reset control command
192     method.append(convertForMessage(Control::Host::Command::OCCReset).c_str());
193 
194     // OCC Sensor ID for callout reasons
195     method.append(std::variant<uint8_t>(std::get<0>(sensorMap.at(instance))));
196     bus.call_noreply(method);
197     return;
198 #endif
199 }
200 
201 // Handler called by Host control command handler to convey the
202 // status of the executed command
hostControlEvent(sdbusplus::message_t & msg)203 void Status::hostControlEvent(sdbusplus::message_t& msg)
204 {
205     std::string cmdCompleted{};
206     std::string cmdStatus{};
207 
208     msg.read(cmdCompleted, cmdStatus);
209 
210     lg2::debug("Host control signal values: command={CMD}, status={STATUS}",
211                "CMD", cmdCompleted, "STATUS", cmdStatus);
212 
213     if (Control::Host::convertResultFromString(cmdStatus) !=
214         Control::Host::Result::Success)
215     {
216         if (Control::Host::convertCommandFromString(cmdCompleted) ==
217             Control::Host::Command::OCCReset)
218         {
219             // Must be a Timeout. Log an Error trace
220             lg2::error(
221                 "Error resetting the OCC: path={PATH}, sensorid={SENSOR}",
222                 "PATH", path, "SENSOR", std::get<0>(sensorMap.at(instance)));
223         }
224     }
225     return;
226 }
227 
228 // Called from Manager::pollerTimerExpired() in preperation to POLL OCC.
readOccState()229 void Status::readOccState()
230 {
231     if (stateValid)
232     {
233         // Reset retry count (since state is good)
234         currentOccReadRetriesCount = occReadRetries;
235     }
236     occReadStateNow();
237 }
238 
239 #ifdef POWER10
240 // Special processing that needs to happen once the OCCs change to ACTIVE state
occsWentActive()241 void Status::occsWentActive()
242 {
243     CmdStatus status = CmdStatus::SUCCESS;
244 
245     // IPS data will get sent automatically after a mode change if the mode
246     // supports it.
247     pmode->needToSendIPS();
248 
249     status = pmode->sendModeChange();
250     if (status != CmdStatus::SUCCESS)
251     {
252         lg2::error(
253             "Status::occsWentActive: OCC mode change failed with status {STATUS}",
254             "STATUS", status);
255 
256         // Disable and reset to try recovering
257         deviceError();
258     }
259 }
260 
261 // Send Ambient and Altitude to the OCC
sendAmbient(const uint8_t inTemp,const uint16_t inAltitude)262 CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude)
263 {
264     CmdStatus status = CmdStatus::FAILURE;
265     bool ambientValid = true;
266     uint8_t ambientTemp = inTemp;
267     uint16_t altitude = inAltitude;
268 
269     if (ambientTemp == 0xFF)
270     {
271         // Get latest readings from manager
272         manager.getAmbientData(ambientValid, ambientTemp, altitude);
273         lg2::debug(
274             "sendAmbient: valid: {VALID}, Ambient: {TEMP}C, altitude: {ALT}m",
275             "VALID", ambientValid, "TEMP", ambientTemp, "ALT", altitude);
276     }
277 
278     std::vector<std::uint8_t> cmd, rsp;
279     cmd.reserve(11);
280     cmd.push_back(uint8_t(CmdType::SEND_AMBIENT));
281     cmd.push_back(0x00);                    // Data Length (2 bytes)
282     cmd.push_back(0x08);                    //
283     cmd.push_back(0x00);                    // Version
284     cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status
285     cmd.push_back(ambientTemp);             // Ambient Temperature
286     cmd.push_back(altitude >> 8);           // Altitude in meters (2 bytes)
287     cmd.push_back(altitude & 0xFF);         //
288     cmd.push_back(0x00);                    // Reserved (3 bytes)
289     cmd.push_back(0x00);
290     cmd.push_back(0x00);
291     lg2::debug("sendAmbient: SEND_AMBIENT "
292                "command to OCC{INST} ({SIZE} bytes)",
293                "INST", instance, "SIZE", cmd.size());
294     status = occCmd.send(cmd, rsp);
295     if (status == CmdStatus::SUCCESS)
296     {
297         if (rsp.size() == 5)
298         {
299             if (RspStatus::SUCCESS != RspStatus(rsp[2]))
300             {
301                 lg2::error(
302                     "sendAmbient: SEND_AMBIENT failed with rspStatus {STATUS}",
303                     "STATUS", lg2::hex, rsp[2]);
304                 dump_hex(rsp);
305                 status = CmdStatus::FAILURE;
306             }
307         }
308         else
309         {
310             lg2::error(
311                 "sendAmbient: INVALID SEND_AMBIENT response length:{SIZE}",
312                 "SIZE", rsp.size());
313             dump_hex(rsp);
314             status = CmdStatus::FAILURE;
315         }
316     }
317     else
318     {
319         lg2::error("sendAmbient: SEND_AMBIENT FAILED! with status {STATUS}",
320                    "STATUS", lg2::hex, uint8_t(status));
321 
322         if (status == CmdStatus::COMM_FAILURE)
323         {
324             // Disable due to OCC comm failure and reset to try recovering
325             deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
326         }
327     }
328 
329     return status;
330 }
331 
332 // Called when safe timer expires to determine if OCCs need to be reset
safeStateDelayExpired()333 void Status::safeStateDelayExpired()
334 {
335     if (this->occActive())
336     {
337         lg2::info(
338             "safeStateDelayExpired: OCC{INST} state missing or not valid, requesting reset",
339             "INST", instance);
340         // Disable and reset to try recovering
341         deviceError(Error::Descriptor(SAFE_ERROR_PATH));
342     }
343 }
344 #endif // POWER10
345 
getHwmonPath()346 fs::path Status::getHwmonPath()
347 {
348     using namespace std::literals::string_literals;
349 
350     if (!fs::exists(hwmonPath))
351     {
352         static bool tracedFail[8] = {0};
353 
354         if (!hwmonPath.empty())
355         {
356             lg2::warning(
357                 "Status::getHwmonPath(): path no longer exists: {PATH}", "PATH",
358                 hwmonPath);
359             hwmonPath.clear();
360         }
361 
362         // Build the base HWMON path
363         fs::path prefixPath =
364             fs::path{OCC_HWMON_PATH + "occ-hwmon."s +
365                      std::to_string(instance + 1) + "/hwmon/"s};
366 
367         // Get the hwmonXX directory name
368         try
369         {
370             // there should only be one directory
371             const int numDirs = std::distance(
372                 fs::directory_iterator(prefixPath), fs::directory_iterator{});
373             if (numDirs == 1)
374             {
375                 hwmonPath = *fs::directory_iterator(prefixPath);
376                 tracedFail[instance] = false;
377             }
378             else
379             {
380                 if (!tracedFail[instance])
381                 {
382                     lg2::error(
383                         "Status::getHwmonPath(): Found multiple ({NUM}) hwmon paths!",
384                         "NUM", numDirs);
385                     tracedFail[instance] = true;
386                 }
387             }
388         }
389         catch (const fs::filesystem_error& e)
390         {
391             if (!tracedFail[instance])
392             {
393                 lg2::error(
394                     "Status::getHwmonPath(): error accessing {PATH}: {ERROR}",
395                     "PATH", prefixPath, "ERROR", e.what());
396                 tracedFail[instance] = true;
397             }
398         }
399     }
400 
401     return hwmonPath;
402 }
403 
404 // Called to read state and handle any errors
occReadStateNow()405 void Status::occReadStateNow()
406 {
407     unsigned int state;
408     const fs::path filename =
409         fs::path(DEV_PATH) /
410         fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
411 
412     std::ifstream file;
413     bool goodFile = false;
414 
415     // open file.
416     file.open(filename, std::ios::in);
417     const int openErrno = errno;
418 
419     // File is open and state can be used.
420     if (file.is_open() && file.good())
421     {
422         goodFile = true;
423         file >> state;
424         // Read the error code (if any) to check status of the read
425         std::ios_base::iostate readState = file.rdstate();
426         if (readState)
427         {
428             // There was a failure reading the file
429             if (lastOccReadStatus != -1)
430             {
431                 // Trace error bits
432                 std::string errorBits = "";
433                 if (readState & std::ios_base::eofbit)
434                 {
435                     errorBits += " EOF";
436                 }
437                 if (readState & std::ios_base::failbit)
438                 {
439                     errorBits += " failbit";
440                 }
441                 if (readState & std::ios_base::badbit)
442                 {
443                     errorBits += " badbit";
444                 }
445                 lg2::error(
446                     "readOccState: Failed to read OCC{INST} state: Read error on I/O operation - {ERROR}",
447                     "INST", instance, "ERROR", errorBits);
448                 lastOccReadStatus = -1;
449             }
450             goodFile = false;
451         }
452 
453         if (goodFile && (state != lastState))
454         {
455             // Trace OCC state changes
456             lg2::info(
457                 "Status::readOccState: OCC{INST} state {STATE} (lastState: {PRIOR})",
458                 "INST", instance, "STATE", lg2::hex, state, "PRIOR", lg2::hex,
459                 lastState);
460             lastState = state;
461 #ifdef POWER10
462             if (OccState(state) == OccState::ACTIVE)
463             {
464                 if (pmode && device.master())
465                 {
466                     // Set the master OCC on the PowerMode object
467                     pmode->setMasterOcc(path);
468                     // Enable mode changes
469                     pmode->setMasterActive();
470 
471                     // Special processing by master OCC when it goes active
472                     occsWentActive();
473                 }
474 
475                 CmdStatus status = sendAmbient();
476                 if (status != CmdStatus::SUCCESS)
477                 {
478                     lg2::error(
479                         "readOccState: Sending Ambient failed with status {STATUS}",
480                         "STATUS", status);
481                 }
482             }
483 
484             // If OCC in known Good State.
485             if ((OccState(state) == OccState::ACTIVE) ||
486                 (OccState(state) == OccState::CHARACTERIZATION) ||
487                 (OccState(state) == OccState::OBSERVATION))
488             {
489                 // Good OCC State then sensors valid again
490                 stateValid = true;
491 
492                 if (safeStateDelayTimer.isEnabled())
493                 {
494                     // stop safe delay timer (no longer in SAFE state)
495                     safeStateDelayTimer.setEnabled(false);
496                 }
497             }
498             else
499             {
500                 // OCC is in SAFE or some other unsupported state
501                 if (!safeStateDelayTimer.isEnabled())
502                 {
503                     lg2::error(
504                         "readOccState: Invalid OCC{INST} state of {STATE}, starting safe state delay timer",
505                         "INST", instance, "STATE", state);
506                     // start safe delay timer (before requesting reset)
507                     using namespace std::literals::chrono_literals;
508                     safeStateDelayTimer.restartOnce(60s);
509                 }
510                 // Not a supported state (update sensors to NaN and not
511                 // functional)
512                 stateValid = false;
513             }
514 #else
515             // Before P10 state not checked, only used good file open.
516             stateValid = true;
517 #endif
518         }
519     }
520 #ifdef POWER10
521     else
522     {
523         // Unable to read state
524         stateValid = false;
525     }
526 #endif
527     file.close();
528 
529     // if failed to Read a state or not a valid state -> Attempt retry
530     // after 1 Second delay if allowed.
531     if ((!goodFile) || (!stateValid))
532     {
533         if (!goodFile)
534         {
535             // If not able to read, OCC may be offline
536             if (openErrno != lastOccReadStatus)
537             {
538                 lg2::error(
539                     "Status::readOccState: open/read failed trying to read OCC{INST} state (open errno={ERROR})",
540                     "INST", instance, "ERROR", openErrno);
541                 lastOccReadStatus = openErrno;
542             }
543         }
544         else
545         {
546             // else this failed due to state not valid.
547             if (state != lastState)
548             {
549                 lg2::error(
550                     "Status::readOccState: OCC{INST} Invalid state {STATE} (last state: {PRIOR})",
551                     "INST", instance, "STATE", lg2::hex, state, "PRIOR",
552                     lg2::hex, lastState);
553             }
554         }
555 
556 #ifdef READ_OCC_SENSORS
557         manager.setSensorValueToNaN(instance);
558 #endif
559 
560         // See occReadRetries for number of retry attempts.
561         if (currentOccReadRetriesCount > 0)
562         {
563             --currentOccReadRetriesCount;
564         }
565         else
566         {
567             lg2::error("readOccState: failed to read OCC{INST} state!", "INST",
568                        instance);
569 
570             // State could not be determined, set it to NO State.
571             lastState = 0;
572 
573             // Disable the ability to send Failed actions until OCC is
574             // Active again.
575             stateValid = false;
576 
577             // Disable due to OCC comm failure and reset to try recovering
578             deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
579 
580             // Reset retry count (for next attempt after recovery)
581             currentOccReadRetriesCount = occReadRetries;
582         }
583     }
584     else
585     {
586         if (lastOccReadStatus != 0)
587         {
588             lg2::info(
589                 "Status::readOccState: successfully read OCC{INST} state: {STATE}",
590                 "INST", instance, "STATE", state);
591             lastOccReadStatus = 0; // no error
592         }
593     }
594 }
595 
596 // Update processor throttle status on dbus
updateThrottle(const bool isThrottled,const uint8_t newReason)597 void Status::updateThrottle(const bool isThrottled, const uint8_t newReason)
598 {
599     if (!throttleHandle)
600     {
601         return;
602     }
603 
604     uint8_t newThrottleCause = throttleCause;
605 
606     if (isThrottled) // throttled due to newReason
607     {
608         if ((newReason & throttleCause) == 0)
609         {
610             // set the bit(s) for passed in reason
611             newThrottleCause |= newReason;
612         }
613         // else no change
614     }
615     else // no longer throttled due to newReason
616     {
617         if ((newReason & throttleCause) != 0)
618         {
619             // clear the bit(s) for passed in reason
620             newThrottleCause &= ~newReason;
621         }
622         // else no change
623     }
624 
625     if (newThrottleCause != throttleCause)
626     {
627         if (newThrottleCause == THROTTLED_NONE)
628         {
629             lg2::debug(
630                 "updateThrottle: OCC{INST} no longer throttled (prior reason: {REASON})",
631                 "INST", instance, "REASON", throttleCause);
632             throttleCause = THROTTLED_NONE;
633             throttleHandle->throttled(false);
634             throttleHandle->throttleCauses({});
635         }
636         else
637         {
638             lg2::debug(
639                 "updateThrottle: OCC{INST} is throttled with reason {REASON} (prior reason: {PRIOR})",
640                 "INST", instance, "REASON", newThrottleCause, "PRIOR",
641                 throttleCause);
642             throttleCause = newThrottleCause;
643 
644             std::vector<ThrottleObj::ThrottleReasons> updatedCauses;
645             if (throttleCause & THROTTLED_POWER)
646             {
647                 updatedCauses.push_back(
648                     throttleHandle->ThrottleReasons::PowerLimit);
649             }
650             if (throttleCause & THROTTLED_THERMAL)
651             {
652                 updatedCauses.push_back(
653                     throttleHandle->ThrottleReasons::ThermalLimit);
654             }
655             if (throttleCause & THROTTLED_SAFE)
656             {
657                 updatedCauses.push_back(
658                     throttleHandle->ThrottleReasons::ManagementDetectedFault);
659             }
660             throttleHandle->throttleCauses(updatedCauses);
661             throttleHandle->throttled(true);
662         }
663     }
664     // else no change to throttle status
665 }
666 
667 // Get processor path associated with this OCC
readProcAssociation()668 void Status::readProcAssociation()
669 {
670     std::string managingPath = path + "/power_managing";
671     lg2::debug("readProcAssociation: getting endpoints for {MANAGE} ({PATH})",
672                "MANAGE", managingPath, "PATH", path);
673     try
674     {
675         utils::PropertyValue procPathProperty{};
676         procPathProperty = utils::getProperty(
677             managingPath, "xyz.openbmc_project.Association", "endpoints");
678         auto result = std::get<std::vector<std::string>>(procPathProperty);
679         if (result.size() > 0)
680         {
681             procPath = result[0];
682             lg2::info("readProcAssociation: OCC{INST} has proc={PATH}", "INST",
683                       instance, "PATH", procPath);
684         }
685         else
686         {
687             lg2::error(
688                 "readProcAssociation: No processor associated with OCC{INST} / {PATH}",
689                 "INST", instance, "PATH", path);
690         }
691     }
692     catch (const sdbusplus::exception_t& e)
693     {
694         lg2::error(
695             "readProcAssociation: Unable to get proc assocated with {PATH} - {ERROR}",
696             "PATH", path, "ERROR", e.what());
697         procPath = {};
698     }
699 }
700 
701 } // namespace occ
702 } // namespace open_power
703