1 #include "occ_status.hpp"
2 
3 #include "occ_manager.hpp"
4 #include "occ_sensor.hpp"
5 #include "powermode.hpp"
6 #include "utils.hpp"
7 
8 #include <phosphor-logging/log.hpp>
9 
10 #include <filesystem>
11 #include <format>
12 
13 namespace open_power
14 {
15 namespace occ
16 {
17 
18 using namespace phosphor::logging;
19 
20 using ThrottleObj =
21     sdbusplus::xyz::openbmc_project::Control::Power::server::Throttle;
22 
23 // Handles updates to occActive property
occActive(bool value)24 bool Status::occActive(bool value)
25 {
26     if (value != this->occActive())
27     {
28         log<level::INFO>(std::format("Status::occActive OCC{} changed to {}",
29                                      instance, value)
30                              .c_str());
31         if (value)
32         {
33             // Clear prior throttle reason (before setting device active)
34             updateThrottle(false, THROTTLED_ALL);
35 
36             // Set the device active
37             device.setActive(true);
38 
39             // Update the OCC active sensor
40             Base::Status::occActive(value);
41 
42             // Start watching for errors
43             addErrorWatch();
44 
45             // Reset last OCC state
46             lastState = 0;
47 
48             if (device.master())
49             {
50                 // Update powercap bounds from OCC
51                 manager.updatePcapBounds();
52             }
53 
54             // Call into Manager to let know that we have bound
55             if (this->managerCallBack)
56             {
57                 this->managerCallBack(instance, value);
58             }
59         }
60         else
61         {
62 #ifdef POWER10
63             if (pmode && device.master())
64             {
65                 // Prevent mode changes
66                 pmode->setMasterActive(false);
67             }
68             if (safeStateDelayTimer.isEnabled())
69             {
70                 // stop safe delay timer
71                 safeStateDelayTimer.setEnabled(false);
72             }
73 #endif
74             // Call into Manager to let know that we will unbind.
75             if (this->managerCallBack)
76             {
77                 this->managerCallBack(instance, value);
78             }
79 
80             // Stop watching for errors
81             removeErrorWatch();
82 
83             // Set the device inactive
84             device.setActive(false);
85 
86             // Clear throttles (OCC not active after disabling device)
87             updateThrottle(false, THROTTLED_ALL);
88         }
89     }
90     else if (value && !device.active())
91     {
92         // Existing error watch is on a dead file descriptor.
93         removeErrorWatch();
94 
95         /*
96          * In it's constructor, Status checks Device::bound() to see if OCC is
97          * active or not.
98          * Device::bound() checks for occX-dev0 directory.
99          * We will lose occX-dev0 directories during FSI rescan.
100          * So, if we start this application (and construct Status), and then
101          * later do FSI rescan, we will end up with occActive = true and device
102          * NOT bound. Lets correct that situation here.
103          */
104         device.setActive(true);
105 
106         // Add error watch again
107         addErrorWatch();
108     }
109     else if (!value && device.active())
110     {
111         removeErrorWatch();
112 
113         // In the event that the application never receives the active signal
114         // even though the OCC is active (this can occur if the BMC is rebooted
115         // with the host on, since the initial OCC driver probe will discover
116         // the OCCs), this application needs to be able to unbind the device
117         // when we get the OCC inactive signal.
118         device.setActive(false);
119     }
120     return Base::Status::occActive(value);
121 }
122 
123 // Callback handler when a device error is reported.
deviceError(Error::Descriptor d)124 void Status::deviceError(Error::Descriptor d)
125 {
126 #ifdef POWER10
127     if (pmode && device.master())
128     {
129         // Prevent mode changes
130         pmode->setMasterActive(false);
131     }
132 #endif
133 
134     if (d.log)
135     {
136         FFDC::createOCCResetPEL(instance, d.path, d.err, d.callout);
137     }
138 
139     // This would deem OCC inactive
140     this->occActive(false);
141 
142     // Reset the OCC
143     this->resetOCC();
144 }
145 
146 // Sends message to host control command handler to reset OCC
resetOCC()147 void Status::resetOCC()
148 {
149     log<level::INFO>(
150         std::format(">>Status::resetOCC() - requesting reset for OCC{}",
151                     instance)
152             .c_str());
153 #ifdef PLDM
154     if (resetCallBack)
155     {
156         this->resetCallBack(instance);
157     }
158 #else
159     constexpr auto CONTROL_HOST_PATH = "/org/open_power/control/host0";
160     constexpr auto CONTROL_HOST_INTF = "org.open_power.Control.Host";
161 
162     // This will throw exception on failure
163     auto service = utils::getService(CONTROL_HOST_PATH, CONTROL_HOST_INTF);
164 
165     auto& bus = utils::getBus();
166     auto method = bus.new_method_call(service.c_str(), CONTROL_HOST_PATH,
167                                       CONTROL_HOST_INTF, "Execute");
168     // OCC Reset control command
169     method.append(convertForMessage(Control::Host::Command::OCCReset).c_str());
170 
171     // OCC Sensor ID for callout reasons
172     method.append(std::variant<uint8_t>(std::get<0>(sensorMap.at(instance))));
173     bus.call_noreply(method);
174     return;
175 #endif
176 }
177 
178 // Handler called by Host control command handler to convey the
179 // status of the executed command
hostControlEvent(sdbusplus::message_t & msg)180 void Status::hostControlEvent(sdbusplus::message_t& msg)
181 {
182     std::string cmdCompleted{};
183     std::string cmdStatus{};
184 
185     msg.read(cmdCompleted, cmdStatus);
186 
187     log<level::DEBUG>("Host control signal values",
188                       entry("COMMAND=%s", cmdCompleted.c_str()),
189                       entry("STATUS=%s", cmdStatus.c_str()));
190 
191     if (Control::Host::convertResultFromString(cmdStatus) !=
192         Control::Host::Result::Success)
193     {
194         if (Control::Host::convertCommandFromString(cmdCompleted) ==
195             Control::Host::Command::OCCReset)
196         {
197             // Must be a Timeout. Log an Error trace
198             log<level::ERR>(
199                 "Error resetting the OCC.", entry("PATH=%s", path.c_str()),
200                 entry("SENSORID=0x%X", std::get<0>(sensorMap.at(instance))));
201         }
202     }
203     return;
204 }
205 
206 // Called from Manager::pollerTimerExpired() in preperation to POLL OCC.
readOccState()207 void Status::readOccState()
208 {
209     currentOccReadRetriesCount = occReadRetries;
210     occReadStateNow();
211 }
212 
213 #ifdef POWER10
214 // Special processing that needs to happen once the OCCs change to ACTIVE state
occsWentActive()215 void Status::occsWentActive()
216 {
217     CmdStatus status = CmdStatus::SUCCESS;
218 
219     status = pmode->sendModeChange();
220     if (status != CmdStatus::SUCCESS)
221     {
222         log<level::ERR>(
223             std::format(
224                 "Status::occsWentActive: OCC mode change failed with status {}",
225                 status)
226                 .c_str());
227 
228         // Disable and reset to try recovering
229         deviceError();
230     }
231 
232     status = pmode->sendIpsData();
233     if (status != CmdStatus::SUCCESS)
234     {
235         log<level::ERR>(
236             std::format(
237                 "Status::occsWentActive: Sending Idle Power Save Config data failed with status {}",
238                 status)
239                 .c_str());
240 
241         if (status == CmdStatus::COMM_FAILURE)
242         {
243             // Disable and reset to try recovering
244             deviceError();
245         }
246     }
247 }
248 
249 // Send Ambient and Altitude to the OCC
sendAmbient(const uint8_t inTemp,const uint16_t inAltitude)250 CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude)
251 {
252     CmdStatus status = CmdStatus::FAILURE;
253     bool ambientValid = true;
254     uint8_t ambientTemp = inTemp;
255     uint16_t altitude = inAltitude;
256 
257     if (ambientTemp == 0xFF)
258     {
259         // Get latest readings from manager
260         manager.getAmbientData(ambientValid, ambientTemp, altitude);
261         log<level::DEBUG>(
262             std::format("sendAmbient: valid: {}, Ambient: {}C, altitude: {}m",
263                         ambientValid, ambientTemp, altitude)
264                 .c_str());
265     }
266 
267     std::vector<std::uint8_t> cmd, rsp;
268     cmd.reserve(11);
269     cmd.push_back(uint8_t(CmdType::SEND_AMBIENT));
270     cmd.push_back(0x00);                    // Data Length (2 bytes)
271     cmd.push_back(0x08);                    //
272     cmd.push_back(0x00);                    // Version
273     cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status
274     cmd.push_back(ambientTemp);             // Ambient Temperature
275     cmd.push_back(altitude >> 8);           // Altitude in meters (2 bytes)
276     cmd.push_back(altitude & 0xFF);         //
277     cmd.push_back(0x00);                    // Reserved (3 bytes)
278     cmd.push_back(0x00);
279     cmd.push_back(0x00);
280     log<level::DEBUG>(std::format("sendAmbient: SEND_AMBIENT "
281                                   "command to OCC{} ({} bytes)",
282                                   instance, cmd.size())
283                           .c_str());
284     status = occCmd.send(cmd, rsp);
285     if (status == CmdStatus::SUCCESS)
286     {
287         if (rsp.size() == 5)
288         {
289             if (RspStatus::SUCCESS != RspStatus(rsp[2]))
290             {
291                 log<level::ERR>(
292                     std::format(
293                         "sendAmbient: SEND_AMBIENT failed with rspStatus 0x{:02X}",
294                         rsp[2])
295                         .c_str());
296                 dump_hex(rsp);
297                 status = CmdStatus::FAILURE;
298             }
299         }
300         else
301         {
302             log<level::ERR>(
303                 std::format(
304                     "sendAmbient: INVALID SEND_AMBIENT response length:{}",
305                     rsp.size())
306                     .c_str());
307             dump_hex(rsp);
308             status = CmdStatus::FAILURE;
309         }
310     }
311     else
312     {
313         log<level::ERR>(
314             std::format(
315                 "sendAmbient: SEND_AMBIENT FAILED! with status 0x{:02X}",
316                 status)
317                 .c_str());
318 
319         if (status == CmdStatus::COMM_FAILURE)
320         {
321             // Disable and reset to try recovering
322             deviceError();
323         }
324     }
325 
326     return status;
327 }
328 
329 // Called when safe timer expires to determine if OCCs need to be reset
safeStateDelayExpired()330 void Status::safeStateDelayExpired()
331 {
332     if (this->occActive())
333     {
334         log<level::INFO>(
335             std::format(
336                 "safeStateDelayExpired: OCC{} is in SAFE state, requesting reset",
337                 instance)
338                 .c_str());
339         // Disable and reset to try recovering
340         deviceError(Error::Descriptor(SAFE_ERROR_PATH));
341     }
342 }
343 #endif // POWER10
344 
getHwmonPath()345 fs::path Status::getHwmonPath()
346 {
347     using namespace std::literals::string_literals;
348 
349     if (!fs::exists(hwmonPath))
350     {
351         static bool tracedFail[8] = {0};
352 
353         if (!hwmonPath.empty())
354         {
355             log<level::ERR>(
356                 std::format("Status::getHwmonPath(): path no longer exists: {}",
357                             hwmonPath.c_str())
358                     .c_str());
359             hwmonPath.clear();
360         }
361 
362         // Build the base HWMON path
363         fs::path prefixPath = fs::path{OCC_HWMON_PATH + "occ-hwmon."s +
364                                        std::to_string(instance + 1) +
365                                        "/hwmon/"s};
366 
367         // Get the hwmonXX directory name
368         try
369         {
370             // there should only be one directory
371             const int numDirs = std::distance(
372                 fs::directory_iterator(prefixPath), fs::directory_iterator{});
373             if (numDirs == 1)
374             {
375                 hwmonPath = *fs::directory_iterator(prefixPath);
376                 tracedFail[instance] = false;
377             }
378             else
379             {
380                 if (!tracedFail[instance])
381                 {
382                     log<level::ERR>(
383                         std::format(
384                             "Status::getHwmonPath(): Found multiple ({}) hwmon paths!",
385                             numDirs)
386                             .c_str());
387                     tracedFail[instance] = true;
388                 }
389             }
390         }
391         catch (const fs::filesystem_error& e)
392         {
393             if (!tracedFail[instance])
394             {
395                 log<level::ERR>(
396                     std::format(
397                         "Status::getHwmonPath(): error accessing {}: {}",
398                         prefixPath.c_str(), e.what())
399                         .c_str());
400                 tracedFail[instance] = true;
401             }
402         }
403     }
404 
405     return hwmonPath;
406 }
407 
408 // Called to read state and upon failure to read after occReadStateFailTimer.
occReadStateNow()409 void Status::occReadStateNow()
410 {
411     unsigned int state;
412     const fs::path filename =
413         fs::path(DEV_PATH) /
414         fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
415 
416     std::ifstream file;
417     bool goodFile = false;
418 
419     // open file.
420     file.open(filename, std::ios::in);
421     const int openErrno = errno;
422 
423     // File is open and state can be used.
424     if (file.is_open() && file.good())
425     {
426         goodFile = true;
427         file >> state;
428 
429         if (state != lastState)
430         {
431             // Trace OCC state changes
432             log<level::INFO>(
433                 std::format(
434                     "Status::readOccState: OCC{} state 0x{:02X} (lastState: 0x{:02X})",
435                     instance, state, lastState)
436                     .c_str());
437             lastState = state;
438 #ifdef POWER10
439             if (OccState(state) == OccState::ACTIVE)
440             {
441                 if (pmode && device.master())
442                 {
443                     // Set the master OCC on the PowerMode object
444                     pmode->setMasterOcc(path);
445                     // Enable mode changes
446                     pmode->setMasterActive();
447 
448                     // Special processing by master OCC when it goes active
449                     occsWentActive();
450                 }
451 
452                 CmdStatus status = sendAmbient();
453                 if (status != CmdStatus::SUCCESS)
454                 {
455                     log<level::ERR>(
456                         std::format(
457                             "readOccState: Sending Ambient failed with status {}",
458                             status)
459                             .c_str());
460                 }
461             }
462 
463             // If OCC in known Good State.
464             if ((OccState(state) == OccState::ACTIVE) ||
465                 (OccState(state) == OccState::CHARACTERIZATION) ||
466                 (OccState(state) == OccState::OBSERVATION))
467             {
468                 // Good OCC State then sensors valid again
469                 stateValid = true;
470 
471                 if (safeStateDelayTimer.isEnabled())
472                 {
473                     // stop safe delay timer (no longer in SAFE state)
474                     safeStateDelayTimer.setEnabled(false);
475                 }
476             }
477             // Else not Valid state We would be in SAFE mode.
478             // This captures both SAFE mode, and 0x00, or other invalid
479             // state values.
480             else
481             {
482                 if (!safeStateDelayTimer.isEnabled())
483                 {
484                     // start safe delay timer (before requesting reset)
485                     using namespace std::literals::chrono_literals;
486                     safeStateDelayTimer.restartOnce(60s);
487                 }
488                 // Not valid state, update sensors to Nan & not functional.
489                 stateValid = false;
490             }
491 #else
492             // Before P10 state not checked, only used good file open.
493             stateValid = true;
494 #endif
495         }
496     }
497     file.close();
498 
499     // if failed to Read a state or not a valid state -> Attempt retry
500     // after 1 Second delay if allowed.
501     if ((!goodFile) || (!stateValid))
502     {
503         if (!goodFile)
504         {
505             // If not able to read, OCC may be offline
506             log<level::ERR>(
507                 std::format("Status::readOccState: open failed (errno={})",
508                             openErrno)
509                     .c_str());
510         }
511         else
512         {
513             // else this failed due to state not valid.
514             if (state != lastState)
515             {
516                 log<level::ERR>(
517                     std::format(
518                         "Status::readOccState: OCC{} Invalid state 0x{:02X} (last state: 0x{:02X})",
519                         instance, state, lastState)
520                         .c_str());
521             }
522         }
523 
524 #ifdef READ_OCC_SENSORS
525         manager.setSensorValueToNaN(instance);
526 #endif
527 
528         // See occReadRetries for number of retry attempts.
529         if (currentOccReadRetriesCount > 0)
530         {
531             --currentOccReadRetriesCount;
532 #ifdef POWER10
533             using namespace std::chrono_literals;
534             occReadStateFailTimer.restartOnce(1s);
535 #endif
536         }
537         else
538         {
539 #ifdef POWER10
540             if (!stateValid && occActive())
541             {
542                 if (!safeStateDelayTimer.isEnabled())
543                 {
544                     log<level::ERR>(
545                         "Starting 60 sec delay timer before requesting a reset");
546                     // start safe delay timer (before requesting reset)
547                     using namespace std::literals::chrono_literals;
548                     safeStateDelayTimer.restartOnce(60s);
549                 }
550             }
551 #else
552             // State could not be determined, set it to NO State.
553             lastState = 0;
554 
555             // Disable the ability to send Failed actions until OCC is
556             // Active again.
557             stateValid = false;
558 
559             // Disable and reset to try recovering
560             deviceError();
561 #endif
562         }
563     }
564 }
565 
566 // Update processor throttle status on dbus
updateThrottle(const bool isThrottled,const uint8_t newReason)567 void Status::updateThrottle(const bool isThrottled, const uint8_t newReason)
568 {
569     if (!throttleHandle)
570     {
571         return;
572     }
573 
574     uint8_t newThrottleCause = throttleCause;
575 
576     if (isThrottled) // throttled due to newReason
577     {
578         if ((newReason & throttleCause) == 0)
579         {
580             // set the bit(s) for passed in reason
581             newThrottleCause |= newReason;
582         }
583         // else no change
584     }
585     else // no longer throttled due to newReason
586     {
587         if ((newReason & throttleCause) != 0)
588         {
589             // clear the bit(s) for passed in reason
590             newThrottleCause &= ~newReason;
591         }
592         // else no change
593     }
594 
595     if (newThrottleCause != throttleCause)
596     {
597         if (newThrottleCause == THROTTLED_NONE)
598         {
599             log<level::DEBUG>(
600                 std::format(
601                     "updateThrottle: OCC{} no longer throttled (prior reason: {})",
602                     instance, throttleCause)
603                     .c_str());
604             throttleCause = THROTTLED_NONE;
605             throttleHandle->throttled(false);
606             throttleHandle->throttleCauses({});
607         }
608         else
609         {
610             log<level::DEBUG>(
611                 std::format(
612                     "updateThrottle: OCC{} is throttled with reason {} (prior reason: {})",
613                     instance, newThrottleCause, throttleCause)
614                     .c_str());
615             throttleCause = newThrottleCause;
616 
617             std::vector<ThrottleObj::ThrottleReasons> updatedCauses;
618             if (throttleCause & THROTTLED_POWER)
619             {
620                 updatedCauses.push_back(
621                     throttleHandle->ThrottleReasons::PowerLimit);
622             }
623             if (throttleCause & THROTTLED_THERMAL)
624             {
625                 updatedCauses.push_back(
626                     throttleHandle->ThrottleReasons::ThermalLimit);
627             }
628             if (throttleCause & THROTTLED_SAFE)
629             {
630                 updatedCauses.push_back(
631                     throttleHandle->ThrottleReasons::ManagementDetectedFault);
632             }
633             throttleHandle->throttleCauses(updatedCauses);
634             throttleHandle->throttled(true);
635         }
636     }
637     // else no change to throttle status
638 }
639 
640 // Get processor path associated with this OCC
readProcAssociation()641 void Status::readProcAssociation()
642 {
643     std::string managingPath = path + "/power_managing";
644     log<level::DEBUG>(
645         std::format("readProcAssociation: getting endpoints for {} ({})",
646                     managingPath, path)
647             .c_str());
648     try
649     {
650         utils::PropertyValue procPathProperty{};
651         procPathProperty = utils::getProperty(
652             managingPath, "xyz.openbmc_project.Association", "endpoints");
653         auto result = std::get<std::vector<std::string>>(procPathProperty);
654         if (result.size() > 0)
655         {
656             procPath = result[0];
657             log<level::INFO>(
658                 std::format("readProcAssociation: OCC{} has proc={}", instance,
659                             procPath.c_str())
660                     .c_str());
661         }
662         else
663         {
664             log<level::ERR>(
665                 std::format(
666                     "readProcAssociation: No processor associated with OCC{} / {}",
667                     instance, path)
668                     .c_str());
669         }
670     }
671     catch (const sdbusplus::exception_t& e)
672     {
673         log<level::ERR>(
674             std::format(
675                 "readProcAssociation: Unable to get proc assocated with {} - {}",
676                 path, e.what())
677                 .c_str());
678         procPath = {};
679     }
680 }
681 
682 } // namespace occ
683 } // namespace open_power
684