1 #include "occ_status.hpp"
2 
3 #include "occ_manager.hpp"
4 #include "occ_sensor.hpp"
5 #include "powermode.hpp"
6 #include "utils.hpp"
7 
8 #include <phosphor-logging/log.hpp>
9 
10 #include <filesystem>
11 #include <format>
12 
13 namespace open_power
14 {
15 namespace occ
16 {
17 
18 using namespace phosphor::logging;
19 
20 using ThrottleObj =
21     sdbusplus::xyz::openbmc_project::Control::Power::server::Throttle;
22 
23 // Handles updates to occActive property
24 bool Status::occActive(bool value)
25 {
26     if (value != this->occActive())
27     {
28         log<level::INFO>(std::format("Status::occActive OCC{} changed to {}",
29                                      instance, value)
30                              .c_str());
31         if (value)
32         {
33             // Clear prior throttle reason (before setting device active)
34             updateThrottle(false, THROTTLED_ALL);
35 
36             // Set the device active
37             device.setActive(true);
38 
39             // Reset last OCC state
40             lastState = 0;
41 
42             // Start watching for errors (throttles, etc)
43             try
44             {
45                 addErrorWatch();
46             }
47             catch (const OpenFailure& e)
48             {
49                 // Failed to add watch for throttle events, request reset to try
50                 // to recover comm
51                 log<level::ERR>(
52                     std::format(
53                         "Status::occActive: Unable to add error watch(s) for OCC{} watch: {}",
54                         instance, e.what())
55                         .c_str());
56                 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
57                 return Base::Status::occActive(false);
58             }
59 
60             // Update the OCC active sensor
61             Base::Status::occActive(value);
62 
63             if (device.master())
64             {
65                 // Update powercap bounds from OCC
66                 manager.updatePcapBounds();
67             }
68 
69             // Call into Manager to let know that we have bound
70             if (this->managerCallBack)
71             {
72                 this->managerCallBack(instance, value);
73             }
74         }
75         else
76         {
77 #ifdef POWER10
78             if (pmode && device.master())
79             {
80                 // Prevent mode changes
81                 pmode->setMasterActive(false);
82             }
83             if (safeStateDelayTimer.isEnabled())
84             {
85                 // stop safe delay timer
86                 safeStateDelayTimer.setEnabled(false);
87             }
88 #endif
89             // Call into Manager to let know that we will unbind.
90             if (this->managerCallBack)
91             {
92                 this->managerCallBack(instance, value);
93             }
94 
95             // Stop watching for errors
96             removeErrorWatch();
97 
98             // Set the device inactive
99             device.setActive(false);
100 
101             // Clear throttles (OCC not active after disabling device)
102             updateThrottle(false, THROTTLED_ALL);
103         }
104     }
105     else if (value && !device.active())
106     {
107         // Existing error watch is on a dead file descriptor.
108         removeErrorWatch();
109 
110         /*
111          * In it's constructor, Status checks Device::bound() to see if OCC is
112          * active or not.
113          * Device::bound() checks for occX-dev0 directory.
114          * We will lose occX-dev0 directories during FSI rescan.
115          * So, if we start this application (and construct Status), and then
116          * later do FSI rescan, we will end up with occActive = true and device
117          * NOT bound. Lets correct that situation here.
118          */
119         device.setActive(true);
120 
121         // Add error watch again
122         try
123         {
124             addErrorWatch();
125         }
126         catch (const OpenFailure& e)
127         {
128             // Failed to add watch for throttle events, request reset to try to
129             // recover comm
130             log<level::ERR>(
131                 std::format(
132                     "Status::occActive: Unable to add error watch(s) again for OCC{} watch: {}",
133                     instance, e.what())
134                     .c_str());
135             deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
136             return Base::Status::occActive(false);
137         }
138     }
139     else if (!value && device.active())
140     {
141         removeErrorWatch();
142 
143         // In the event that the application never receives the active signal
144         // even though the OCC is active (this can occur if the BMC is rebooted
145         // with the host on, since the initial OCC driver probe will discover
146         // the OCCs), this application needs to be able to unbind the device
147         // when we get the OCC inactive signal.
148         device.setActive(false);
149     }
150     return Base::Status::occActive(value);
151 }
152 
153 // Callback handler when a device error is reported.
154 void Status::deviceError(Error::Descriptor d)
155 {
156 #ifdef POWER10
157     if (pmode && device.master())
158     {
159         // Prevent mode changes
160         pmode->setMasterActive(false);
161     }
162 #endif
163 
164     if (d.log)
165     {
166         FFDC::createOCCResetPEL(instance, d.path, d.err, d.callout);
167     }
168 
169     // This would deem OCC inactive
170     this->occActive(false);
171 
172     // Reset the OCC
173     this->resetOCC();
174 }
175 
176 // Sends message to host control command handler to reset OCC
177 void Status::resetOCC()
178 {
179     log<level::INFO>(
180         std::format(">>Status::resetOCC() - requesting reset for OCC{}",
181                     instance)
182             .c_str());
183     this->occActive(false);
184 #ifdef PLDM
185     if (resetCallBack)
186     {
187         this->resetCallBack(instance);
188     }
189 #else
190     constexpr auto CONTROL_HOST_PATH = "/org/open_power/control/host0";
191     constexpr auto CONTROL_HOST_INTF = "org.open_power.Control.Host";
192 
193     // This will throw exception on failure
194     auto service = utils::getService(CONTROL_HOST_PATH, CONTROL_HOST_INTF);
195 
196     auto& bus = utils::getBus();
197     auto method = bus.new_method_call(service.c_str(), CONTROL_HOST_PATH,
198                                       CONTROL_HOST_INTF, "Execute");
199     // OCC Reset control command
200     method.append(convertForMessage(Control::Host::Command::OCCReset).c_str());
201 
202     // OCC Sensor ID for callout reasons
203     method.append(std::variant<uint8_t>(std::get<0>(sensorMap.at(instance))));
204     bus.call_noreply(method);
205     return;
206 #endif
207 }
208 
209 // Handler called by Host control command handler to convey the
210 // status of the executed command
211 void Status::hostControlEvent(sdbusplus::message_t& msg)
212 {
213     std::string cmdCompleted{};
214     std::string cmdStatus{};
215 
216     msg.read(cmdCompleted, cmdStatus);
217 
218     log<level::DEBUG>("Host control signal values",
219                       entry("COMMAND=%s", cmdCompleted.c_str()),
220                       entry("STATUS=%s", cmdStatus.c_str()));
221 
222     if (Control::Host::convertResultFromString(cmdStatus) !=
223         Control::Host::Result::Success)
224     {
225         if (Control::Host::convertCommandFromString(cmdCompleted) ==
226             Control::Host::Command::OCCReset)
227         {
228             // Must be a Timeout. Log an Error trace
229             log<level::ERR>(
230                 "Error resetting the OCC.", entry("PATH=%s", path.c_str()),
231                 entry("SENSORID=0x%X", std::get<0>(sensorMap.at(instance))));
232         }
233     }
234     return;
235 }
236 
237 // Called from Manager::pollerTimerExpired() in preperation to POLL OCC.
238 void Status::readOccState()
239 {
240     if (stateValid)
241     {
242         // Reset retry count (since state is good)
243         currentOccReadRetriesCount = occReadRetries;
244     }
245     occReadStateNow();
246 }
247 
248 #ifdef POWER10
249 // Special processing that needs to happen once the OCCs change to ACTIVE state
250 void Status::occsWentActive()
251 {
252     CmdStatus status = CmdStatus::SUCCESS;
253 
254     status = pmode->sendModeChange();
255     if (status != CmdStatus::SUCCESS)
256     {
257         log<level::ERR>(
258             std::format(
259                 "Status::occsWentActive: OCC mode change failed with status {}",
260                 status)
261                 .c_str());
262 
263         // Disable and reset to try recovering
264         deviceError();
265     }
266 
267     status = pmode->sendIpsData();
268     if (status != CmdStatus::SUCCESS)
269     {
270         log<level::ERR>(
271             std::format(
272                 "Status::occsWentActive: Sending Idle Power Save Config data failed with status {}",
273                 status)
274                 .c_str());
275 
276         if (status == CmdStatus::COMM_FAILURE)
277         {
278             // Disable and reset to try recovering
279             deviceError();
280         }
281     }
282 }
283 
284 // Send Ambient and Altitude to the OCC
285 CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude)
286 {
287     CmdStatus status = CmdStatus::FAILURE;
288     bool ambientValid = true;
289     uint8_t ambientTemp = inTemp;
290     uint16_t altitude = inAltitude;
291 
292     if (ambientTemp == 0xFF)
293     {
294         // Get latest readings from manager
295         manager.getAmbientData(ambientValid, ambientTemp, altitude);
296         log<level::DEBUG>(
297             std::format("sendAmbient: valid: {}, Ambient: {}C, altitude: {}m",
298                         ambientValid, ambientTemp, altitude)
299                 .c_str());
300     }
301 
302     std::vector<std::uint8_t> cmd, rsp;
303     cmd.reserve(11);
304     cmd.push_back(uint8_t(CmdType::SEND_AMBIENT));
305     cmd.push_back(0x00);                    // Data Length (2 bytes)
306     cmd.push_back(0x08);                    //
307     cmd.push_back(0x00);                    // Version
308     cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status
309     cmd.push_back(ambientTemp);             // Ambient Temperature
310     cmd.push_back(altitude >> 8);           // Altitude in meters (2 bytes)
311     cmd.push_back(altitude & 0xFF);         //
312     cmd.push_back(0x00);                    // Reserved (3 bytes)
313     cmd.push_back(0x00);
314     cmd.push_back(0x00);
315     log<level::DEBUG>(std::format("sendAmbient: SEND_AMBIENT "
316                                   "command to OCC{} ({} bytes)",
317                                   instance, cmd.size())
318                           .c_str());
319     status = occCmd.send(cmd, rsp);
320     if (status == CmdStatus::SUCCESS)
321     {
322         if (rsp.size() == 5)
323         {
324             if (RspStatus::SUCCESS != RspStatus(rsp[2]))
325             {
326                 log<level::ERR>(
327                     std::format(
328                         "sendAmbient: SEND_AMBIENT failed with rspStatus 0x{:02X}",
329                         rsp[2])
330                         .c_str());
331                 dump_hex(rsp);
332                 status = CmdStatus::FAILURE;
333             }
334         }
335         else
336         {
337             log<level::ERR>(
338                 std::format(
339                     "sendAmbient: INVALID SEND_AMBIENT response length:{}",
340                     rsp.size())
341                     .c_str());
342             dump_hex(rsp);
343             status = CmdStatus::FAILURE;
344         }
345     }
346     else
347     {
348         log<level::ERR>(
349             std::format(
350                 "sendAmbient: SEND_AMBIENT FAILED! with status 0x{:02X}",
351                 status)
352                 .c_str());
353 
354         if (status == CmdStatus::COMM_FAILURE)
355         {
356             // Disable due to OCC comm failure and reset to try recovering
357             deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
358         }
359     }
360 
361     return status;
362 }
363 
364 // Called when safe timer expires to determine if OCCs need to be reset
365 void Status::safeStateDelayExpired()
366 {
367     if (this->occActive())
368     {
369         log<level::INFO>(
370             std::format(
371                 "safeStateDelayExpired: OCC{} state missing or not valid, requesting reset",
372                 instance)
373                 .c_str());
374         // Disable and reset to try recovering
375         deviceError(Error::Descriptor(SAFE_ERROR_PATH));
376     }
377 }
378 #endif // POWER10
379 
380 fs::path Status::getHwmonPath()
381 {
382     using namespace std::literals::string_literals;
383 
384     if (!fs::exists(hwmonPath))
385     {
386         static bool tracedFail[8] = {0};
387 
388         if (!hwmonPath.empty())
389         {
390             log<level::WARNING>(
391                 std::format("Status::getHwmonPath(): path no longer exists: {}",
392                             hwmonPath.c_str())
393                     .c_str());
394             hwmonPath.clear();
395         }
396 
397         // Build the base HWMON path
398         fs::path prefixPath =
399             fs::path{OCC_HWMON_PATH + "occ-hwmon."s +
400                      std::to_string(instance + 1) + "/hwmon/"s};
401 
402         // Get the hwmonXX directory name
403         try
404         {
405             // there should only be one directory
406             const int numDirs = std::distance(
407                 fs::directory_iterator(prefixPath), fs::directory_iterator{});
408             if (numDirs == 1)
409             {
410                 hwmonPath = *fs::directory_iterator(prefixPath);
411                 tracedFail[instance] = false;
412             }
413             else
414             {
415                 if (!tracedFail[instance])
416                 {
417                     log<level::ERR>(
418                         std::format(
419                             "Status::getHwmonPath(): Found multiple ({}) hwmon paths!",
420                             numDirs)
421                             .c_str());
422                     tracedFail[instance] = true;
423                 }
424             }
425         }
426         catch (const fs::filesystem_error& e)
427         {
428             if (!tracedFail[instance])
429             {
430                 log<level::ERR>(
431                     std::format(
432                         "Status::getHwmonPath(): error accessing {}: {}",
433                         prefixPath.c_str(), e.what())
434                         .c_str());
435                 tracedFail[instance] = true;
436             }
437         }
438     }
439 
440     return hwmonPath;
441 }
442 
443 // Called to read state and handle any errors
444 void Status::occReadStateNow()
445 {
446     unsigned int state;
447     const fs::path filename =
448         fs::path(DEV_PATH) /
449         fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
450 
451     std::ifstream file;
452     bool goodFile = false;
453 
454     // open file.
455     file.open(filename, std::ios::in);
456     const int openErrno = errno;
457 
458     // File is open and state can be used.
459     if (file.is_open() && file.good())
460     {
461         goodFile = true;
462         file >> state;
463         // Read the error code (if any) to check status of the read
464         std::ios_base::iostate readState = file.rdstate();
465         if (readState)
466         {
467             // There was a failure reading the file
468             if (lastOccReadStatus != -1)
469             {
470                 // Trace error bits
471                 std::string errorBits = "";
472                 if (readState & std::ios_base::eofbit)
473                 {
474                     errorBits += " EOF";
475                 }
476                 if (readState & std::ios_base::failbit)
477                 {
478                     errorBits += " failbit";
479                 }
480                 if (readState & std::ios_base::badbit)
481                 {
482                     errorBits += " badbit";
483                 }
484                 log<level::ERR>(
485                     std::format(
486                         "readOccState: Failed to read OCC{} state: Read error on I/O operation -{}",
487                         instance, errorBits)
488                         .c_str());
489                 lastOccReadStatus = -1;
490             }
491             goodFile = false;
492         }
493 
494         if (goodFile && (state != lastState))
495         {
496             // Trace OCC state changes
497             log<level::INFO>(
498                 std::format(
499                     "Status::readOccState: OCC{} state 0x{:02X} (lastState: 0x{:02X})",
500                     instance, state, lastState)
501                     .c_str());
502             lastState = state;
503 #ifdef POWER10
504             if (OccState(state) == OccState::ACTIVE)
505             {
506                 if (pmode && device.master())
507                 {
508                     // Set the master OCC on the PowerMode object
509                     pmode->setMasterOcc(path);
510                     // Enable mode changes
511                     pmode->setMasterActive();
512 
513                     // Special processing by master OCC when it goes active
514                     occsWentActive();
515                 }
516 
517                 CmdStatus status = sendAmbient();
518                 if (status != CmdStatus::SUCCESS)
519                 {
520                     log<level::ERR>(
521                         std::format(
522                             "readOccState: Sending Ambient failed with status {}",
523                             status)
524                             .c_str());
525                 }
526             }
527 
528             // If OCC in known Good State.
529             if ((OccState(state) == OccState::ACTIVE) ||
530                 (OccState(state) == OccState::CHARACTERIZATION) ||
531                 (OccState(state) == OccState::OBSERVATION))
532             {
533                 // Good OCC State then sensors valid again
534                 stateValid = true;
535 
536                 if (safeStateDelayTimer.isEnabled())
537                 {
538                     // stop safe delay timer (no longer in SAFE state)
539                     safeStateDelayTimer.setEnabled(false);
540                 }
541             }
542             else
543             {
544                 // OCC is in SAFE or some other unsupported state
545                 if (!safeStateDelayTimer.isEnabled())
546                 {
547                     log<level::ERR>(
548                         std::format(
549                             "readOccState: Invalid OCC{} state of {}, starting safe state delay timer",
550                             instance, state)
551                             .c_str());
552                     // start safe delay timer (before requesting reset)
553                     using namespace std::literals::chrono_literals;
554                     safeStateDelayTimer.restartOnce(60s);
555                 }
556                 // Not a supported state (update sensors to NaN and not
557                 // functional)
558                 stateValid = false;
559             }
560 #else
561             // Before P10 state not checked, only used good file open.
562             stateValid = true;
563 #endif
564         }
565     }
566 #ifdef POWER10
567     else
568     {
569         // Unable to read state
570         stateValid = false;
571     }
572 #endif
573     file.close();
574 
575     // if failed to Read a state or not a valid state -> Attempt retry
576     // after 1 Second delay if allowed.
577     if ((!goodFile) || (!stateValid))
578     {
579         if (!goodFile)
580         {
581             // If not able to read, OCC may be offline
582             if (openErrno != lastOccReadStatus)
583             {
584                 log<level::ERR>(
585                     std::format(
586                         "Status::readOccState: open/read failed trying to read OCC{} state (open errno={})",
587                         instance, openErrno)
588                         .c_str());
589                 lastOccReadStatus = openErrno;
590             }
591         }
592         else
593         {
594             // else this failed due to state not valid.
595             if (state != lastState)
596             {
597                 log<level::ERR>(
598                     std::format(
599                         "Status::readOccState: OCC{} Invalid state 0x{:02X} (last state: 0x{:02X})",
600                         instance, state, lastState)
601                         .c_str());
602             }
603         }
604 
605 #ifdef READ_OCC_SENSORS
606         manager.setSensorValueToNaN(instance);
607 #endif
608 
609         // See occReadRetries for number of retry attempts.
610         if (currentOccReadRetriesCount > 0)
611         {
612             --currentOccReadRetriesCount;
613         }
614         else
615         {
616             log<level::ERR>(
617                 std::format("readOccState: failed to read OCC{} state!",
618                             instance)
619                     .c_str());
620 
621             // State could not be determined, set it to NO State.
622             lastState = 0;
623 
624             // Disable the ability to send Failed actions until OCC is
625             // Active again.
626             stateValid = false;
627 
628             // Disable due to OCC comm failure and reset to try recovering
629             deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
630 
631             // Reset retry count (for next attempt after recovery)
632             currentOccReadRetriesCount = occReadRetries;
633         }
634     }
635     else
636     {
637         if (lastOccReadStatus != 0)
638         {
639             log<level::INFO>(
640                 std::format(
641                     "Status::readOccState: successfully read OCC{} state: {}",
642                     instance, state)
643                     .c_str());
644             lastOccReadStatus = 0; // no error
645         }
646     }
647 }
648 
649 // Update processor throttle status on dbus
650 void Status::updateThrottle(const bool isThrottled, const uint8_t newReason)
651 {
652     if (!throttleHandle)
653     {
654         return;
655     }
656 
657     uint8_t newThrottleCause = throttleCause;
658 
659     if (isThrottled) // throttled due to newReason
660     {
661         if ((newReason & throttleCause) == 0)
662         {
663             // set the bit(s) for passed in reason
664             newThrottleCause |= newReason;
665         }
666         // else no change
667     }
668     else // no longer throttled due to newReason
669     {
670         if ((newReason & throttleCause) != 0)
671         {
672             // clear the bit(s) for passed in reason
673             newThrottleCause &= ~newReason;
674         }
675         // else no change
676     }
677 
678     if (newThrottleCause != throttleCause)
679     {
680         if (newThrottleCause == THROTTLED_NONE)
681         {
682             log<level::DEBUG>(
683                 std::format(
684                     "updateThrottle: OCC{} no longer throttled (prior reason: {})",
685                     instance, throttleCause)
686                     .c_str());
687             throttleCause = THROTTLED_NONE;
688             throttleHandle->throttled(false);
689             throttleHandle->throttleCauses({});
690         }
691         else
692         {
693             log<level::DEBUG>(
694                 std::format(
695                     "updateThrottle: OCC{} is throttled with reason {} (prior reason: {})",
696                     instance, newThrottleCause, throttleCause)
697                     .c_str());
698             throttleCause = newThrottleCause;
699 
700             std::vector<ThrottleObj::ThrottleReasons> updatedCauses;
701             if (throttleCause & THROTTLED_POWER)
702             {
703                 updatedCauses.push_back(
704                     throttleHandle->ThrottleReasons::PowerLimit);
705             }
706             if (throttleCause & THROTTLED_THERMAL)
707             {
708                 updatedCauses.push_back(
709                     throttleHandle->ThrottleReasons::ThermalLimit);
710             }
711             if (throttleCause & THROTTLED_SAFE)
712             {
713                 updatedCauses.push_back(
714                     throttleHandle->ThrottleReasons::ManagementDetectedFault);
715             }
716             throttleHandle->throttleCauses(updatedCauses);
717             throttleHandle->throttled(true);
718         }
719     }
720     // else no change to throttle status
721 }
722 
723 // Get processor path associated with this OCC
724 void Status::readProcAssociation()
725 {
726     std::string managingPath = path + "/power_managing";
727     log<level::DEBUG>(
728         std::format("readProcAssociation: getting endpoints for {} ({})",
729                     managingPath, path)
730             .c_str());
731     try
732     {
733         utils::PropertyValue procPathProperty{};
734         procPathProperty = utils::getProperty(
735             managingPath, "xyz.openbmc_project.Association", "endpoints");
736         auto result = std::get<std::vector<std::string>>(procPathProperty);
737         if (result.size() > 0)
738         {
739             procPath = result[0];
740             log<level::INFO>(
741                 std::format("readProcAssociation: OCC{} has proc={}", instance,
742                             procPath.c_str())
743                     .c_str());
744         }
745         else
746         {
747             log<level::ERR>(
748                 std::format(
749                     "readProcAssociation: No processor associated with OCC{} / {}",
750                     instance, path)
751                     .c_str());
752         }
753     }
754     catch (const sdbusplus::exception_t& e)
755     {
756         log<level::ERR>(
757             std::format(
758                 "readProcAssociation: Unable to get proc assocated with {} - {}",
759                 path, e.what())
760                 .c_str());
761         procPath = {};
762     }
763 }
764 
765 } // namespace occ
766 } // namespace open_power
767