1 #include "occ_status.hpp"
2 
3 #include "occ_manager.hpp"
4 #include "occ_sensor.hpp"
5 #include "powermode.hpp"
6 #include "utils.hpp"
7 
8 #include <fmt/core.h>
9 
10 #include <phosphor-logging/log.hpp>
11 
12 #include <filesystem>
13 
14 namespace open_power
15 {
16 namespace occ
17 {
18 
19 using namespace phosphor::logging;
20 
21 using ThrottleObj =
22     sdbusplus::xyz::openbmc_project::Control::Power::server::Throttle;
23 
24 // Handles updates to occActive property
25 bool Status::occActive(bool value)
26 {
27     if (value != this->occActive())
28     {
29         log<level::INFO>(fmt::format("Status::occActive OCC{} changed to {}",
30                                      instance, value)
31                              .c_str());
32         if (value)
33         {
34             // Clear prior throttle reason (before setting device active)
35             updateThrottle(false, THROTTLED_ALL);
36 
37             // Set the device active
38             device.setActive(true);
39 
40             // Update the OCC active sensor
41             Base::Status::occActive(value);
42 
43             // Start watching for errors
44             addErrorWatch();
45 
46             // Reset last OCC state
47             lastState = 0;
48 
49             if (device.master())
50             {
51                 // Update powercap bounds from OCC
52                 manager.updatePcapBounds();
53             }
54 
55             // Call into Manager to let know that we have bound
56             if (this->managerCallBack)
57             {
58                 this->managerCallBack(instance, value);
59             }
60         }
61         else
62         {
63 #ifdef POWER10
64             if (pmode && device.master())
65             {
66                 // Prevent mode changes
67                 pmode->setMasterActive(false);
68             }
69             if (safeStateDelayTimer.isEnabled())
70             {
71                 // stop safe delay timer
72                 safeStateDelayTimer.setEnabled(false);
73             }
74 #endif
75             // Call into Manager to let know that we will unbind.
76             if (this->managerCallBack)
77             {
78                 this->managerCallBack(instance, value);
79             }
80 
81             // Stop watching for errors
82             removeErrorWatch();
83 
84             // Set the device inactive
85             device.setActive(false);
86 
87             // Clear throttles (OCC not active after disabling device)
88             updateThrottle(false, THROTTLED_ALL);
89         }
90     }
91     else if (value && !device.active())
92     {
93         // Existing error watch is on a dead file descriptor.
94         removeErrorWatch();
95 
96         /*
97          * In it's constructor, Status checks Device::bound() to see if OCC is
98          * active or not.
99          * Device::bound() checks for occX-dev0 directory.
100          * We will lose occX-dev0 directories during FSI rescan.
101          * So, if we start this application (and construct Status), and then
102          * later do FSI rescan, we will end up with occActive = true and device
103          * NOT bound. Lets correct that situation here.
104          */
105         device.setActive(true);
106 
107         // Add error watch again
108         addErrorWatch();
109     }
110     else if (!value && device.active())
111     {
112         removeErrorWatch();
113 
114         // In the event that the application never receives the active signal
115         // even though the OCC is active (this can occur if the BMC is rebooted
116         // with the host on, since the initial OCC driver probe will discover
117         // the OCCs), this application needs to be able to unbind the device
118         // when we get the OCC inactive signal.
119         device.setActive(false);
120     }
121     return Base::Status::occActive(value);
122 }
123 
124 // Callback handler when a device error is reported.
125 void Status::deviceError(Error::Descriptor d)
126 {
127 #ifdef POWER10
128     if (pmode && device.master())
129     {
130         // Prevent mode changes
131         pmode->setMasterActive(false);
132     }
133 #endif
134 
135     if (d.log)
136     {
137         FFDC::createOCCResetPEL(instance, d.path, d.err, d.callout);
138     }
139 
140     // This would deem OCC inactive
141     this->occActive(false);
142 
143     // Reset the OCC
144     this->resetOCC();
145 }
146 
147 // Sends message to host control command handler to reset OCC
148 void Status::resetOCC()
149 {
150     log<level::INFO>(
151         fmt::format(">>Status::resetOCC() - requesting reset for OCC{}",
152                     instance)
153             .c_str());
154 #ifdef PLDM
155     if (resetCallBack)
156     {
157         this->resetCallBack(instance);
158     }
159 #else
160     constexpr auto CONTROL_HOST_PATH = "/org/open_power/control/host0";
161     constexpr auto CONTROL_HOST_INTF = "org.open_power.Control.Host";
162 
163     // This will throw exception on failure
164     auto service = utils::getService(CONTROL_HOST_PATH, CONTROL_HOST_INTF);
165 
166     auto& bus = utils::getBus();
167     auto method = bus.new_method_call(service.c_str(), CONTROL_HOST_PATH,
168                                       CONTROL_HOST_INTF, "Execute");
169     // OCC Reset control command
170     method.append(convertForMessage(Control::Host::Command::OCCReset).c_str());
171 
172     // OCC Sensor ID for callout reasons
173     method.append(std::variant<uint8_t>(std::get<0>(sensorMap.at(instance))));
174     bus.call_noreply(method);
175     return;
176 #endif
177 }
178 
179 // Handler called by Host control command handler to convey the
180 // status of the executed command
181 void Status::hostControlEvent(sdbusplus::message_t& msg)
182 {
183     std::string cmdCompleted{};
184     std::string cmdStatus{};
185 
186     msg.read(cmdCompleted, cmdStatus);
187 
188     log<level::DEBUG>("Host control signal values",
189                       entry("COMMAND=%s", cmdCompleted.c_str()),
190                       entry("STATUS=%s", cmdStatus.c_str()));
191 
192     if (Control::Host::convertResultFromString(cmdStatus) !=
193         Control::Host::Result::Success)
194     {
195         if (Control::Host::convertCommandFromString(cmdCompleted) ==
196             Control::Host::Command::OCCReset)
197         {
198             // Must be a Timeout. Log an Error trace
199             log<level::ERR>(
200                 "Error resetting the OCC.", entry("PATH=%s", path.c_str()),
201                 entry("SENSORID=0x%X", std::get<0>(sensorMap.at(instance))));
202         }
203     }
204     return;
205 }
206 
207 // Called from Manager::pollerTimerExpired() in preperation to POLL OCC.
208 void Status::readOccState()
209 {
210     currentOccReadRetriesCount = occReadRetries;
211     occReadStateNow();
212 }
213 
214 #ifdef POWER10
215 // Special processing that needs to happen once the OCCs change to ACTIVE state
216 void Status::occsWentActive()
217 {
218     CmdStatus status = CmdStatus::SUCCESS;
219 
220     status = pmode->sendModeChange();
221     if (status != CmdStatus::SUCCESS)
222     {
223         log<level::ERR>(
224             fmt::format(
225                 "Status::occsWentActive: OCC mode change failed with status {}",
226                 status)
227                 .c_str());
228 
229         // Disable and reset to try recovering
230         deviceError();
231     }
232 
233     status = pmode->sendIpsData();
234     if (status != CmdStatus::SUCCESS)
235     {
236         log<level::ERR>(
237             fmt::format(
238                 "Status::occsWentActive: Sending Idle Power Save Config data failed with status {}",
239                 status)
240                 .c_str());
241 
242         if (status == CmdStatus::COMM_FAILURE)
243         {
244             // Disable and reset to try recovering
245             deviceError();
246         }
247     }
248 }
249 
250 // Send Ambient and Altitude to the OCC
251 CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude)
252 {
253     CmdStatus status = CmdStatus::FAILURE;
254     bool ambientValid = true;
255     uint8_t ambientTemp = inTemp;
256     uint16_t altitude = inAltitude;
257 
258     if (ambientTemp == 0xFF)
259     {
260         // Get latest readings from manager
261         manager.getAmbientData(ambientValid, ambientTemp, altitude);
262         log<level::DEBUG>(
263             fmt::format("sendAmbient: valid: {}, Ambient: {}C, altitude: {}m",
264                         ambientValid, ambientTemp, altitude)
265                 .c_str());
266     }
267 
268     std::vector<std::uint8_t> cmd, rsp;
269     cmd.reserve(11);
270     cmd.push_back(uint8_t(CmdType::SEND_AMBIENT));
271     cmd.push_back(0x00);                    // Data Length (2 bytes)
272     cmd.push_back(0x08);                    //
273     cmd.push_back(0x00);                    // Version
274     cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status
275     cmd.push_back(ambientTemp);             // Ambient Temperature
276     cmd.push_back(altitude >> 8);           // Altitude in meters (2 bytes)
277     cmd.push_back(altitude & 0xFF);         //
278     cmd.push_back(0x00);                    // Reserved (3 bytes)
279     cmd.push_back(0x00);
280     cmd.push_back(0x00);
281     log<level::DEBUG>(fmt::format("sendAmbient: SEND_AMBIENT "
282                                   "command to OCC{} ({} bytes)",
283                                   instance, cmd.size())
284                           .c_str());
285     status = occCmd.send(cmd, rsp);
286     if (status == CmdStatus::SUCCESS)
287     {
288         if (rsp.size() == 5)
289         {
290             if (RspStatus::SUCCESS != RspStatus(rsp[2]))
291             {
292                 log<level::ERR>(
293                     fmt::format(
294                         "sendAmbient: SEND_AMBIENT failed with rspStatus 0x{:02X}",
295                         rsp[2])
296                         .c_str());
297                 dump_hex(rsp);
298                 status = CmdStatus::FAILURE;
299             }
300         }
301         else
302         {
303             log<level::ERR>(
304                 fmt::format(
305                     "sendAmbient: INVALID SEND_AMBIENT response length:{}",
306                     rsp.size())
307                     .c_str());
308             dump_hex(rsp);
309             status = CmdStatus::FAILURE;
310         }
311     }
312     else
313     {
314         log<level::ERR>(
315             fmt::format(
316                 "sendAmbient: SEND_AMBIENT FAILED! with status 0x{:02X}",
317                 status)
318                 .c_str());
319 
320         if (status == CmdStatus::COMM_FAILURE)
321         {
322             // Disable and reset to try recovering
323             deviceError();
324         }
325     }
326 
327     return status;
328 }
329 
330 // Called when safe timer expires to determine if OCCs need to be reset
331 void Status::safeStateDelayExpired()
332 {
333     if (this->occActive())
334     {
335         log<level::INFO>(
336             fmt::format(
337                 "safeStateDelayExpired: OCC{} is in SAFE state, requesting reset",
338                 instance)
339                 .c_str());
340         // Disable and reset to try recovering
341         deviceError(Error::Descriptor(SAFE_ERROR_PATH));
342     }
343 }
344 #endif // POWER10
345 
346 fs::path Status::getHwmonPath()
347 {
348     using namespace std::literals::string_literals;
349 
350     if (!fs::exists(hwmonPath))
351     {
352         static bool tracedFail[8] = {0};
353 
354         if (!hwmonPath.empty())
355         {
356             log<level::ERR>(
357                 fmt::format("Status::getHwmonPath(): path no longer exists: {}",
358                             hwmonPath.c_str())
359                     .c_str());
360             hwmonPath.clear();
361         }
362 
363         // Build the base HWMON path
364         fs::path prefixPath = fs::path{OCC_HWMON_PATH + "occ-hwmon."s +
365                                        std::to_string(instance + 1) +
366                                        "/hwmon/"s};
367 
368         // Get the hwmonXX directory name
369         try
370         {
371             // there should only be one directory
372             const int numDirs = std::distance(
373                 fs::directory_iterator(prefixPath), fs::directory_iterator{});
374             if (numDirs == 1)
375             {
376                 hwmonPath = *fs::directory_iterator(prefixPath);
377                 tracedFail[instance] = false;
378             }
379             else
380             {
381                 if (!tracedFail[instance])
382                 {
383                     log<level::ERR>(
384                         fmt::format(
385                             "Status::getHwmonPath(): Found multiple ({}) hwmon paths!",
386                             numDirs)
387                             .c_str());
388                     tracedFail[instance] = true;
389                 }
390             }
391         }
392         catch (const fs::filesystem_error& e)
393         {
394             if (!tracedFail[instance])
395             {
396                 log<level::ERR>(
397                     fmt::format(
398                         "Status::getHwmonPath(): error accessing {}: {}",
399                         prefixPath.c_str(), e.what())
400                         .c_str());
401                 tracedFail[instance] = true;
402             }
403         }
404     }
405 
406     return hwmonPath;
407 }
408 
409 // Called to read state and upon failure to read after occReadStateFailTimer.
410 void Status::occReadStateNow()
411 {
412     unsigned int state;
413     const fs::path filename =
414         fs::path(DEV_PATH) /
415         fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
416 
417     std::ifstream file;
418     bool goodFile = false;
419 
420     // open file.
421     file.open(filename, std::ios::in);
422     const int openErrno = errno;
423 
424     // File is open and state can be used.
425     if (file.is_open() && file.good())
426     {
427         goodFile = true;
428         file >> state;
429 
430         if (state != lastState)
431         {
432             // Trace OCC state changes
433             log<level::INFO>(
434                 fmt::format(
435                     "Status::readOccState: OCC{} state 0x{:02X} (lastState: 0x{:02X})",
436                     instance, state, lastState)
437                     .c_str());
438             lastState = state;
439 #ifdef POWER10
440             if (OccState(state) == OccState::ACTIVE)
441             {
442                 if (pmode && device.master())
443                 {
444                     // Set the master OCC on the PowerMode object
445                     pmode->setMasterOcc(path);
446                     // Enable mode changes
447                     pmode->setMasterActive();
448 
449                     // Special processing by master OCC when it goes active
450                     occsWentActive();
451                 }
452 
453                 CmdStatus status = sendAmbient();
454                 if (status != CmdStatus::SUCCESS)
455                 {
456                     log<level::ERR>(
457                         fmt::format(
458                             "readOccState: Sending Ambient failed with status {}",
459                             status)
460                             .c_str());
461                 }
462             }
463 
464             // If OCC in known Good State.
465             if ((OccState(state) == OccState::ACTIVE) ||
466                 (OccState(state) == OccState::CHARACTERIZATION) ||
467                 (OccState(state) == OccState::OBSERVATION))
468             {
469                 // Good OCC State then sensors valid again
470                 stateValid = true;
471 
472                 if (safeStateDelayTimer.isEnabled())
473                 {
474                     // stop safe delay timer (no longer in SAFE state)
475                     safeStateDelayTimer.setEnabled(false);
476                 }
477             }
478             // Else not Valid state We would be in SAFE mode.
479             // This captures both SAFE mode, and 0x00, or other invalid
480             // state values.
481             else
482             {
483                 if (!safeStateDelayTimer.isEnabled())
484                 {
485                     // start safe delay timer (before requesting reset)
486                     using namespace std::literals::chrono_literals;
487                     safeStateDelayTimer.restartOnce(60s);
488                 }
489                 // Not valid state, update sensors to Nan & not functional.
490                 stateValid = false;
491             }
492 #else
493             // Before P10 state not checked, only used good file open.
494             stateValid = true;
495 #endif
496         }
497     }
498     file.close();
499 
500     // if failed to Read a state or not a valid state -> Attempt retry
501     // after 1 Second delay if allowed.
502     if ((!goodFile) || (!stateValid))
503     {
504         if (!goodFile)
505         {
506             // If not able to read, OCC may be offline
507             log<level::ERR>(
508                 fmt::format("Status::readOccState: open failed (errno={})",
509                             openErrno)
510                     .c_str());
511         }
512         else
513         {
514             // else this failed due to state not valid.
515             if (state != lastState)
516             {
517                 log<level::ERR>(
518                     fmt::format(
519                         "Status::readOccState: OCC{} Invalid state 0x{:02X} (last state: 0x{:02X})",
520                         instance, state, lastState)
521                         .c_str());
522             }
523         }
524 
525 #ifdef READ_OCC_SENSORS
526         manager.setSensorValueToNaN(instance);
527 #endif
528 
529         // See occReadRetries for number of retry attempts.
530         if (currentOccReadRetriesCount > 0)
531         {
532             --currentOccReadRetriesCount;
533 #ifdef POWER10
534             using namespace std::chrono_literals;
535             occReadStateFailTimer.restartOnce(1s);
536 #endif
537         }
538         else
539         {
540 #ifdef POWER10
541             if (!stateValid && occActive())
542             {
543                 if (!safeStateDelayTimer.isEnabled())
544                 {
545                     log<level::ERR>(
546                         "Starting 60 sec delay timer before requesting a reset");
547                     // start safe delay timer (before requesting reset)
548                     using namespace std::literals::chrono_literals;
549                     safeStateDelayTimer.restartOnce(60s);
550                 }
551             }
552 #else
553             // State could not be determined, set it to NO State.
554             lastState = 0;
555 
556             // Disable the ability to send Failed actions until OCC is
557             // Active again.
558             stateValid = false;
559 
560             // Disable and reset to try recovering
561             deviceError();
562 #endif
563         }
564     }
565 }
566 
567 // Update processor throttle status on dbus
568 void Status::updateThrottle(const bool isThrottled, const uint8_t newReason)
569 {
570     if (!throttleHandle)
571     {
572         return;
573     }
574 
575     uint8_t newThrottleCause = throttleCause;
576 
577     if (isThrottled) // throttled due to newReason
578     {
579         if ((newReason & throttleCause) == 0)
580         {
581             // set the bit(s) for passed in reason
582             newThrottleCause |= newReason;
583         }
584         // else no change
585     }
586     else // no longer throttled due to newReason
587     {
588         if ((newReason & throttleCause) != 0)
589         {
590             // clear the bit(s) for passed in reason
591             newThrottleCause &= ~newReason;
592         }
593         // else no change
594     }
595 
596     if (newThrottleCause != throttleCause)
597     {
598         if (newThrottleCause == THROTTLED_NONE)
599         {
600             log<level::DEBUG>(
601                 fmt::format(
602                     "updateThrottle: OCC{} no longer throttled (prior reason: {})",
603                     instance, throttleCause)
604                     .c_str());
605             throttleCause = THROTTLED_NONE;
606             throttleHandle->throttled(false);
607             throttleHandle->throttleCauses({});
608         }
609         else
610         {
611             log<level::DEBUG>(
612                 fmt::format(
613                     "updateThrottle: OCC{} is throttled with reason {} (prior reason: {})",
614                     instance, newThrottleCause, throttleCause)
615                     .c_str());
616             throttleCause = newThrottleCause;
617 
618             std::vector<ThrottleObj::ThrottleReasons> updatedCauses;
619             if (throttleCause & THROTTLED_POWER)
620             {
621                 updatedCauses.push_back(
622                     throttleHandle->ThrottleReasons::PowerLimit);
623             }
624             if (throttleCause & THROTTLED_THERMAL)
625             {
626                 updatedCauses.push_back(
627                     throttleHandle->ThrottleReasons::ThermalLimit);
628             }
629             if (throttleCause & THROTTLED_SAFE)
630             {
631                 updatedCauses.push_back(
632                     throttleHandle->ThrottleReasons::ManagementDetectedFault);
633             }
634             throttleHandle->throttleCauses(updatedCauses);
635             throttleHandle->throttled(true);
636         }
637     }
638     // else no change to throttle status
639 }
640 
641 // Get processor path associated with this OCC
642 void Status::readProcAssociation()
643 {
644     std::string managingPath = path + "/power_managing";
645     log<level::DEBUG>(
646         fmt::format("readProcAssociation: getting endpoints for {} ({})",
647                     managingPath, path)
648             .c_str());
649     try
650     {
651         utils::PropertyValue procPathProperty{};
652         procPathProperty = utils::getProperty(
653             managingPath, "xyz.openbmc_project.Association", "endpoints");
654         auto result = std::get<std::vector<std::string>>(procPathProperty);
655         if (result.size() > 0)
656         {
657             procPath = result[0];
658             log<level::INFO>(
659                 fmt::format("readProcAssociation: OCC{} has proc={}", instance,
660                             procPath.c_str())
661                     .c_str());
662         }
663         else
664         {
665             log<level::ERR>(
666                 fmt::format(
667                     "readProcAssociation: No processor associated with OCC{} / {}",
668                     instance, path)
669                     .c_str());
670         }
671     }
672     catch (const sdbusplus::exception_t& e)
673     {
674         log<level::ERR>(
675             fmt::format(
676                 "readProcAssociation: Unable to get proc assocated with {} - {}",
677                 path, e.what())
678                 .c_str());
679         procPath = {};
680     }
681 }
682 
683 } // namespace occ
684 } // namespace open_power
685