xref: /openbmc/openpower-occ-control/occ_status.cpp (revision 3ece5b99026970e1cf9bc7e797aff4c1be81b2f3)
1  #include "occ_status.hpp"
2  
3  #include "occ_manager.hpp"
4  #include "occ_sensor.hpp"
5  #include "powermode.hpp"
6  #include "utils.hpp"
7  
8  #include <phosphor-logging/lg2.hpp>
9  
10  #include <filesystem>
11  
12  namespace open_power
13  {
14  namespace occ
15  {
16  
17  using namespace phosphor::logging;
18  
19  using ThrottleObj =
20      sdbusplus::xyz::openbmc_project::Control::Power::server::Throttle;
21  
22  // Handles updates to occActive property
occActive(bool value)23  bool Status::occActive(bool value)
24  {
25      if (value != this->occActive())
26      {
27          lg2::info("Status::occActive OCC{INST} changed to {STATE}", "INST",
28                    instance, "STATE", value);
29          if (value)
30          {
31              // Clear prior throttle reason (before setting device active)
32              updateThrottle(false, THROTTLED_ALL);
33  
34              // Set the device active
35              device.setActive(true);
36  
37              // Reset last OCC state
38              lastState = 0;
39  
40              // Start watching for errors (throttles, etc)
41              try
42              {
43                  addErrorWatch();
44              }
45              catch (const OpenFailure& e)
46              {
47                  // Failed to add watch for throttle events, request reset to try
48                  // to recover comm
49                  lg2::error(
50                      "Status::occActive: Unable to add error watch(s) for OCC{INST} watch: {ERROR}",
51                      "INST", instance, "ERROR", e.what());
52                  deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
53                  return Base::Status::occActive(false);
54              }
55  
56              // Update the OCC active sensor
57              Base::Status::occActive(value);
58  
59              if (device.master())
60              {
61                  // Update powercap bounds from OCC
62                  manager.updatePcapBounds();
63              }
64  
65              // Call into Manager to let know that we have bound
66              if (this->managerCallBack)
67              {
68                  this->managerCallBack(instance, value);
69              }
70          }
71          else
72          {
73  #ifdef POWER10
74              if (pmode && device.master())
75              {
76                  // Prevent mode changes
77                  pmode->setMasterActive(false);
78              }
79              if (safeStateDelayTimer.isEnabled())
80              {
81                  // stop safe delay timer
82                  safeStateDelayTimer.setEnabled(false);
83              }
84  #endif
85              // Call into Manager to let know that we will unbind.
86              if (this->managerCallBack)
87              {
88                  this->managerCallBack(instance, value);
89              }
90  
91              // Stop watching for errors
92              removeErrorWatch();
93  
94              // Set the device inactive
95              device.setActive(false);
96  
97              // Clear throttles (OCC not active after disabling device)
98              updateThrottle(false, THROTTLED_ALL);
99          }
100      }
101      else if (value && !device.active())
102      {
103          // Existing error watch is on a dead file descriptor.
104          removeErrorWatch();
105  
106          /*
107           * In it's constructor, Status checks Device::bound() to see if OCC is
108           * active or not.
109           * Device::bound() checks for occX-dev0 directory.
110           * We will lose occX-dev0 directories during FSI rescan.
111           * So, if we start this application (and construct Status), and then
112           * later do FSI rescan, we will end up with occActive = true and device
113           * NOT bound. Lets correct that situation here.
114           */
115          device.setActive(true);
116  
117          // Add error watch again
118          try
119          {
120              addErrorWatch();
121          }
122          catch (const OpenFailure& e)
123          {
124              // Failed to add watch for throttle events, request reset to try to
125              // recover comm
126              lg2::error(
127                  "Status::occActive: Unable to add error watch(s) again for OCC{INST} watch: {ERROR}",
128                  "INST", instance, "ERROR", e.what());
129              deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
130              return Base::Status::occActive(false);
131          }
132      }
133      else if (!value && device.active())
134      {
135          removeErrorWatch();
136  
137          // In the event that the application never receives the active signal
138          // even though the OCC is active (this can occur if the BMC is rebooted
139          // with the host on, since the initial OCC driver probe will discover
140          // the OCCs), this application needs to be able to unbind the device
141          // when we get the OCC inactive signal.
142          device.setActive(false);
143      }
144      return Base::Status::occActive(value);
145  }
146  
147  // Callback handler when a device error is reported.
deviceError(Error::Descriptor d)148  void Status::deviceError(Error::Descriptor d)
149  {
150  #ifdef POWER10
151      if (pmode && device.master())
152      {
153          // Prevent mode changes
154          pmode->setMasterActive(false);
155      }
156  #endif
157  
158      if (d.log)
159      {
160          FFDC::createOCCResetPEL(instance, d.path, d.err, d.callout,
161                                  d.isInventoryCallout);
162      }
163  
164      // This would deem OCC inactive
165      this->occActive(false);
166  
167      // Reset the OCC
168      this->resetOCC();
169  }
170  
171  // Sends message to host control command handler to reset OCC
resetOCC()172  void Status::resetOCC()
173  {
174      lg2::info(">>Status::resetOCC() - requesting reset for OCC{INST}", "INST",
175                instance);
176      this->occActive(false);
177  #ifdef PLDM
178      if (resetCallBack)
179      {
180          this->resetCallBack(instance);
181      }
182  #else
183      constexpr auto CONTROL_HOST_PATH = "/org/open_power/control/host0";
184      constexpr auto CONTROL_HOST_INTF = "org.open_power.Control.Host";
185  
186      // This will throw exception on failure
187      auto service = utils::getService(CONTROL_HOST_PATH, CONTROL_HOST_INTF);
188  
189      auto& bus = utils::getBus();
190      auto method = bus.new_method_call(service.c_str(), CONTROL_HOST_PATH,
191                                        CONTROL_HOST_INTF, "Execute");
192      // OCC Reset control command
193      method.append(convertForMessage(Control::Host::Command::OCCReset).c_str());
194  
195      // OCC Sensor ID for callout reasons
196      method.append(std::variant<uint8_t>(std::get<0>(sensorMap.at(instance))));
197      bus.call_noreply(method);
198      return;
199  #endif
200  }
201  
202  // Handler called by Host control command handler to convey the
203  // status of the executed command
hostControlEvent(sdbusplus::message_t & msg)204  void Status::hostControlEvent(sdbusplus::message_t& msg)
205  {
206      std::string cmdCompleted{};
207      std::string cmdStatus{};
208  
209      msg.read(cmdCompleted, cmdStatus);
210  
211      lg2::debug("Host control signal values: command={CMD}, status={STATUS}",
212                 "CMD", cmdCompleted, "STATUS", cmdStatus);
213  
214      if (Control::Host::convertResultFromString(cmdStatus) !=
215          Control::Host::Result::Success)
216      {
217          if (Control::Host::convertCommandFromString(cmdCompleted) ==
218              Control::Host::Command::OCCReset)
219          {
220              // Must be a Timeout. Log an Error trace
221              lg2::error(
222                  "Error resetting the OCC: path={PATH}, sensorid={SENSOR}",
223                  "PATH", path, "SENSOR", std::get<0>(sensorMap.at(instance)));
224          }
225      }
226      return;
227  }
228  
229  // Called from Manager::pollerTimerExpired() in preperation to POLL OCC.
readOccState()230  void Status::readOccState()
231  {
232      if (stateValid)
233      {
234          // Reset retry count (since state is good)
235          currentOccReadRetriesCount = occReadRetries;
236      }
237      occReadStateNow();
238  }
239  
240  #ifdef POWER10
241  // Special processing that needs to happen once the OCCs change to ACTIVE state
occsWentActive()242  void Status::occsWentActive()
243  {
244      CmdStatus status = CmdStatus::SUCCESS;
245  
246      // IPS data will get sent automatically after a mode change if the mode
247      // supports it.
248      pmode->needToSendIPS();
249  
250      status = pmode->sendModeChange();
251      if (status != CmdStatus::SUCCESS)
252      {
253          lg2::error(
254              "Status::occsWentActive: OCC mode change failed with status {STATUS}",
255              "STATUS", status);
256  
257          // Disable and reset to try recovering
258          deviceError();
259      }
260  }
261  
262  // Send Ambient and Altitude to the OCC
sendAmbient(const uint8_t inTemp,const uint16_t inAltitude)263  CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude)
264  {
265      CmdStatus status = CmdStatus::FAILURE;
266      bool ambientValid = true;
267      uint8_t ambientTemp = inTemp;
268      uint16_t altitude = inAltitude;
269  
270      if (ambientTemp == 0xFF)
271      {
272          // Get latest readings from manager
273          manager.getAmbientData(ambientValid, ambientTemp, altitude);
274          lg2::debug(
275              "sendAmbient: valid: {VALID}, Ambient: {TEMP}C, altitude: {ALT}m",
276              "VALID", ambientValid, "TEMP", ambientTemp, "ALT", altitude);
277      }
278  
279      std::vector<std::uint8_t> cmd, rsp;
280      cmd.reserve(11);
281      cmd.push_back(uint8_t(CmdType::SEND_AMBIENT));
282      cmd.push_back(0x00);                    // Data Length (2 bytes)
283      cmd.push_back(0x08);                    //
284      cmd.push_back(0x00);                    // Version
285      cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status
286      cmd.push_back(ambientTemp);             // Ambient Temperature
287      cmd.push_back(altitude >> 8);           // Altitude in meters (2 bytes)
288      cmd.push_back(altitude & 0xFF);         //
289      cmd.push_back(0x00);                    // Reserved (3 bytes)
290      cmd.push_back(0x00);
291      cmd.push_back(0x00);
292      lg2::debug("sendAmbient: SEND_AMBIENT "
293                 "command to OCC{INST} ({SIZE} bytes)",
294                 "INST", instance, "SIZE", cmd.size());
295      status = occCmd.send(cmd, rsp);
296      if (status == CmdStatus::SUCCESS)
297      {
298          if (rsp.size() == 5)
299          {
300              if (RspStatus::SUCCESS != RspStatus(rsp[2]))
301              {
302                  lg2::error(
303                      "sendAmbient: SEND_AMBIENT failed with rspStatus {STATUS}",
304                      "STATUS", lg2::hex, rsp[2]);
305                  dump_hex(rsp);
306                  status = CmdStatus::FAILURE;
307              }
308          }
309          else
310          {
311              lg2::error(
312                  "sendAmbient: INVALID SEND_AMBIENT response length:{SIZE}",
313                  "SIZE", rsp.size());
314              dump_hex(rsp);
315              status = CmdStatus::FAILURE;
316          }
317      }
318      else
319      {
320          lg2::error("sendAmbient: SEND_AMBIENT FAILED! with status {STATUS}",
321                     "STATUS", lg2::hex, uint8_t(status));
322  
323          if (status == CmdStatus::COMM_FAILURE)
324          {
325              // Disable due to OCC comm failure and reset to try recovering
326              deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
327          }
328      }
329  
330      return status;
331  }
332  
333  // Called when safe timer expires to determine if OCCs need to be reset
safeStateDelayExpired()334  void Status::safeStateDelayExpired()
335  {
336      if (this->occActive())
337      {
338          lg2::info(
339              "safeStateDelayExpired: OCC{INST} state missing or not valid, requesting reset",
340              "INST", instance);
341          // Disable and reset to try recovering
342          deviceError(Error::Descriptor(SAFE_ERROR_PATH));
343      }
344  }
345  #endif // POWER10
346  
getHwmonPath()347  fs::path Status::getHwmonPath()
348  {
349      using namespace std::literals::string_literals;
350  
351      if (!fs::exists(hwmonPath))
352      {
353          static bool tracedFail[8] = {0};
354  
355          if (!hwmonPath.empty())
356          {
357              lg2::warning(
358                  "Status::getHwmonPath(): path no longer exists: {PATH}", "PATH",
359                  hwmonPath);
360              hwmonPath.clear();
361          }
362  
363          // Build the base HWMON path
364          fs::path prefixPath =
365              fs::path{OCC_HWMON_PATH + "occ-hwmon."s +
366                       std::to_string(instance + 1) + "/hwmon/"s};
367  
368          // Get the hwmonXX directory name
369          try
370          {
371              // there should only be one directory
372              const int numDirs = std::distance(
373                  fs::directory_iterator(prefixPath), fs::directory_iterator{});
374              if (numDirs == 1)
375              {
376                  hwmonPath = *fs::directory_iterator(prefixPath);
377                  tracedFail[instance] = false;
378              }
379              else
380              {
381                  if (!tracedFail[instance])
382                  {
383                      lg2::error(
384                          "Status::getHwmonPath(): Found multiple ({NUM}) hwmon paths!",
385                          "NUM", numDirs);
386                      tracedFail[instance] = true;
387                  }
388              }
389          }
390          catch (const fs::filesystem_error& e)
391          {
392              if (!tracedFail[instance])
393              {
394                  lg2::error(
395                      "Status::getHwmonPath(): error accessing {PATH}: {ERROR}",
396                      "PATH", prefixPath, "ERROR", e.what());
397                  tracedFail[instance] = true;
398              }
399          }
400      }
401  
402      return hwmonPath;
403  }
404  
405  // Called to read state and handle any errors
occReadStateNow()406  void Status::occReadStateNow()
407  {
408      unsigned int state;
409      const fs::path filename =
410          fs::path(DEV_PATH) /
411          fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
412  
413      std::ifstream file;
414      bool goodFile = false;
415  
416      // open file.
417      file.open(filename, std::ios::in);
418      const int openErrno = errno;
419  
420      // File is open and state can be used.
421      if (file.is_open() && file.good())
422      {
423          goodFile = true;
424          file >> state;
425          // Read the error code (if any) to check status of the read
426          std::ios_base::iostate readState = file.rdstate();
427          if (readState)
428          {
429              // There was a failure reading the file
430              if (lastOccReadStatus != -1)
431              {
432                  // Trace error bits
433                  std::string errorBits = "";
434                  if (readState & std::ios_base::eofbit)
435                  {
436                      errorBits += " EOF";
437                  }
438                  if (readState & std::ios_base::failbit)
439                  {
440                      errorBits += " failbit";
441                  }
442                  if (readState & std::ios_base::badbit)
443                  {
444                      errorBits += " badbit";
445                  }
446                  lg2::error(
447                      "readOccState: Failed to read OCC{INST} state: Read error on I/O operation - {ERROR}",
448                      "INST", instance, "ERROR", errorBits);
449                  lastOccReadStatus = -1;
450              }
451              goodFile = false;
452          }
453  
454          if (goodFile && (state != lastState))
455          {
456              // Trace OCC state changes
457              lg2::info(
458                  "Status::readOccState: OCC{INST} state {STATE} (lastState: {PRIOR})",
459                  "INST", instance, "STATE", lg2::hex, state, "PRIOR", lg2::hex,
460                  lastState);
461              lastState = state;
462  #ifdef POWER10
463              if (OccState(state) == OccState::ACTIVE)
464              {
465                  if (pmode && device.master())
466                  {
467                      // Set the master OCC on the PowerMode object
468                      pmode->setMasterOcc(path);
469                      // Enable mode changes
470                      pmode->setMasterActive();
471  
472                      // Special processing by master OCC when it goes active
473                      occsWentActive();
474                  }
475  
476                  CmdStatus status = sendAmbient();
477                  if (status != CmdStatus::SUCCESS)
478                  {
479                      lg2::error(
480                          "readOccState: Sending Ambient failed with status {STATUS}",
481                          "STATUS", status);
482                  }
483              }
484  
485              // If OCC in known Good State.
486              if ((OccState(state) == OccState::ACTIVE) ||
487                  (OccState(state) == OccState::CHARACTERIZATION) ||
488                  (OccState(state) == OccState::OBSERVATION))
489              {
490                  // Good OCC State then sensors valid again
491                  stateValid = true;
492  
493                  if (safeStateDelayTimer.isEnabled())
494                  {
495                      // stop safe delay timer (no longer in SAFE state)
496                      safeStateDelayTimer.setEnabled(false);
497                  }
498              }
499              else
500              {
501                  // OCC is in SAFE or some other unsupported state
502                  if (!safeStateDelayTimer.isEnabled())
503                  {
504                      lg2::error(
505                          "readOccState: Invalid OCC{INST} state of {STATE}, starting safe state delay timer",
506                          "INST", instance, "STATE", state);
507                      // start safe delay timer (before requesting reset)
508                      using namespace std::literals::chrono_literals;
509                      safeStateDelayTimer.restartOnce(60s);
510                  }
511                  // Not a supported state (update sensors to NaN and not
512                  // functional)
513                  stateValid = false;
514              }
515  #else
516              // Before P10 state not checked, only used good file open.
517              stateValid = true;
518  #endif
519          }
520      }
521  #ifdef POWER10
522      else
523      {
524          // Unable to read state
525          stateValid = false;
526      }
527  #endif
528      file.close();
529  
530      // if failed to Read a state or not a valid state -> Attempt retry
531      // after 1 Second delay if allowed.
532      if ((!goodFile) || (!stateValid))
533      {
534          if (!goodFile)
535          {
536              // If not able to read, OCC may be offline
537              if (openErrno != lastOccReadStatus)
538              {
539                  lg2::error(
540                      "Status::readOccState: open/read failed trying to read OCC{INST} state (open errno={ERROR})",
541                      "INST", instance, "ERROR", openErrno);
542                  lastOccReadStatus = openErrno;
543              }
544          }
545          else
546          {
547              // else this failed due to state not valid.
548              if (state != lastState)
549              {
550                  lg2::error(
551                      "Status::readOccState: OCC{INST} Invalid state {STATE} (last state: {PRIOR})",
552                      "INST", instance, "STATE", lg2::hex, state, "PRIOR",
553                      lg2::hex, lastState);
554              }
555          }
556  
557  #ifdef READ_OCC_SENSORS
558          manager.setSensorValueToNaN(instance);
559  #endif
560  
561          // See occReadRetries for number of retry attempts.
562          if (currentOccReadRetriesCount > 0)
563          {
564              --currentOccReadRetriesCount;
565          }
566          else
567          {
568              lg2::error("readOccState: failed to read OCC{INST} state!", "INST",
569                         instance);
570  
571              // State could not be determined, set it to NO State.
572              lastState = 0;
573  
574              // Disable the ability to send Failed actions until OCC is
575              // Active again.
576              stateValid = false;
577  
578              // Disable due to OCC comm failure and reset to try recovering
579              // (processor callout will be added)
580              deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH, ECOMM,
581                                            procPath.c_str(), true));
582  
583              // Reset retry count (for next attempt after recovery)
584              currentOccReadRetriesCount = occReadRetries;
585          }
586      }
587      else
588      {
589          if (lastOccReadStatus != 0)
590          {
591              lg2::info(
592                  "Status::readOccState: successfully read OCC{INST} state: {STATE}",
593                  "INST", instance, "STATE", state);
594              lastOccReadStatus = 0; // no error
595          }
596      }
597  }
598  
599  // Update processor throttle status on dbus
updateThrottle(const bool isThrottled,const uint8_t newReason)600  void Status::updateThrottle(const bool isThrottled, const uint8_t newReason)
601  {
602      if (!throttleHandle)
603      {
604          return;
605      }
606  
607      uint8_t newThrottleCause = throttleCause;
608  
609      if (isThrottled) // throttled due to newReason
610      {
611          if ((newReason & throttleCause) == 0)
612          {
613              // set the bit(s) for passed in reason
614              newThrottleCause |= newReason;
615          }
616          // else no change
617      }
618      else // no longer throttled due to newReason
619      {
620          if ((newReason & throttleCause) != 0)
621          {
622              // clear the bit(s) for passed in reason
623              newThrottleCause &= ~newReason;
624          }
625          // else no change
626      }
627  
628      if (newThrottleCause != throttleCause)
629      {
630          if (newThrottleCause == THROTTLED_NONE)
631          {
632              lg2::debug(
633                  "updateThrottle: OCC{INST} no longer throttled (prior reason: {REASON})",
634                  "INST", instance, "REASON", throttleCause);
635              throttleCause = THROTTLED_NONE;
636              throttleHandle->throttled(false);
637              throttleHandle->throttleCauses({});
638          }
639          else
640          {
641              lg2::debug(
642                  "updateThrottle: OCC{INST} is throttled with reason {REASON} (prior reason: {PRIOR})",
643                  "INST", instance, "REASON", newThrottleCause, "PRIOR",
644                  throttleCause);
645              throttleCause = newThrottleCause;
646  
647              std::vector<ThrottleObj::ThrottleReasons> updatedCauses;
648              if (throttleCause & THROTTLED_POWER)
649              {
650                  updatedCauses.push_back(
651                      throttleHandle->ThrottleReasons::PowerLimit);
652              }
653              if (throttleCause & THROTTLED_THERMAL)
654              {
655                  updatedCauses.push_back(
656                      throttleHandle->ThrottleReasons::ThermalLimit);
657              }
658              if (throttleCause & THROTTLED_SAFE)
659              {
660                  updatedCauses.push_back(
661                      throttleHandle->ThrottleReasons::ManagementDetectedFault);
662              }
663              throttleHandle->throttleCauses(updatedCauses);
664              throttleHandle->throttled(true);
665          }
666      }
667      // else no change to throttle status
668  }
669  
670  // Get processor path associated with this OCC
readProcAssociation()671  void Status::readProcAssociation()
672  {
673      std::string managingPath = path + "/power_managing";
674      lg2::debug("readProcAssociation: getting endpoints for {MANAGE} ({PATH})",
675                 "MANAGE", managingPath, "PATH", path);
676      try
677      {
678          utils::PropertyValue procPathProperty{};
679          procPathProperty = utils::getProperty(
680              managingPath, "xyz.openbmc_project.Association", "endpoints");
681          auto result = std::get<std::vector<std::string>>(procPathProperty);
682          if (result.size() > 0)
683          {
684              procPath = result[0];
685              lg2::info("readProcAssociation: OCC{INST} has proc={PATH}", "INST",
686                        instance, "PATH", procPath);
687          }
688          else
689          {
690              lg2::error(
691                  "readProcAssociation: No processor associated with OCC{INST} / {PATH}",
692                  "INST", instance, "PATH", path);
693          }
694      }
695      catch (const sdbusplus::exception_t& e)
696      {
697          lg2::error(
698              "readProcAssociation: Unable to get proc assocated with {PATH} - {ERROR}",
699              "PATH", path, "ERROR", e.what());
700          procPath = {};
701      }
702  }
703  
704  } // namespace occ
705  } // namespace open_power
706