xref: /openbmc/openpower-occ-control/occ_manager.cpp (revision 720a3841e8684f93a25953d9db66bd4d0a4c3df7)
1  #include "config.h"
2  
3  #include "occ_manager.hpp"
4  
5  #include "i2c_occ.hpp"
6  #include "occ_dbus.hpp"
7  #include "occ_errors.hpp"
8  #include "utils.hpp"
9  
10  #include <phosphor-logging/elog-errors.hpp>
11  #include <phosphor-logging/lg2.hpp>
12  #include <xyz/openbmc_project/Common/error.hpp>
13  
14  #include <chrono>
15  #include <cmath>
16  #include <filesystem>
17  #include <fstream>
18  #include <regex>
19  
20  namespace open_power
21  {
22  namespace occ
23  {
24  
// FRU type value 0xFF; going by the name it marks a FRU type that is not
// available (NOTE(review): confirm against the sensor code that uses it).
constexpr uint32_t fruTypeNotAvailable = 0xFF;

// Suffixes of OCC hwmon sysfs sensor files. fruTypeSuffix is appended to a
// "tempN_" prefix in readTempSensors(); the others are presumably used the
// same way for the remaining sensor attributes.
constexpr auto fruTypeSuffix = "fru_type";
constexpr auto faultSuffix = "fault";
constexpr auto inputSuffix = "input";
constexpr auto maxSuffix = "max";

// File checked by findAndCreateObjects(): while it exists, OCC discovery is
// postponed and retried later.
constexpr auto HOST_ON_FILE = "/run/openbmc/host@0-on";
32  
33  using namespace phosphor::logging;
34  using namespace std::literals::chrono_literals;
35  
// Read one value of type T from the start of a file using operator>>.
//
// path - file to read
//
// Returns the extracted value.
//
// Throws std::system_error (generic category) when the file cannot be opened
// or no value can be extracted. The error code is the errno left behind by
// the failing operation, or EIO when the failure did not set errno (for
// example when the content cannot be parsed as T).
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    // eofbit is intentionally left out of the mask: reaching end-of-file
    // right after a successfully extracted value (file without a trailing
    // newline) is not an error. An empty file still raises failbit.
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
    T data{};

    // Start from a clean errno so a stale value from an unrelated earlier
    // call is never reported as the cause of this failure.
    errno = 0;

    try
    {
        ifs.open(path);
        ifs >> data;
        ifs.close();
    }
    catch (const std::exception&)
    {
        // Never build a system_error with code 0 ("Success")
        const auto err = (errno != 0) ? errno : EIO;
        throw std::system_error(err, std::generic_category());
    }

    return data;
}
58  
createPldmHandle()59  void Manager::createPldmHandle()
60  {
61  #ifdef PLDM
62      pldmHandle = std::make_unique<pldm::Interface>(
63          std::bind(std::mem_fn(&Manager::updateOCCActive), this,
64                    std::placeholders::_1, std::placeholders::_2),
65          std::bind(std::mem_fn(&Manager::sbeHRESETResult), this,
66                    std::placeholders::_1, std::placeholders::_2),
67          std::bind(std::mem_fn(&Manager::updateOccSafeMode), this,
68                    std::placeholders::_1),
69          event);
70  #endif
71  }
72  
73  // findAndCreateObjects():
74  // Takes care of getting the required objects created and
75  // finds the available devices/processors.
76  // (function is called everytime the discoverTimer expires)
77  // - create the PowerMode object to control OCC modes
78  // - create statusObjects for each OCC device found
79  // - waits for OCC Active sensors PDRs to become available
80  // - restart discoverTimer if all data is not available yet
findAndCreateObjects()81  void Manager::findAndCreateObjects()
82  {
83  #ifndef POWER10
84      for (auto id = 0; id < MAX_CPUS; ++id)
85      {
86          // Create one occ per cpu
87          auto occ = std::string(OCC_NAME) + std::to_string(id);
88          createObjects(occ);
89      }
90  #else
91      if (!pmode)
92      {
93          // Create the power mode object
94          pmode = std::make_unique<powermode::PowerMode>(
95              *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
96      }
97  
98      if (!fs::exists(HOST_ON_FILE))
99      {
100          static bool statusObjCreated = false;
101          if (!statusObjCreated)
102          {
103              // Create the OCCs based on on the /dev/occX devices
104              auto occs = findOCCsInDev();
105  
106              if (occs.empty() || (prevOCCSearch.size() != occs.size()))
107              {
108                  // Something changed or no OCCs yet, try again in 10s.
109                  // Note on the first pass prevOCCSearch will be empty,
110                  // so there will be at least one delay to give things
111                  // a chance to settle.
112                  prevOCCSearch = occs;
113  
114                  lg2::info(
115                      "Manager::findAndCreateObjects(): Waiting for OCCs (currently {QTY})",
116                      "QTY", occs.size());
117  
118                  discoverTimer->restartOnce(10s);
119              }
120              else
121              {
122                  // All OCCs appear to be available, create status objects
123  
124                  // createObjects requires OCC0 first.
125                  std::sort(occs.begin(), occs.end());
126  
127                  lg2::info(
128                      "Manager::findAndCreateObjects(): Creating {QTY} OCC Status Objects",
129                      "QTY", occs.size());
130                  for (auto id : occs)
131                  {
132                      createObjects(std::string(OCC_NAME) + std::to_string(id));
133                  }
134                  statusObjCreated = true;
135                  waitingForAllOccActiveSensors = true;
136  
137                  // Find/update the processor path associated with each OCC
138                  for (auto& obj : statusObjects)
139                  {
140                      obj->updateProcAssociation();
141                  }
142              }
143          }
144  
145          if (statusObjCreated && waitingForAllOccActiveSensors)
146          {
147              static bool tracedHostWait = false;
148              if (utils::isHostRunning())
149              {
150                  if (tracedHostWait)
151                  {
152                      lg2::info(
153                          "Manager::findAndCreateObjects(): Host is running");
154                      tracedHostWait = false;
155                  }
156                  checkAllActiveSensors();
157              }
158              else
159              {
160                  if (!tracedHostWait)
161                  {
162                      lg2::info(
163                          "Manager::findAndCreateObjects(): Waiting for host to start");
164                      tracedHostWait = true;
165                  }
166                  discoverTimer->restartOnce(30s);
167  #ifdef PLDM
168                  if (throttlePldmTraceTimer->isEnabled())
169                  {
170                      // Host is no longer running, disable throttle timer and
171                      // make sure traces are not throttled
172                      lg2::info("findAndCreateObjects(): disabling sensor timer");
173                      throttlePldmTraceTimer->setEnabled(false);
174                      pldmHandle->setTraceThrottle(false);
175                  }
176  #endif
177              }
178          }
179      }
180      else
181      {
182          lg2::info(
183              "Manager::findAndCreateObjects(): Waiting for {FILE} to complete...",
184              "FILE", HOST_ON_FILE);
185          discoverTimer->restartOnce(10s);
186      }
187  #endif
188  }
189  
190  #ifdef POWER10
191  // Check if all occActive sensors are available
checkAllActiveSensors()192  void Manager::checkAllActiveSensors()
193  {
194      static bool allActiveSensorAvailable = false;
195      static bool tracedSensorWait = false;
196      static bool waitingForHost = false;
197  
198      if (open_power::occ::utils::isHostRunning())
199      {
200          if (waitingForHost)
201          {
202              waitingForHost = false;
203              lg2::info("checkAllActiveSensors(): Host is now running");
204          }
205  
206          // Start with the assumption that all are available
207          allActiveSensorAvailable = true;
208          for (auto& obj : statusObjects)
209          {
210              if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
211              {
212                  auto instance = obj->getOccInstanceID();
213                  // Check if sensor was queued while waiting for discovery
214                  auto match = queuedActiveState.find(instance);
215                  if (match != queuedActiveState.end())
216                  {
217                      queuedActiveState.erase(match);
218                      lg2::info(
219                          "checkAllActiveSensors(): OCC{INST} is ACTIVE (queued)",
220                          "INST", instance);
221                      obj->occActive(true);
222                  }
223                  else
224                  {
225                      allActiveSensorAvailable = false;
226                      if (!tracedSensorWait)
227                      {
228                          lg2::info(
229                              "checkAllActiveSensors(): Waiting on OCC{INST} Active sensor",
230                              "INST", instance);
231                          tracedSensorWait = true;
232  #ifdef PLDM
233                          // Make sure PLDM traces are not throttled
234                          pldmHandle->setTraceThrottle(false);
235                          // Start timer to throttle PLDM traces when timer
236                          // expires
237                          onPldmTimeoutCreatePel = false;
238                          throttlePldmTraceTimer->restartOnce(5min);
239  #endif
240                      }
241  #ifdef PLDM
242                      // Ignore active sensor check if the OCCs are being reset
243                      if (!resetInProgress)
244                      {
245                          pldmHandle->checkActiveSensor(obj->getOccInstanceID());
246                      }
247  #endif
248                      break;
249                  }
250              }
251          }
252      }
253      else
254      {
255          if (!waitingForHost)
256          {
257              waitingForHost = true;
258              lg2::info("checkAllActiveSensors(): Waiting for host to start");
259  #ifdef PLDM
260              if (throttlePldmTraceTimer->isEnabled())
261              {
262                  // Host is no longer running, disable throttle timer and
263                  // make sure traces are not throttled
264                  lg2::info("checkAllActiveSensors(): disabling sensor timer");
265                  throttlePldmTraceTimer->setEnabled(false);
266                  pldmHandle->setTraceThrottle(false);
267              }
268  #endif
269          }
270      }
271  
272      if (allActiveSensorAvailable)
273      {
274          // All sensors were found, disable the discovery timer
275          if (discoverTimer->isEnabled())
276          {
277              discoverTimer->setEnabled(false);
278          }
279  #ifdef PLDM
280          if (throttlePldmTraceTimer->isEnabled())
281          {
282              // Disable throttle timer and make sure traces are not throttled
283              throttlePldmTraceTimer->setEnabled(false);
284              pldmHandle->setTraceThrottle(false);
285          }
286  #endif
287          if (waitingForAllOccActiveSensors)
288          {
289              lg2::info(
290                  "checkAllActiveSensors(): OCC Active sensors are available");
291              waitingForAllOccActiveSensors = false;
292  
293              if (resetRequired)
294              {
295                  initiateOccRequest(resetInstance);
296  
297                  if (!waitForAllOccsTimer->isEnabled())
298                  {
299                      lg2::warning(
300                          "occsNotAllRunning: Restarting waitForAllOccTimer");
301                      // restart occ wait timer to check status after reset
302                      // completes
303                      waitForAllOccsTimer->restartOnce(60s);
304                  }
305              }
306          }
307          queuedActiveState.clear();
308          tracedSensorWait = false;
309      }
310      else
311      {
312          // Not all sensors were available, so keep waiting
313          if (!tracedSensorWait)
314          {
315              lg2::info(
316                  "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
317              tracedSensorWait = true;
318          }
319          discoverTimer->restartOnce(10s);
320      }
321  }
322  #endif
323  
findOCCsInDev()324  std::vector<int> Manager::findOCCsInDev()
325  {
326      std::vector<int> occs;
327      std::regex expr{R"(occ(\d+)$)"};
328  
329      for (auto& file : fs::directory_iterator("/dev"))
330      {
331          std::smatch match;
332          std::string path{file.path().string()};
333          if (std::regex_search(path, match, expr))
334          {
335              auto num = std::stoi(match[1].str());
336  
337              // /dev numbering starts at 1, ours starts at 0.
338              occs.push_back(num - 1);
339          }
340      }
341  
342      return occs;
343  }
344  
cpuCreated(sdbusplus::message_t & msg)345  int Manager::cpuCreated(sdbusplus::message_t& msg)
346  {
347      namespace fs = std::filesystem;
348  
349      sdbusplus::message::object_path o;
350      msg.read(o);
351      fs::path cpuPath(std::string(std::move(o)));
352  
353      auto name = cpuPath.filename().string();
354      auto index = name.find(CPU_NAME);
355      name.replace(index, std::strlen(CPU_NAME), OCC_NAME);
356  
357      createObjects(name);
358  
359      return 0;
360  }
361  
createObjects(const std::string & occ)362  void Manager::createObjects(const std::string& occ)
363  {
364      auto path = fs::path(OCC_CONTROL_ROOT) / occ;
365  
366      statusObjects.emplace_back(std::make_unique<Status>(
367          event, path.c_str(), *this,
368  #ifdef POWER10
369          pmode,
370  #endif
371          std::bind(std::mem_fn(&Manager::statusCallBack), this,
372                    std::placeholders::_1, std::placeholders::_2)
373  #ifdef PLDM
374              ,
375          // Callback will set flag indicating reset needs to be done
376          // instead of immediately issuing a reset via PLDM.
377          std::bind(std::mem_fn(&Manager::resetOccRequest), this,
378                    std::placeholders::_1)
379  #endif
380              ));
381  
382      // Create the power cap monitor object
383      if (!pcap)
384      {
385          pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
386              *statusObjects.back());
387      }
388  
389      if (statusObjects.back()->isMasterOcc())
390      {
391          lg2::info("Manager::createObjects(): OCC{INST} is the master", "INST",
392                    statusObjects.back()->getOccInstanceID());
393          _pollTimer->setEnabled(false);
394  
395  #ifdef POWER10
396          // Set the master OCC on the PowerMode object
397          pmode->setMasterOcc(path);
398  #endif
399      }
400  
401      passThroughObjects.emplace_back(std::make_unique<PassThrough>(
402          path.c_str()
403  #ifdef POWER10
404              ,
405          pmode
406  #endif
407          ));
408  }
409  
410  // If a reset is not already outstanding, set a flag to indicate that a reset is
411  // needed.
resetOccRequest(instanceID instance)412  void Manager::resetOccRequest(instanceID instance)
413  {
414      if (!resetRequired)
415      {
416          resetRequired = true;
417          resetInstance = instance;
418          lg2::error(
419              "resetOccRequest: PM Complex reset was requested due to OCC{INST}",
420              "INST", instance);
421      }
422      else if (instance != resetInstance)
423      {
424          lg2::warning(
425              "resetOccRequest: Ignoring PM Complex reset request for OCC{INST}, because reset already outstanding for OCC{RINST}",
426              "INST", instance, "RINST", resetInstance);
427      }
428  }
429  
430  // If a reset has not been started, initiate an OCC reset via PLDM
initiateOccRequest(instanceID instance)431  void Manager::initiateOccRequest(instanceID instance)
432  {
433      if (!resetInProgress)
434      {
435          resetInProgress = true;
436          resetInstance = instance;
437          lg2::error(
438              "initiateOccRequest: Initiating PM Complex reset due to OCC{INST}",
439              "INST", instance);
440  #ifdef PLDM
441          pldmHandle->resetOCC(instance);
442  #endif
443          resetRequired = false;
444      }
445      else
446      {
447          lg2::warning(
448              "initiateOccRequest: Ignoring PM Complex reset request for OCC{INST}, because reset already in process for OCC{RINST}",
449              "INST", instance, "RINST", resetInstance);
450      }
451  }
452  
statusCallBack(instanceID instance,bool status)453  void Manager::statusCallBack(instanceID instance, bool status)
454  {
455      if (status == true)
456      {
457          if (resetInProgress)
458          {
459              lg2::info(
460                  "statusCallBack: Ignoring OCC{INST} activate because a reset has been initiated due to OCC{INST}",
461                  "INST", instance, "RINST", resetInstance);
462              return;
463          }
464  
465          // OCC went active
466          ++activeCount;
467  
468  #ifdef POWER10
469          if (activeCount == 1)
470          {
471              // First OCC went active (allow some time for all OCCs to go active)
472              waitForAllOccsTimer->restartOnce(60s);
473          }
474  #endif
475  
476          if (activeCount == statusObjects.size())
477          {
478  #ifdef POWER10
479              // All OCCs are now running
480              if (waitForAllOccsTimer->isEnabled())
481              {
482                  // stop occ wait timer
483                  waitForAllOccsTimer->setEnabled(false);
484              }
485  
486              // All OCCs have been found, check if we need a reset
487              if (resetRequired)
488              {
489                  initiateOccRequest(resetInstance);
490  
491                  if (!waitForAllOccsTimer->isEnabled())
492                  {
493                      lg2::warning(
494                          "occsNotAllRunning: Restarting waitForAllOccTimer");
495                      // restart occ wait timer
496                      waitForAllOccsTimer->restartOnce(60s);
497                  }
498              }
499              else
500              {
501                  // Verify master OCC and start presence monitor
502                  validateOccMaster();
503              }
504  #else
505              // Verify master OCC and start presence monitor
506              validateOccMaster();
507  #endif
508          }
509  
510          // Start poll timer if not already started
511          if (!_pollTimer->isEnabled())
512          {
513              lg2::info("Manager: OCCs will be polled every {TIME} seconds",
514                        "TIME", pollInterval);
515  
516              // Send poll and start OCC poll timer
517              pollerTimerExpired();
518          }
519      }
520      else
521      {
522          // OCC went away
523          if (activeCount > 0)
524          {
525              --activeCount;
526          }
527          else
528          {
529              lg2::info("OCC{INST} disabled, but currently no active OCCs",
530                        "INST", instance);
531          }
532  
533          if (activeCount == 0)
534          {
535              // No OCCs are running
536  
537              if (resetInProgress)
538              {
539                  // All OCC active sensors are clear (reset should be in
540                  // progress)
541                  lg2::info(
542                      "statusCallBack: Clearing resetInProgress (activeCount={COUNT}, OCC{INST}, status={STATUS})",
543                      "COUNT", activeCount, "INST", instance, "STATUS", status);
544                  resetInProgress = false;
545                  resetInstance = 255;
546              }
547  
548              // Stop OCC poll timer
549              if (_pollTimer->isEnabled())
550              {
551                  lg2::info(
552                      "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
553                  _pollTimer->setEnabled(false);
554              }
555  
556  #ifdef POWER10
557              // stop wait timer
558              if (waitForAllOccsTimer->isEnabled())
559              {
560                  waitForAllOccsTimer->setEnabled(false);
561              }
562  #endif
563          }
564          else if (resetInProgress)
565          {
566              lg2::info(
567                  "statusCallBack: Skipping clear of resetInProgress (activeCount={COUNT}, OCC{INST}, status={STATUS})",
568                  "COUNT", activeCount, "INST", instance, "STATUS", status);
569          }
570  #ifdef READ_OCC_SENSORS
571          // Clear OCC sensors
572          setSensorValueToNaN(instance);
573  #endif
574      }
575  
576  #ifdef POWER10
577      if (waitingForAllOccActiveSensors)
578      {
579          if (utils::isHostRunning())
580          {
581              checkAllActiveSensors();
582          }
583      }
584  #endif
585  }
586  
587  #ifdef I2C_OCC
initStatusObjects()588  void Manager::initStatusObjects()
589  {
590      // Make sure we have a valid path string
591      static_assert(sizeof(DEV_PATH) != 0);
592  
593      auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
594      for (auto& name : deviceNames)
595      {
596          i2c_occ::i2cToDbus(name);
597          name = std::string(OCC_NAME) + '_' + name;
598          auto path = fs::path(OCC_CONTROL_ROOT) / name;
599          statusObjects.emplace_back(
600              std::make_unique<Status>(event, path.c_str(), *this));
601      }
602      // The first device is master occ
603      pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
604          *statusObjects.front());
605  #ifdef POWER10
606      pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
607                                                     powermode::PIPS_PATH);
608      // Set the master OCC on the PowerMode object
609      pmode->setMasterOcc(path);
610  #endif
611  }
612  #endif
613  
614  #ifdef PLDM
sbeTimeout(unsigned int instance)615  void Manager::sbeTimeout(unsigned int instance)
616  {
617      auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
618                              [instance](const auto& obj) {
619                                  return instance == obj->getOccInstanceID();
620                              });
621  
622      if (obj != statusObjects.end() && (*obj)->occActive())
623      {
624          lg2::info("SBE timeout, requesting HRESET (OCC{INST})", "INST",
625                    instance);
626  
627  #ifdef PHAL_SUPPORT
628          setSBEState(instance, SBE_STATE_NOT_USABLE);
629  #endif
630  
631          pldmHandle->sendHRESET(instance);
632      }
633  }
634  
updateOCCActive(instanceID instance,bool status)635  bool Manager::updateOCCActive(instanceID instance, bool status)
636  {
637      auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
638                              [instance](const auto& obj) {
639                                  return instance == obj->getOccInstanceID();
640                              });
641  
642      const bool hostRunning = open_power::occ::utils::isHostRunning();
643      if (obj != statusObjects.end())
644      {
645          if (!hostRunning && (status == true))
646          {
647              lg2::warning(
648                  "updateOCCActive: Host is not running yet (OCC{INST} active={STAT}), clearing sensor received",
649                  "INST", instance, "STAT", status);
650              (*obj)->setPldmSensorReceived(false);
651              if (!waitingForAllOccActiveSensors)
652              {
653                  lg2::info(
654                      "updateOCCActive: Waiting for Host and all OCC Active Sensors");
655                  waitingForAllOccActiveSensors = true;
656              }
657  #ifdef POWER10
658              discoverTimer->restartOnce(30s);
659  #endif
660              return false;
661          }
662          else
663          {
664              (*obj)->setPldmSensorReceived(true);
665              return (*obj)->occActive(status);
666          }
667      }
668      else
669      {
670          if (hostRunning)
671          {
672              lg2::warning(
673                  "updateOCCActive: No status object to update for OCC{INST} (active={STAT})",
674                  "INST", instance, "STAT", status);
675          }
676          else
677          {
678              if (status == true)
679              {
680                  lg2::warning(
681                      "updateOCCActive: No status objects and Host is not running yet (OCC{INST} active={STAT})",
682                      "INST", instance, "STAT", status);
683              }
684          }
685          if (status == true)
686          {
687              // OCC went active
688              queuedActiveState.insert(instance);
689          }
690          else
691          {
692              auto match = queuedActiveState.find(instance);
693              if (match != queuedActiveState.end())
694              {
695                  // OCC was disabled
696                  queuedActiveState.erase(match);
697              }
698          }
699          return false;
700      }
701  }
702  
703  // Called upon pldm event To set powermode Safe Mode State for system.
updateOccSafeMode(bool safeMode)704  void Manager::updateOccSafeMode(bool safeMode)
705  {
706  #ifdef POWER10
707      pmode->updateDbusSafeMode(safeMode);
708  #endif
709      // Update the processor throttle status on dbus
710      for (auto& obj : statusObjects)
711      {
712          obj->updateThrottle(safeMode, THROTTLED_SAFE);
713      }
714  }
715  
sbeHRESETResult(instanceID instance,bool success)716  void Manager::sbeHRESETResult(instanceID instance, bool success)
717  {
718      if (success)
719      {
720          lg2::info("HRESET succeeded (OCC{INST})", "INST", instance);
721  
722  #ifdef PHAL_SUPPORT
723          setSBEState(instance, SBE_STATE_BOOTED);
724  #endif
725  
726          return;
727      }
728  
729  #ifdef PHAL_SUPPORT
730      setSBEState(instance, SBE_STATE_FAILED);
731  
732      if (sbeCanDump(instance))
733      {
734          lg2::info("HRESET failed (OCC{INST}), triggering SBE dump", "INST",
735                    instance);
736  
737          auto& bus = utils::getBus();
738          uint32_t src6 = instance << 16;
739          uint32_t logId =
740              FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
741                              src6, "SBE command timeout");
742  
743          try
744          {
745              constexpr auto interface = "xyz.openbmc_project.Dump.Create";
746              constexpr auto function = "CreateDump";
747  
748              std::string service =
749                  utils::getService(OP_DUMP_OBJ_PATH, interface);
750              auto method = bus.new_method_call(service.c_str(), OP_DUMP_OBJ_PATH,
751                                                interface, function);
752  
753              std::map<std::string, std::variant<std::string, uint64_t>>
754                  createParams{
755                      {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
756                       uint64_t(logId)},
757                      {"com.ibm.Dump.Create.CreateParameters.DumpType",
758                       "com.ibm.Dump.Create.DumpType.SBE"},
759                      {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
760                       uint64_t(instance)},
761                  };
762  
763              method.append(createParams);
764  
765              auto response = bus.call(method);
766          }
767          catch (const sdbusplus::exception_t& e)
768          {
769              constexpr auto ERROR_DUMP_DISABLED =
770                  "xyz.openbmc_project.Dump.Create.Error.Disabled";
771              if (e.name() == ERROR_DUMP_DISABLED)
772              {
773                  lg2::info("Dump is disabled, skipping");
774              }
775              else
776              {
777                  lg2::error("Dump failed");
778              }
779          }
780      }
781  #endif
782  
783      // SBE Reset failed, try PM Complex reset
784      lg2::error("sbeHRESETResult: Forcing PM Complex reset");
785      resetOccRequest(instance);
786  }
787  
788  #ifdef PHAL_SUPPORT
sbeCanDump(unsigned int instance)789  bool Manager::sbeCanDump(unsigned int instance)
790  {
791      struct pdbg_target* proc = getPdbgTarget(instance);
792  
793      if (!proc)
794      {
795          // allow the dump in the error case
796          return true;
797      }
798  
799      try
800      {
801          if (!openpower::phal::sbe::isDumpAllowed(proc))
802          {
803              return false;
804          }
805  
806          if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
807          {
808              return false;
809          }
810      }
811      catch (openpower::phal::exception::SbeError& e)
812      {
813          lg2::info("Failed to query SBE state");
814      }
815  
816      // allow the dump in the error case
817      return true;
818  }
819  
setSBEState(unsigned int instance,enum sbe_state state)820  void Manager::setSBEState(unsigned int instance, enum sbe_state state)
821  {
822      struct pdbg_target* proc = getPdbgTarget(instance);
823  
824      if (!proc)
825      {
826          return;
827      }
828  
829      try
830      {
831          openpower::phal::sbe::setState(proc, state);
832      }
833      catch (const openpower::phal::exception::SbeError& e)
834      {
835          lg2::error("Failed to set SBE state: {ERROR}", "ERROR", e.what());
836      }
837  }
838  
getPdbgTarget(unsigned int instance)839  struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
840  {
841      if (!pdbgInitialized)
842      {
843          try
844          {
845              openpower::phal::pdbg::init();
846              pdbgInitialized = true;
847          }
848          catch (const openpower::phal::exception::PdbgError& e)
849          {
850              lg2::error("pdbg initialization failed");
851              return nullptr;
852          }
853      }
854  
855      struct pdbg_target* proc = nullptr;
856      pdbg_for_each_class_target("proc", proc)
857      {
858          if (pdbg_target_index(proc) == instance)
859          {
860              return proc;
861          }
862      }
863  
864      lg2::error("Failed to get pdbg target");
865      return nullptr;
866  }
867  #endif
868  #endif
869  
pollerTimerExpired()870  void Manager::pollerTimerExpired()
871  {
872      if (!_pollTimer)
873      {
874          lg2::error("pollerTimerExpired() ERROR: Timer not defined");
875          return;
876      }
877  
878  #ifdef POWER10
879      if (resetRequired)
880      {
881          lg2::error("pollerTimerExpired() - Initiating PM Complex reset");
882          initiateOccRequest(resetInstance);
883  
884          if (!waitForAllOccsTimer->isEnabled())
885          {
886              lg2::warning("pollerTimerExpired: Restarting waitForAllOccTimer");
887              // restart occ wait timer
888              waitForAllOccsTimer->restartOnce(60s);
889          }
890          return;
891      }
892  #endif
893  
894      for (auto& obj : statusObjects)
895      {
896          if (!obj->occActive())
897          {
898              // OCC is not running yet
899  #ifdef READ_OCC_SENSORS
900              auto id = obj->getOccInstanceID();
901              setSensorValueToNaN(id);
902  #endif
903              continue;
904          }
905  
906          // Read sysfs to force kernel to poll OCC
907          obj->readOccState();
908  
909  #ifdef READ_OCC_SENSORS
910          // Read occ sensor values
911          getSensorValues(obj);
912  #endif
913      }
914  
915      if (activeCount > 0)
916      {
917          // Restart OCC poll timer
918          _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
919      }
920      else
921      {
922          // No OCCs running, so poll timer will not be restarted
923          lg2::info(
924              "Manager::pollerTimerExpired: poll timer will not be restarted");
925      }
926  }
927  
928  #ifdef READ_OCC_SENSORS
readTempSensors(const fs::path & path,uint32_t occInstance)929  void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
930  {
931      // There may be more than one sensor with the same FRU type
932      // and label so make two passes: the first to read the temps
933      // from sysfs, and the second to put them on D-Bus after
934      // resolving any conflicts.
935      std::map<std::string, double> sensorData;
936  
937      std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
938      for (auto& file : fs::directory_iterator(path))
939      {
940          if (!std::regex_search(file.path().string(), expr))
941          {
942              continue;
943          }
944  
945          uint32_t labelValue{0};
946  
947          try
948          {
949              labelValue = readFile<uint32_t>(file.path());
950          }
951          catch (const std::system_error& e)
952          {
953              lg2::debug(
954                  "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
955                  "PATH", file.path().string(), "ERROR", e.code().value());
956              continue;
957          }
958  
959          const std::string& tempLabel = "label";
960          const std::string filePathString = file.path().string().substr(
961              0, file.path().string().length() - tempLabel.length());
962  
963          uint32_t fruTypeValue{0};
964          try
965          {
966              fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
967          }
968          catch (const std::system_error& e)
969          {
970              lg2::debug(
971                  "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
972                  "PATH", filePathString + fruTypeSuffix, "ERROR",
973                  e.code().value());
974              continue;
975          }
976  
977          std::string sensorPath =
978              OCC_SENSORS_ROOT + std::string("/temperature/");
979  
980          std::string dvfsTempPath;
981  
982          if (fruTypeValue == VRMVdd)
983          {
984              sensorPath.append(
985                  "vrm_vdd" + std::to_string(occInstance) + "_temp");
986          }
987          else if (fruTypeValue == processorIoRing)
988          {
989              sensorPath.append(
990                  "proc" + std::to_string(occInstance) + "_ioring_temp");
991              dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
992                             std::to_string(occInstance) + "_ioring_dvfs_temp";
993          }
994          else
995          {
996              uint16_t type = (labelValue & 0xFF000000) >> 24;
997              uint16_t instanceID = labelValue & 0x0000FFFF;
998  
999              if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
1000              {
1001                  if (fruTypeValue == fruTypeNotAvailable)
1002                  {
1003                      // Not all DIMM related temps are available to read
1004                      // (no _input file in this case)
1005                      continue;
1006                  }
1007                  auto iter = dimmTempSensorName.find(fruTypeValue);
1008                  if (iter == dimmTempSensorName.end())
1009                  {
1010                      lg2::error(
1011                          "readTempSensors: Fru type error! fruTypeValue = {FRU}) ",
1012                          "FRU", fruTypeValue);
1013                      continue;
1014                  }
1015  
1016                  sensorPath.append(
1017                      "dimm" + std::to_string(instanceID) + iter->second);
1018  
1019                  dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
1020                                 dimmDVFSSensorName.at(fruTypeValue);
1021              }
1022              else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
1023              {
1024                  if (fruTypeValue == processorCore)
1025                  {
1026                      // The OCC reports small core temps, of which there are
1027                      // two per big core.  All current P10 systems are in big
1028                      // core mode, so use a big core name.
1029                      uint16_t coreNum = instanceID / 2;
1030                      uint16_t tempNum = instanceID % 2;
1031                      sensorPath.append("proc" + std::to_string(occInstance) +
1032                                        "_core" + std::to_string(coreNum) + "_" +
1033                                        std::to_string(tempNum) + "_temp");
1034  
1035                      dvfsTempPath =
1036                          std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
1037                          std::to_string(occInstance) + "_core_dvfs_temp";
1038                  }
1039                  else
1040                  {
1041                      continue;
1042                  }
1043              }
1044              else
1045              {
1046                  continue;
1047              }
1048          }
1049  
1050          // The dvfs temp file only needs to be read once per chip per type.
1051          if (!dvfsTempPath.empty() &&
1052              !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
1053          {
1054              try
1055              {
1056                  auto dvfsValue = readFile<double>(filePathString + maxSuffix);
1057  
1058                  dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
1059                      dvfsTempPath, dvfsValue * std::pow(10, -3));
1060              }
1061              catch (const std::system_error& e)
1062              {
1063                  lg2::debug(
1064                      "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1065                      "PATH", filePathString + maxSuffix, "ERROR",
1066                      e.code().value());
1067              }
1068          }
1069  
1070          uint32_t faultValue{0};
1071          try
1072          {
1073              faultValue = readFile<uint32_t>(filePathString + faultSuffix);
1074          }
1075          catch (const std::system_error& e)
1076          {
1077              lg2::debug(
1078                  "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1079                  "PATH", filePathString + faultSuffix, "ERROR",
1080                  e.code().value());
1081              continue;
1082          }
1083  
1084          double tempValue{0};
1085          // NOTE: if OCC sends back 0xFF, kernal sets this fault value to 1.
1086          if (faultValue != 0)
1087          {
1088              tempValue = std::numeric_limits<double>::quiet_NaN();
1089          }
1090          else
1091          {
1092              // Read the temperature
1093              try
1094              {
1095                  tempValue = readFile<double>(filePathString + inputSuffix);
1096              }
1097              catch (const std::system_error& e)
1098              {
1099                  lg2::debug(
1100                      "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1101                      "PATH", filePathString + inputSuffix, "ERROR",
1102                      e.code().value());
1103  
1104                  // if errno == EAGAIN(Resource temporarily unavailable) then set
1105                  // temp to 0, to avoid using old temp, and affecting FAN
1106                  // Control.
1107                  if (e.code().value() == EAGAIN)
1108                  {
1109                      tempValue = 0;
1110                  }
1111                  // else the errno would be something like
1112                  //     EBADF(Bad file descriptor)
1113                  // or ENOENT(No such file or directory)
1114                  else
1115                  {
1116                      continue;
1117                  }
1118              }
1119          }
1120  
1121          // If this object path already has a value, only overwite
1122          // it if the previous one was an NaN or a smaller value.
1123          auto existing = sensorData.find(sensorPath);
1124          if (existing != sensorData.end())
1125          {
1126              // Multiple sensors found for this FRU type
1127              if ((std::isnan(existing->second) && (tempValue == 0)) ||
1128                  ((existing->second == 0) && std::isnan(tempValue)))
1129              {
1130                  // One of the redundant sensors has failed (0xFF/nan), and the
1131                  // other sensor has no reading (0), so set the FRU to NaN to
1132                  // force fan increase
1133                  tempValue = std::numeric_limits<double>::quiet_NaN();
1134                  existing->second = tempValue;
1135              }
1136              if (std::isnan(existing->second) || (tempValue > existing->second))
1137              {
1138                  existing->second = tempValue;
1139              }
1140          }
1141          else
1142          {
1143              // First sensor for this FRU type
1144              sensorData[sensorPath] = tempValue;
1145          }
1146      }
1147  
1148      // Now publish the values on D-Bus.
1149      for (const auto& [objectPath, value] : sensorData)
1150      {
1151          dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
1152                                                      value * std::pow(10, -3));
1153  
1154          dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1155              objectPath, !std::isnan(value));
1156  
1157          if (existingSensors.find(objectPath) == existingSensors.end())
1158          {
1159              dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1160                  objectPath, {"all_sensors"});
1161          }
1162  
1163          existingSensors[objectPath] = occInstance;
1164      }
1165  }
1166  
1167  std::optional<std::string>
getPowerLabelFunctionID(const std::string & value)1168      Manager::getPowerLabelFunctionID(const std::string& value)
1169  {
1170      // If the value is "system", then the FunctionID is "system".
1171      if (value == "system")
1172      {
1173          return value;
1174      }
1175  
1176      // If the value is not "system", then the label value have 3 numbers, of
1177      // which we only care about the middle one:
1178      // <sensor id>_<function id>_<apss channel>
1179      // eg: The value is "0_10_5" , then the FunctionID is "10".
1180      if (value.find("_") == std::string::npos)
1181      {
1182          return std::nullopt;
1183      }
1184  
1185      auto powerLabelValue = value.substr((value.find("_") + 1));
1186  
1187      if (powerLabelValue.find("_") == std::string::npos)
1188      {
1189          return std::nullopt;
1190      }
1191  
1192      return powerLabelValue.substr(0, powerLabelValue.find("_"));
1193  }
1194  
readPowerSensors(const fs::path & path,uint32_t id)1195  void Manager::readPowerSensors(const fs::path& path, uint32_t id)
1196  {
1197      std::regex expr{"power\\d+_label$"}; // Example: power5_label
1198      for (auto& file : fs::directory_iterator(path))
1199      {
1200          if (!std::regex_search(file.path().string(), expr))
1201          {
1202              continue;
1203          }
1204  
1205          std::string labelValue;
1206          try
1207          {
1208              labelValue = readFile<std::string>(file.path());
1209          }
1210          catch (const std::system_error& e)
1211          {
1212              lg2::debug(
1213                  "readPowerSensors: Failed reading {PATH}, errno = {ERROR}",
1214                  "PATH", file.path().string(), "ERROR", e.code().value());
1215              continue;
1216          }
1217  
1218          auto functionID = getPowerLabelFunctionID(labelValue);
1219          if (functionID == std::nullopt)
1220          {
1221              continue;
1222          }
1223  
1224          const std::string& tempLabel = "label";
1225          const std::string filePathString = file.path().string().substr(
1226              0, file.path().string().length() - tempLabel.length());
1227  
1228          std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
1229  
1230          auto iter = powerSensorName.find(*functionID);
1231          if (iter == powerSensorName.end())
1232          {
1233              continue;
1234          }
1235          sensorPath.append(iter->second);
1236  
1237          double tempValue{0};
1238  
1239          try
1240          {
1241              tempValue = readFile<double>(filePathString + inputSuffix);
1242          }
1243          catch (const std::system_error& e)
1244          {
1245              lg2::debug(
1246                  "readPowerSensors: Failed reading {PATH}, errno = {ERROR}",
1247                  "PATH", filePathString + inputSuffix, "ERROR",
1248                  e.code().value());
1249              continue;
1250          }
1251  
1252          dbus::OccDBusSensors::getOccDBus().setUnit(
1253              sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");
1254  
1255          dbus::OccDBusSensors::getOccDBus().setValue(
1256              sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3));
1257  
1258          dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1259              sensorPath, true);
1260  
1261          if (existingSensors.find(sensorPath) == existingSensors.end())
1262          {
1263              std::vector<int> occs;
1264              std::vector<std::string> fTypeList = {"all_sensors"};
1265              if (iter->second == "total_power")
1266              {
1267                  // Total system power has its own chassis association
1268                  fTypeList.push_back("total_power");
1269              }
1270              dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1271                  sensorPath, fTypeList);
1272          }
1273  
1274          existingSensors[sensorPath] = id;
1275      }
1276      return;
1277  }
1278  
setSensorValueToNaN(uint32_t id) const1279  void Manager::setSensorValueToNaN(uint32_t id) const
1280  {
1281      for (const auto& [sensorPath, occId] : existingSensors)
1282      {
1283          if (occId == id)
1284          {
1285              dbus::OccDBusSensors::getOccDBus().setValue(
1286                  sensorPath, std::numeric_limits<double>::quiet_NaN());
1287  
1288              dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1289                  sensorPath, true);
1290          }
1291      }
1292      return;
1293  }
1294  
setSensorValueToNonFunctional(uint32_t id) const1295  void Manager::setSensorValueToNonFunctional(uint32_t id) const
1296  {
1297      for (const auto& [sensorPath, occId] : existingSensors)
1298      {
1299          if (occId == id)
1300          {
1301              dbus::OccDBusSensors::getOccDBus().setValue(
1302                  sensorPath, std::numeric_limits<double>::quiet_NaN());
1303  
1304              dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1305                  sensorPath, false);
1306          }
1307      }
1308      return;
1309  }
1310  
getSensorValues(std::unique_ptr<Status> & occ)1311  void Manager::getSensorValues(std::unique_ptr<Status>& occ)
1312  {
1313      static bool tracedError[8] = {0};
1314      const fs::path sensorPath = occ->getHwmonPath();
1315      const uint32_t id = occ->getOccInstanceID();
1316  
1317      if (fs::exists(sensorPath))
1318      {
1319          // Read temperature sensors
1320          readTempSensors(sensorPath, id);
1321  
1322          if (occ->isMasterOcc())
1323          {
1324              // Read power sensors
1325              readPowerSensors(sensorPath, id);
1326          }
1327          tracedError[id] = false;
1328      }
1329      else
1330      {
1331          if (!tracedError[id])
1332          {
1333              lg2::error(
1334                  "Manager::getSensorValues: OCC{INST} sensor path missing: {PATH}",
1335                  "INST", id, "PATH", sensorPath);
1336              tracedError[id] = true;
1337          }
1338      }
1339  
1340      return;
1341  }
1342  #endif
1343  
1344  // Read the altitude from DBus
readAltitude()1345  void Manager::readAltitude()
1346  {
1347      static bool traceAltitudeErr = true;
1348  
1349      utils::PropertyValue altitudeProperty{};
1350      try
1351      {
1352          altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE,
1353                                                ALTITUDE_PROP);
1354          auto sensorVal = std::get<double>(altitudeProperty);
1355          if (sensorVal < 0xFFFF)
1356          {
1357              if (sensorVal < 0)
1358              {
1359                  altitude = 0;
1360              }
1361              else
1362              {
1363                  // Round to nearest meter
1364                  altitude = uint16_t(sensorVal + 0.5);
1365              }
1366              lg2::debug("readAltitude: sensor={VALUE} ({ALT}m)", "VALUE",
1367                         sensorVal, "ALT", altitude);
1368              traceAltitudeErr = true;
1369          }
1370          else
1371          {
1372              if (traceAltitudeErr)
1373              {
1374                  traceAltitudeErr = false;
1375                  lg2::debug("Invalid altitude value: {ALT}", "ALT", sensorVal);
1376              }
1377          }
1378      }
1379      catch (const sdbusplus::exception_t& e)
1380      {
1381          if (traceAltitudeErr)
1382          {
1383              traceAltitudeErr = false;
1384              lg2::info("Unable to read Altitude: {ERROR}", "ERROR", e.what());
1385          }
1386          altitude = 0xFFFF; // not available
1387      }
1388  }
1389  
1390  // Callback function when ambient temperature changes
ambientCallback(sdbusplus::message_t & msg)1391  void Manager::ambientCallback(sdbusplus::message_t& msg)
1392  {
1393      double currentTemp = 0;
1394      uint8_t truncatedTemp = 0xFF;
1395      std::string msgSensor;
1396      std::map<std::string, std::variant<double>> msgData;
1397      msg.read(msgSensor, msgData);
1398  
1399      auto valPropMap = msgData.find(AMBIENT_PROP);
1400      if (valPropMap == msgData.end())
1401      {
1402          lg2::debug("ambientCallback: Unknown ambient property changed");
1403          return;
1404      }
1405      currentTemp = std::get<double>(valPropMap->second);
1406      if (std::isnan(currentTemp))
1407      {
1408          truncatedTemp = 0xFF;
1409      }
1410      else
1411      {
1412          if (currentTemp < 0)
1413          {
1414              truncatedTemp = 0;
1415          }
1416          else
1417          {
1418              // Round to nearest degree C
1419              truncatedTemp = uint8_t(currentTemp + 0.5);
1420          }
1421      }
1422  
1423      // If ambient changes, notify OCCs
1424      if (truncatedTemp != ambient)
1425      {
1426          lg2::debug("ambientCallback: Ambient change from {OLD} to {NEW}C",
1427                     "OLD", ambient, "NEW", currentTemp);
1428  
1429          ambient = truncatedTemp;
1430          if (altitude == 0xFFFF)
1431          {
1432              // No altitude yet, try reading again
1433              readAltitude();
1434          }
1435  
1436          lg2::debug("ambientCallback: Ambient: {TEMP}C, altitude: {ALT}m",
1437                     "TEMP", ambient, "ALT", altitude);
1438  #ifdef POWER10
1439          // Send ambient and altitude to all OCCs
1440          for (auto& obj : statusObjects)
1441          {
1442              if (obj->occActive())
1443              {
1444                  obj->sendAmbient(ambient, altitude);
1445              }
1446          }
1447  #endif // POWER10
1448      }
1449  }
1450  
1451  // return the current ambient and altitude readings
getAmbientData(bool & ambientValid,uint8_t & ambientTemp,uint16_t & altitudeValue) const1452  void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1453                               uint16_t& altitudeValue) const
1454  {
1455      ambientValid = true;
1456      ambientTemp = ambient;
1457      altitudeValue = altitude;
1458  
1459      if (ambient == 0xFF)
1460      {
1461          ambientValid = false;
1462      }
1463  }
1464  
1465  #ifdef POWER10
1466  // Called when waitForAllOccsTimer expires
1467  // After the first OCC goes active, this timer will be started (60 seconds)
occsNotAllRunning()1468  void Manager::occsNotAllRunning()
1469  {
1470      if (resetInProgress)
1471      {
1472          lg2::warning(
1473              "occsNotAllRunning: Ignoring waitForAllOccsTimer because reset is in progress");
1474          return;
1475      }
1476      if (activeCount != statusObjects.size())
1477      {
1478          // Not all OCCs went active
1479          lg2::warning(
1480              "occsNotAllRunning: Active OCC count ({COUNT}) does not match expected count ({EXP})",
1481              "COUNT", activeCount, "EXP", statusObjects.size());
1482          // Procs may be garded, so may be expected
1483      }
1484  
1485      if (resetRequired)
1486      {
1487          initiateOccRequest(resetInstance);
1488  
1489          if (!waitForAllOccsTimer->isEnabled())
1490          {
1491              lg2::warning("occsNotAllRunning: Restarting waitForAllOccTimer");
1492              // restart occ wait timer
1493              waitForAllOccsTimer->restartOnce(60s);
1494          }
1495      }
1496      else
1497      {
1498          validateOccMaster();
1499      }
1500  }
1501  
1502  #ifdef PLDM
1503  // Called when throttlePldmTraceTimer expires.
1504  // If this timer expires, that indicates there are no OCC active sensor PDRs
1505  // found which will trigger pldm traces to be throttled.
1506  // The second time this timer expires, a PEL will get created.
throttlePldmTraceExpired()1507  void Manager::throttlePldmTraceExpired()
1508  {
1509      if (utils::isHostRunning())
1510      {
1511          if (!onPldmTimeoutCreatePel)
1512          {
1513              // Throttle traces
1514              pldmHandle->setTraceThrottle(true);
1515              // Restart timer to log a PEL when timer expires
1516              onPldmTimeoutCreatePel = true;
1517              throttlePldmTraceTimer->restartOnce(40min);
1518          }
1519          else
1520          {
1521              lg2::error(
1522                  "throttlePldmTraceExpired(): OCC active sensors still not available!");
1523              // Create PEL
1524              createPldmSensorPEL();
1525          }
1526      }
1527      else
1528      {
1529          // Make sure traces are not throttled
1530          pldmHandle->setTraceThrottle(false);
1531          lg2::info(
1532              "throttlePldmTraceExpired(): host it not running ignoring sensor timer");
1533      }
1534  }
1535  
createPldmSensorPEL()1536  void Manager::createPldmSensorPEL()
1537  {
1538      Error::Descriptor d = Error::Descriptor(MISSING_OCC_SENSORS_PATH);
1539      std::map<std::string, std::string> additionalData;
1540  
1541      additionalData.emplace("_PID", std::to_string(getpid()));
1542  
1543      lg2::info(
1544          "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs");
1545  
1546      auto& bus = utils::getBus();
1547  
1548      try
1549      {
1550          FFDCFiles ffdc;
1551          // Add occ-control journal traces to PEL FFDC
1552          auto occJournalFile =
1553              FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40);
1554  
1555          static constexpr auto loggingObjectPath =
1556              "/xyz/openbmc_project/logging";
1557          static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL";
1558          std::string service =
1559              utils::getService(loggingObjectPath, opLoggingInterface);
1560          auto method =
1561              bus.new_method_call(service.c_str(), loggingObjectPath,
1562                                  opLoggingInterface, "CreatePELWithFFDCFiles");
1563  
1564          // Set level to Warning (Predictive).
1565          auto level =
1566              sdbusplus::xyz::openbmc_project::Logging::server::convertForMessage(
1567                  sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level::
1568                      Warning);
1569  
1570          method.append(d.path, level, additionalData, ffdc);
1571          bus.call(method);
1572      }
1573      catch (const sdbusplus::exception_t& e)
1574      {
1575          lg2::error("Failed to create MISSING_OCC_SENSORS PEL: {ERROR}", "ERROR",
1576                     e.what());
1577      }
1578  }
1579  #endif // PLDM
1580  #endif // POWER10
1581  
1582  // Verify single master OCC and start presence monitor
validateOccMaster()1583  void Manager::validateOccMaster()
1584  {
1585      int masterInstance = -1;
1586      for (auto& obj : statusObjects)
1587      {
1588          auto instance = obj->getOccInstanceID();
1589  #ifdef POWER10
1590          if (!obj->occActive())
1591          {
1592              if (utils::isHostRunning())
1593              {
1594                  // Check if sensor was queued while waiting for discovery
1595                  auto match = queuedActiveState.find(instance);
1596                  if (match != queuedActiveState.end())
1597                  {
1598                      queuedActiveState.erase(match);
1599                      lg2::info("validateOccMaster: OCC{INST} is ACTIVE (queued)",
1600                                "INST", instance);
1601                      obj->occActive(true);
1602                  }
1603                  else
1604                  {
1605                      // OCC does not appear to be active yet, check active sensor
1606  #ifdef PLDM
1607                      pldmHandle->checkActiveSensor(instance);
1608  #endif
1609                      if (obj->occActive())
1610                      {
1611                          lg2::info(
1612                              "validateOccMaster: OCC{INST} is ACTIVE after reading sensor",
1613                              "INST", instance);
1614                      }
1615                  }
1616              }
1617              else
1618              {
1619                  lg2::warning(
1620                      "validateOccMaster: HOST is not running (OCC{INST})",
1621                      "INST", instance);
1622                  return;
1623              }
1624          }
1625  #endif // POWER10
1626  
1627          if (obj->isMasterOcc())
1628          {
1629              obj->addPresenceWatchMaster();
1630  
1631              if (masterInstance == -1)
1632              {
1633                  masterInstance = instance;
1634              }
1635              else
1636              {
1637                  lg2::error(
1638                      "validateOccMaster: Multiple OCC masters! ({MAST1} and {MAST2})",
1639                      "MAST1", masterInstance, "MAST2", instance);
1640                  // request reset
1641                  obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
1642              }
1643          }
1644      }
1645  
1646      if (masterInstance < 0)
1647      {
1648          lg2::error("validateOccMaster: Master OCC not found! (of {NUM} OCCs)",
1649                     "NUM", statusObjects.size());
1650          // request reset
1651          statusObjects.front()->deviceError(
1652              Error::Descriptor(PRESENCE_ERROR_PATH));
1653      }
1654      else
1655      {
1656          lg2::info("validateOccMaster: OCC{INST} is master of {COUNT} OCCs",
1657                    "INST", masterInstance, "COUNT", activeCount);
1658  #ifdef POWER10
1659          pmode->updateDbusSafeMode(false);
1660  #endif
1661      }
1662  }
1663  
updatePcapBounds() const1664  void Manager::updatePcapBounds() const
1665  {
1666      if (pcap)
1667      {
1668          pcap->updatePcapBounds();
1669      }
1670  }
1671  
1672  } // namespace occ
1673  } // namespace open_power
1674