xref: /openbmc/openpower-occ-control/occ_manager.cpp (revision 37abe9be91df2bb173c9642a2740a425904d7921)
1 #include "config.h"
2 
3 #include "occ_manager.hpp"
4 
5 #include "i2c_occ.hpp"
6 #include "occ_dbus.hpp"
7 #include "occ_errors.hpp"
8 #include "utils.hpp"
9 
10 #include <phosphor-logging/elog-errors.hpp>
11 #include <phosphor-logging/lg2.hpp>
12 #include <xyz/openbmc_project/Common/error.hpp>
13 
14 #include <chrono>
15 #include <cmath>
16 #include <filesystem>
17 #include <fstream>
18 #include <regex>
19 
20 namespace open_power
21 {
22 namespace occ
23 {
24 
// fru_type value for which readTempSensors() skips a DIMM temperature
// (there is no matching _input file in that case).
constexpr uint32_t fruTypeNotAvailable = 0xFF;

// File name suffixes. fruTypeSuffix is appended to a sensor's file prefix
// (the tempN_label path with "label" removed) to read its FRU type.
// NOTE(review): the remaining suffixes are presumably used the same way for
// the fault/input/max files; their use is outside this part of the file.
constexpr auto fruTypeSuffix = "fru_type";
constexpr auto faultSuffix = "fault";
constexpr auto inputSuffix = "input";
constexpr auto maxSuffix = "max";

// While this file exists findAndCreateObjects() does not look for OCCs and
// re-arms the discover timer instead.
constexpr auto HOST_ON_FILE = "/run/openbmc/host@0-on";
32 
33 using namespace phosphor::logging;
34 using namespace std::literals::chrono_literals;
35 
// readFile():
// Read one value of type T from the start of a file using operator>>.
// - path: file to read
// - returns the extracted value
// - throws std::system_error when the file cannot be opened or no value can
//   be extracted. The error code is errno when the failing operation set it,
//   otherwise std::errc::io_error (so the code is never 0).
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    // eofbit is deliberately not part of the mask: hitting end-of-file right
    // after a complete value (file without a trailing newline) is a
    // successful read. An empty file still fails because failbit is set.
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
    T data{};

    // Clear any stale value so only an errno set by this read is reported
    errno = 0;
    try
    {
        ifs.open(path);
        ifs >> data;
        // The stream is closed by its destructor
    }
    catch (const std::exception&)
    {
        const auto err = errno;
        if (err != 0)
        {
            throw std::system_error(err, std::generic_category());
        }
        // Failure that did not set errno (e.g. content is not a valid T)
        throw std::system_error(std::make_error_code(std::errc::io_error));
    }

    return data;
}
58 
59 // findAndCreateObjects():
60 // Takes care of getting the required objects created and
61 // finds the available devices/processors.
62 // (function is called everytime the discoverTimer expires)
63 // - create the PowerMode object to control OCC modes
64 // - create statusObjects for each OCC device found
65 // - waits for OCC Active sensors PDRs to become available
66 // - restart discoverTimer if all data is not available yet
findAndCreateObjects()67 void Manager::findAndCreateObjects()
68 {
69 #ifndef POWER10
70     for (auto id = 0; id < MAX_CPUS; ++id)
71     {
72         // Create one occ per cpu
73         auto occ = std::string(OCC_NAME) + std::to_string(id);
74         createObjects(occ);
75     }
76 #else
77     if (!pmode)
78     {
79         // Create the power mode object
80         pmode = std::make_unique<powermode::PowerMode>(
81             *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
82     }
83 
84     if (!fs::exists(HOST_ON_FILE))
85     {
86         static bool statusObjCreated = false;
87         if (!statusObjCreated)
88         {
89             // Create the OCCs based on on the /dev/occX devices
90             auto occs = findOCCsInDev();
91 
92             if (occs.empty() || (prevOCCSearch.size() != occs.size()))
93             {
94                 // Something changed or no OCCs yet, try again in 10s.
95                 // Note on the first pass prevOCCSearch will be empty,
96                 // so there will be at least one delay to give things
97                 // a chance to settle.
98                 prevOCCSearch = occs;
99 
100                 lg2::info(
101                     "Manager::findAndCreateObjects(): Waiting for OCCs (currently {QTY})",
102                     "QTY", occs.size());
103 
104                 discoverTimer->restartOnce(10s);
105             }
106             else
107             {
108                 // All OCCs appear to be available, create status objects
109 
110                 // createObjects requires OCC0 first.
111                 std::sort(occs.begin(), occs.end());
112 
113                 lg2::info(
114                     "Manager::findAndCreateObjects(): Creating {QTY} OCC Status Objects",
115                     "QTY", occs.size());
116                 for (auto id : occs)
117                 {
118                     createObjects(std::string(OCC_NAME) + std::to_string(id));
119                 }
120                 statusObjCreated = true;
121                 waitingForAllOccActiveSensors = true;
122 
123                 // Find/update the processor path associated with each OCC
124                 for (auto& obj : statusObjects)
125                 {
126                     obj->updateProcAssociation();
127                 }
128             }
129         }
130 
131         if (statusObjCreated && waitingForAllOccActiveSensors)
132         {
133             static bool tracedHostWait = false;
134             if (utils::isHostRunning())
135             {
136                 if (tracedHostWait)
137                 {
138                     lg2::info(
139                         "Manager::findAndCreateObjects(): Host is running");
140                     tracedHostWait = false;
141                 }
142                 checkAllActiveSensors();
143             }
144             else
145             {
146                 if (!tracedHostWait)
147                 {
148                     lg2::info(
149                         "Manager::findAndCreateObjects(): Waiting for host to start");
150                     tracedHostWait = true;
151                 }
152                 discoverTimer->restartOnce(30s);
153 #ifdef PLDM
154                 if (throttlePldmTraceTimer->isEnabled())
155                 {
156                     // Host is no longer running, disable throttle timer and
157                     // make sure traces are not throttled
158                     lg2::info("findAndCreateObjects(): disabling sensor timer");
159                     throttlePldmTraceTimer->setEnabled(false);
160                     pldmHandle->setTraceThrottle(false);
161                 }
162 #endif
163             }
164         }
165     }
166     else
167     {
168         lg2::info(
169             "Manager::findAndCreateObjects(): Waiting for {FILE} to complete...",
170             "FILE", HOST_ON_FILE);
171         discoverTimer->restartOnce(10s);
172     }
173 #endif
174 }
175 
176 #ifdef POWER10
177 // Check if all occActive sensors are available
checkAllActiveSensors()178 void Manager::checkAllActiveSensors()
179 {
180     static bool allActiveSensorAvailable = false;
181     static bool tracedSensorWait = false;
182     static bool waitingForHost = false;
183 
184     if (open_power::occ::utils::isHostRunning())
185     {
186         if (waitingForHost)
187         {
188             waitingForHost = false;
189             lg2::info("checkAllActiveSensors(): Host is now running");
190         }
191 
192         // Start with the assumption that all are available
193         allActiveSensorAvailable = true;
194         for (auto& obj : statusObjects)
195         {
196             if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
197             {
198                 auto instance = obj->getOccInstanceID();
199                 // Check if sensor was queued while waiting for discovery
200                 auto match = queuedActiveState.find(instance);
201                 if (match != queuedActiveState.end())
202                 {
203                     queuedActiveState.erase(match);
204                     lg2::info(
205                         "checkAllActiveSensors(): OCC{INST} is ACTIVE (queued)",
206                         "INST", instance);
207                     obj->occActive(true);
208                 }
209                 else
210                 {
211                     allActiveSensorAvailable = false;
212                     if (!tracedSensorWait)
213                     {
214                         lg2::info(
215                             "checkAllActiveSensors(): Waiting on OCC{INST} Active sensor",
216                             "INST", instance);
217                         tracedSensorWait = true;
218 #ifdef PLDM
219                         // Make sure PLDM traces are not throttled
220                         pldmHandle->setTraceThrottle(false);
221                         // Start timer to throttle PLDM traces when timer
222                         // expires
223                         onPldmTimeoutCreatePel = false;
224                         throttlePldmTraceTimer->restartOnce(5min);
225 #endif
226                     }
227 #ifdef PLDM
228                     // Ignore active sensor check if the OCCs are being reset
229                     if (!resetInProgress)
230                     {
231                         pldmHandle->checkActiveSensor(obj->getOccInstanceID());
232                     }
233 #endif
234                     break;
235                 }
236             }
237         }
238     }
239     else
240     {
241         if (!waitingForHost)
242         {
243             waitingForHost = true;
244             lg2::info("checkAllActiveSensors(): Waiting for host to start");
245 #ifdef PLDM
246             if (throttlePldmTraceTimer->isEnabled())
247             {
248                 // Host is no longer running, disable throttle timer and
249                 // make sure traces are not throttled
250                 lg2::info("checkAllActiveSensors(): disabling sensor timer");
251                 throttlePldmTraceTimer->setEnabled(false);
252                 pldmHandle->setTraceThrottle(false);
253             }
254 #endif
255         }
256     }
257 
258     if (allActiveSensorAvailable)
259     {
260         // All sensors were found, disable the discovery timer
261         if (discoverTimer->isEnabled())
262         {
263             discoverTimer->setEnabled(false);
264         }
265 #ifdef PLDM
266         if (throttlePldmTraceTimer->isEnabled())
267         {
268             // Disable throttle timer and make sure traces are not throttled
269             throttlePldmTraceTimer->setEnabled(false);
270             pldmHandle->setTraceThrottle(false);
271         }
272 #endif
273         if (waitingForAllOccActiveSensors)
274         {
275             lg2::info(
276                 "checkAllActiveSensors(): OCC Active sensors are available");
277             waitingForAllOccActiveSensors = false;
278 
279             if (resetRequired)
280             {
281                 initiateOccRequest(resetInstance);
282 
283                 if (!waitForAllOccsTimer->isEnabled())
284                 {
285                     lg2::warning(
286                         "occsNotAllRunning: Restarting waitForAllOccTimer");
287                     // restart occ wait timer to check status after reset
288                     // completes
289                     waitForAllOccsTimer->restartOnce(60s);
290                 }
291             }
292         }
293         queuedActiveState.clear();
294         tracedSensorWait = false;
295     }
296     else
297     {
298         // Not all sensors were available, so keep waiting
299         if (!tracedSensorWait)
300         {
301             lg2::info(
302                 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
303             tracedSensorWait = true;
304         }
305         discoverTimer->restartOnce(10s);
306     }
307 }
308 #endif
309 
findOCCsInDev()310 std::vector<int> Manager::findOCCsInDev()
311 {
312     std::vector<int> occs;
313     std::regex expr{R"(occ(\d+)$)"};
314 
315     for (auto& file : fs::directory_iterator("/dev"))
316     {
317         std::smatch match;
318         std::string path{file.path().string()};
319         if (std::regex_search(path, match, expr))
320         {
321             auto num = std::stoi(match[1].str());
322 
323             // /dev numbering starts at 1, ours starts at 0.
324             occs.push_back(num - 1);
325         }
326     }
327 
328     return occs;
329 }
330 
cpuCreated(sdbusplus::message_t & msg)331 int Manager::cpuCreated(sdbusplus::message_t& msg)
332 {
333     namespace fs = std::filesystem;
334 
335     sdbusplus::message::object_path o;
336     msg.read(o);
337     fs::path cpuPath(std::string(std::move(o)));
338 
339     auto name = cpuPath.filename().string();
340     auto index = name.find(CPU_NAME);
341     name.replace(index, std::strlen(CPU_NAME), OCC_NAME);
342 
343     createObjects(name);
344 
345     return 0;
346 }
347 
createObjects(const std::string & occ)348 void Manager::createObjects(const std::string& occ)
349 {
350     auto path = fs::path(OCC_CONTROL_ROOT) / occ;
351 
352     statusObjects.emplace_back(std::make_unique<Status>(
353         event, path.c_str(), *this,
354 #ifdef POWER10
355         pmode,
356 #endif
357         std::bind(std::mem_fn(&Manager::statusCallBack), this,
358                   std::placeholders::_1, std::placeholders::_2)
359 #ifdef PLDM
360             ,
361         // Callback will set flag indicating reset needs to be done
362         // instead of immediately issuing a reset via PLDM.
363         std::bind(std::mem_fn(&Manager::resetOccRequest), this,
364                   std::placeholders::_1)
365 #endif
366             ));
367 
368     // Create the power cap monitor object
369     if (!pcap)
370     {
371         pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
372             *statusObjects.back());
373     }
374 
375     if (statusObjects.back()->isMasterOcc())
376     {
377         lg2::info("Manager::createObjects(): OCC{INST} is the master", "INST",
378                   statusObjects.back()->getOccInstanceID());
379         _pollTimer->setEnabled(false);
380 
381 #ifdef POWER10
382         // Set the master OCC on the PowerMode object
383         pmode->setMasterOcc(path);
384 #endif
385     }
386 
387     passThroughObjects.emplace_back(std::make_unique<PassThrough>(
388         path.c_str()
389 #ifdef POWER10
390             ,
391         pmode
392 #endif
393         ));
394 }
395 
396 // If a reset is not already outstanding, set a flag to indicate that a reset is
397 // needed.
resetOccRequest(instanceID instance)398 void Manager::resetOccRequest(instanceID instance)
399 {
400     if (!resetRequired)
401     {
402         resetRequired = true;
403         resetInstance = instance;
404         lg2::error(
405             "resetOccRequest: PM Complex reset was requested due to OCC{INST}",
406             "INST", instance);
407     }
408     else if (instance != resetInstance)
409     {
410         lg2::warning(
411             "resetOccRequest: Ignoring PM Complex reset request for OCC{INST}, because reset already outstanding for OCC{RINST}",
412             "INST", instance, "RINST", resetInstance);
413     }
414 }
415 
416 // If a reset has not been started, initiate an OCC reset via PLDM
initiateOccRequest(instanceID instance)417 void Manager::initiateOccRequest(instanceID instance)
418 {
419     if (!resetInProgress)
420     {
421         resetInProgress = true;
422         resetInstance = instance;
423         lg2::error(
424             "initiateOccRequest: Initiating PM Complex reset due to OCC{INST}",
425             "INST", instance);
426 #ifdef PLDM
427         pldmHandle->resetOCC(instance);
428 #endif
429         resetRequired = false;
430     }
431     else
432     {
433         lg2::warning(
434             "initiateOccRequest: Ignoring PM Complex reset request for OCC{INST}, because reset already in process for OCC{RINST}",
435             "INST", instance, "RINST", resetInstance);
436     }
437 }
438 
statusCallBack(instanceID instance,bool status)439 void Manager::statusCallBack(instanceID instance, bool status)
440 {
441     if (status == true)
442     {
443         if (resetInProgress)
444         {
445             lg2::info(
446                 "statusCallBack: Ignoring OCC{INST} activate because a reset has been initiated due to OCC{INST}",
447                 "INST", instance, "RINST", resetInstance);
448             return;
449         }
450 
451         // OCC went active
452         ++activeCount;
453 
454 #ifdef POWER10
455         if (activeCount == 1)
456         {
457             // First OCC went active (allow some time for all OCCs to go active)
458             waitForAllOccsTimer->restartOnce(60s);
459         }
460 #endif
461 
462         if (activeCount == statusObjects.size())
463         {
464 #ifdef POWER10
465             // All OCCs are now running
466             if (waitForAllOccsTimer->isEnabled())
467             {
468                 // stop occ wait timer
469                 waitForAllOccsTimer->setEnabled(false);
470             }
471 
472             // All OCCs have been found, check if we need a reset
473             if (resetRequired)
474             {
475                 initiateOccRequest(resetInstance);
476 
477                 if (!waitForAllOccsTimer->isEnabled())
478                 {
479                     lg2::warning(
480                         "occsNotAllRunning: Restarting waitForAllOccTimer");
481                     // restart occ wait timer
482                     waitForAllOccsTimer->restartOnce(60s);
483                 }
484             }
485             else
486             {
487                 // Verify master OCC and start presence monitor
488                 validateOccMaster();
489             }
490 #else
491             // Verify master OCC and start presence monitor
492             validateOccMaster();
493 #endif
494         }
495 
496         // Start poll timer if not already started
497         if (!_pollTimer->isEnabled())
498         {
499             lg2::info("Manager: OCCs will be polled every {TIME} seconds",
500                       "TIME", pollInterval);
501 
502             // Send poll and start OCC poll timer
503             pollerTimerExpired();
504         }
505     }
506     else
507     {
508         // OCC went away
509         if (activeCount > 0)
510         {
511             --activeCount;
512         }
513         else
514         {
515             lg2::info("OCC{INST} disabled, but currently no active OCCs",
516                       "INST", instance);
517         }
518 
519         if (activeCount == 0)
520         {
521             // No OCCs are running
522 
523             if (resetInProgress)
524             {
525                 // All OCC active sensors are clear (reset should be in
526                 // progress)
527                 lg2::info(
528                     "statusCallBack: Clearing resetInProgress (activeCount={COUNT}, OCC{INST}, status={STATUS})",
529                     "COUNT", activeCount, "INST", instance, "STATUS", status);
530                 resetInProgress = false;
531                 resetInstance = 255;
532             }
533 
534             // Stop OCC poll timer
535             if (_pollTimer->isEnabled())
536             {
537                 lg2::info(
538                     "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
539                 _pollTimer->setEnabled(false);
540             }
541 
542 #ifdef POWER10
543             // stop wait timer
544             if (waitForAllOccsTimer->isEnabled())
545             {
546                 waitForAllOccsTimer->setEnabled(false);
547             }
548 #endif
549         }
550         else if (resetInProgress)
551         {
552             lg2::info(
553                 "statusCallBack: Skipping clear of resetInProgress (activeCount={COUNT}, OCC{INST}, status={STATUS})",
554                 "COUNT", activeCount, "INST", instance, "STATUS", status);
555         }
556 #ifdef READ_OCC_SENSORS
557         // Clear OCC sensors
558         setSensorValueToNaN(instance);
559 #endif
560     }
561 
562 #ifdef POWER10
563     if (waitingForAllOccActiveSensors)
564     {
565         if (utils::isHostRunning())
566         {
567             checkAllActiveSensors();
568         }
569     }
570 #endif
571 }
572 
573 #ifdef I2C_OCC
initStatusObjects()574 void Manager::initStatusObjects()
575 {
576     // Make sure we have a valid path string
577     static_assert(sizeof(DEV_PATH) != 0);
578 
579     auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
580     for (auto& name : deviceNames)
581     {
582         i2c_occ::i2cToDbus(name);
583         name = std::string(OCC_NAME) + '_' + name;
584         auto path = fs::path(OCC_CONTROL_ROOT) / name;
585         statusObjects.emplace_back(
586             std::make_unique<Status>(event, path.c_str(), *this));
587     }
588     // The first device is master occ
589     pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
590         *statusObjects.front());
591 #ifdef POWER10
592     pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
593                                                    powermode::PIPS_PATH);
594     // Set the master OCC on the PowerMode object
595     pmode->setMasterOcc(path);
596 #endif
597 }
598 #endif
599 
600 #ifdef PLDM
sbeTimeout(unsigned int instance)601 void Manager::sbeTimeout(unsigned int instance)
602 {
603     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
604                             [instance](const auto& obj) {
605                                 return instance == obj->getOccInstanceID();
606                             });
607 
608     if (obj != statusObjects.end() && (*obj)->occActive())
609     {
610         lg2::info("SBE timeout, requesting HRESET (OCC{INST})", "INST",
611                   instance);
612 
613         setSBEState(instance, SBE_STATE_NOT_USABLE);
614 
615         pldmHandle->sendHRESET(instance);
616     }
617 }
618 
updateOCCActive(instanceID instance,bool status)619 bool Manager::updateOCCActive(instanceID instance, bool status)
620 {
621     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
622                             [instance](const auto& obj) {
623                                 return instance == obj->getOccInstanceID();
624                             });
625 
626     const bool hostRunning = open_power::occ::utils::isHostRunning();
627     if (obj != statusObjects.end())
628     {
629         if (!hostRunning && (status == true))
630         {
631             lg2::warning(
632                 "updateOCCActive: Host is not running yet (OCC{INST} active={STAT}), clearing sensor received",
633                 "INST", instance, "STAT", status);
634             (*obj)->setPldmSensorReceived(false);
635             if (!waitingForAllOccActiveSensors)
636             {
637                 lg2::info(
638                     "updateOCCActive: Waiting for Host and all OCC Active Sensors");
639                 waitingForAllOccActiveSensors = true;
640             }
641 #ifdef POWER10
642             discoverTimer->restartOnce(30s);
643 #endif
644             return false;
645         }
646         else
647         {
648             (*obj)->setPldmSensorReceived(true);
649             return (*obj)->occActive(status);
650         }
651     }
652     else
653     {
654         if (hostRunning)
655         {
656             lg2::warning(
657                 "updateOCCActive: No status object to update for OCC{INST} (active={STAT})",
658                 "INST", instance, "STAT", status);
659         }
660         else
661         {
662             if (status == true)
663             {
664                 lg2::warning(
665                     "updateOCCActive: No status objects and Host is not running yet (OCC{INST} active={STAT})",
666                     "INST", instance, "STAT", status);
667             }
668         }
669         if (status == true)
670         {
671             // OCC went active
672             queuedActiveState.insert(instance);
673         }
674         else
675         {
676             auto match = queuedActiveState.find(instance);
677             if (match != queuedActiveState.end())
678             {
679                 // OCC was disabled
680                 queuedActiveState.erase(match);
681             }
682         }
683         return false;
684     }
685 }
686 
687 // Called upon pldm event To set powermode Safe Mode State for system.
updateOccSafeMode(bool safeMode)688 void Manager::updateOccSafeMode(bool safeMode)
689 {
690 #ifdef POWER10
691     pmode->updateDbusSafeMode(safeMode);
692 #endif
693     // Update the processor throttle status on dbus
694     for (auto& obj : statusObjects)
695     {
696         obj->updateThrottle(safeMode, THROTTLED_SAFE);
697     }
698 }
699 
sbeHRESETResult(instanceID instance,bool success)700 void Manager::sbeHRESETResult(instanceID instance, bool success)
701 {
702     if (success)
703     {
704         lg2::info("HRESET succeeded (OCC{INST})", "INST", instance);
705 
706         setSBEState(instance, SBE_STATE_BOOTED);
707 
708         return;
709     }
710 
711     setSBEState(instance, SBE_STATE_FAILED);
712 
713     if (sbeCanDump(instance))
714     {
715         lg2::info("HRESET failed (OCC{INST}), triggering SBE dump", "INST",
716                   instance);
717 
718         auto& bus = utils::getBus();
719         uint32_t src6 = instance << 16;
720         uint32_t logId =
721             FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
722                             src6, "SBE command timeout");
723 
724         try
725         {
726             constexpr auto interface = "xyz.openbmc_project.Dump.Create";
727             constexpr auto function = "CreateDump";
728 
729             std::string service =
730                 utils::getService(OP_DUMP_OBJ_PATH, interface);
731             auto method = bus.new_method_call(service.c_str(), OP_DUMP_OBJ_PATH,
732                                               interface, function);
733 
734             std::map<std::string, std::variant<std::string, uint64_t>>
735                 createParams{
736                     {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
737                      uint64_t(logId)},
738                     {"com.ibm.Dump.Create.CreateParameters.DumpType",
739                      "com.ibm.Dump.Create.DumpType.SBE"},
740                     {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
741                      uint64_t(instance)},
742                 };
743 
744             method.append(createParams);
745 
746             auto response = bus.call(method);
747         }
748         catch (const sdbusplus::exception_t& e)
749         {
750             constexpr auto ERROR_DUMP_DISABLED =
751                 "xyz.openbmc_project.Dump.Create.Error.Disabled";
752             if (e.name() == ERROR_DUMP_DISABLED)
753             {
754                 lg2::info("Dump is disabled, skipping");
755             }
756             else
757             {
758                 lg2::error("Dump failed");
759             }
760         }
761     }
762 
763     // SBE Reset failed, try PM Complex reset
764     lg2::error("sbeHRESETResult: Forcing PM Complex reset");
765     resetOccRequest(instance);
766 }
767 
sbeCanDump(unsigned int instance)768 bool Manager::sbeCanDump(unsigned int instance)
769 {
770     struct pdbg_target* proc = getPdbgTarget(instance);
771 
772     if (!proc)
773     {
774         // allow the dump in the error case
775         return true;
776     }
777 
778     try
779     {
780         if (!openpower::phal::sbe::isDumpAllowed(proc))
781         {
782             return false;
783         }
784 
785         if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
786         {
787             return false;
788         }
789     }
790     catch (openpower::phal::exception::SbeError& e)
791     {
792         lg2::info("Failed to query SBE state");
793     }
794 
795     // allow the dump in the error case
796     return true;
797 }
798 
setSBEState(unsigned int instance,enum sbe_state state)799 void Manager::setSBEState(unsigned int instance, enum sbe_state state)
800 {
801     struct pdbg_target* proc = getPdbgTarget(instance);
802 
803     if (!proc)
804     {
805         return;
806     }
807 
808     try
809     {
810         openpower::phal::sbe::setState(proc, state);
811     }
812     catch (const openpower::phal::exception::SbeError& e)
813     {
814         lg2::error("Failed to set SBE state: {ERROR}", "ERROR", e.what());
815     }
816 }
817 
getPdbgTarget(unsigned int instance)818 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
819 {
820     if (!pdbgInitialized)
821     {
822         try
823         {
824             openpower::phal::pdbg::init();
825             pdbgInitialized = true;
826         }
827         catch (const openpower::phal::exception::PdbgError& e)
828         {
829             lg2::error("pdbg initialization failed");
830             return nullptr;
831         }
832     }
833 
834     struct pdbg_target* proc = nullptr;
835     pdbg_for_each_class_target("proc", proc)
836     {
837         if (pdbg_target_index(proc) == instance)
838         {
839             return proc;
840         }
841     }
842 
843     lg2::error("Failed to get pdbg target");
844     return nullptr;
845 }
846 #endif
847 
pollerTimerExpired()848 void Manager::pollerTimerExpired()
849 {
850     if (!_pollTimer)
851     {
852         lg2::error("pollerTimerExpired() ERROR: Timer not defined");
853         return;
854     }
855 
856 #ifdef POWER10
857     if (resetRequired)
858     {
859         lg2::error("pollerTimerExpired() - Initiating PM Complex reset");
860         initiateOccRequest(resetInstance);
861 
862         if (!waitForAllOccsTimer->isEnabled())
863         {
864             lg2::warning("pollerTimerExpired: Restarting waitForAllOccTimer");
865             // restart occ wait timer
866             waitForAllOccsTimer->restartOnce(60s);
867         }
868         return;
869     }
870 #endif
871 
872     for (auto& obj : statusObjects)
873     {
874         if (!obj->occActive())
875         {
876             // OCC is not running yet
877 #ifdef READ_OCC_SENSORS
878             auto id = obj->getOccInstanceID();
879             setSensorValueToNaN(id);
880 #endif
881             continue;
882         }
883 
884         // Read sysfs to force kernel to poll OCC
885         obj->readOccState();
886 
887 #ifdef READ_OCC_SENSORS
888         // Read occ sensor values
889         getSensorValues(obj);
890 #endif
891     }
892 
893     if (activeCount > 0)
894     {
895         // Restart OCC poll timer
896         _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
897     }
898     else
899     {
900         // No OCCs running, so poll timer will not be restarted
901         lg2::info(
902             "Manager::pollerTimerExpired: poll timer will not be restarted");
903     }
904 }
905 
906 #ifdef READ_OCC_SENSORS
readTempSensors(const fs::path & path,uint32_t occInstance)907 void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
908 {
909     // There may be more than one sensor with the same FRU type
910     // and label so make two passes: the first to read the temps
911     // from sysfs, and the second to put them on D-Bus after
912     // resolving any conflicts.
913     std::map<std::string, double> sensorData;
914 
915     std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
916     for (auto& file : fs::directory_iterator(path))
917     {
918         if (!std::regex_search(file.path().string(), expr))
919         {
920             continue;
921         }
922 
923         uint32_t labelValue{0};
924 
925         try
926         {
927             labelValue = readFile<uint32_t>(file.path());
928         }
929         catch (const std::system_error& e)
930         {
931             lg2::debug(
932                 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
933                 "PATH", file.path().string(), "ERROR", e.code().value());
934             continue;
935         }
936 
937         const std::string& tempLabel = "label";
938         const std::string filePathString = file.path().string().substr(
939             0, file.path().string().length() - tempLabel.length());
940 
941         uint32_t fruTypeValue{0};
942         try
943         {
944             fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
945         }
946         catch (const std::system_error& e)
947         {
948             lg2::debug(
949                 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
950                 "PATH", filePathString + fruTypeSuffix, "ERROR",
951                 e.code().value());
952             continue;
953         }
954 
955         std::string sensorPath =
956             OCC_SENSORS_ROOT + std::string("/temperature/");
957 
958         std::string dvfsTempPath;
959 
960         if (fruTypeValue == VRMVdd)
961         {
962             sensorPath.append(
963                 "vrm_vdd" + std::to_string(occInstance) + "_temp");
964         }
965         else if (fruTypeValue == processorIoRing)
966         {
967             sensorPath.append(
968                 "proc" + std::to_string(occInstance) + "_ioring_temp");
969             dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
970                            std::to_string(occInstance) + "_ioring_dvfs_temp";
971         }
972         else
973         {
974             uint16_t type = (labelValue & 0xFF000000) >> 24;
975             uint16_t instanceID = labelValue & 0x0000FFFF;
976 
977             if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
978             {
979                 if (fruTypeValue == fruTypeNotAvailable)
980                 {
981                     // Not all DIMM related temps are available to read
982                     // (no _input file in this case)
983                     continue;
984                 }
985                 auto iter = dimmTempSensorName.find(fruTypeValue);
986                 if (iter == dimmTempSensorName.end())
987                 {
988                     lg2::error(
989                         "readTempSensors: Fru type error! fruTypeValue = {FRU}) ",
990                         "FRU", fruTypeValue);
991                     continue;
992                 }
993 
994                 sensorPath.append(
995                     "dimm" + std::to_string(instanceID) + iter->second);
996 
997                 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
998                                dimmDVFSSensorName.at(fruTypeValue);
999             }
1000             else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
1001             {
1002                 if (fruTypeValue == processorCore)
1003                 {
1004                     // The OCC reports small core temps, of which there are
1005                     // two per big core.  All current P10 systems are in big
1006                     // core mode, so use a big core name.
1007                     uint16_t coreNum = instanceID / 2;
1008                     uint16_t tempNum = instanceID % 2;
1009                     sensorPath.append("proc" + std::to_string(occInstance) +
1010                                       "_core" + std::to_string(coreNum) + "_" +
1011                                       std::to_string(tempNum) + "_temp");
1012 
1013                     dvfsTempPath =
1014                         std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
1015                         std::to_string(occInstance) + "_core_dvfs_temp";
1016                 }
1017                 else
1018                 {
1019                     continue;
1020                 }
1021             }
1022             else
1023             {
1024                 continue;
1025             }
1026         }
1027 
1028         // The dvfs temp file only needs to be read once per chip per type.
1029         if (!dvfsTempPath.empty() &&
1030             !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
1031         {
1032             try
1033             {
1034                 auto dvfsValue = readFile<double>(filePathString + maxSuffix);
1035 
1036                 dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
1037                     dvfsTempPath, dvfsValue * std::pow(10, -3));
1038             }
1039             catch (const std::system_error& e)
1040             {
1041                 lg2::debug(
1042                     "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1043                     "PATH", filePathString + maxSuffix, "ERROR",
1044                     e.code().value());
1045             }
1046         }
1047 
1048         uint32_t faultValue{0};
1049         try
1050         {
1051             faultValue = readFile<uint32_t>(filePathString + faultSuffix);
1052         }
1053         catch (const std::system_error& e)
1054         {
1055             lg2::debug(
1056                 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1057                 "PATH", filePathString + faultSuffix, "ERROR",
1058                 e.code().value());
1059             continue;
1060         }
1061 
1062         double tempValue{0};
1063         // NOTE: if OCC sends back 0xFF, kernal sets this fault value to 1.
1064         if (faultValue != 0)
1065         {
1066             tempValue = std::numeric_limits<double>::quiet_NaN();
1067         }
1068         else
1069         {
1070             // Read the temperature
1071             try
1072             {
1073                 tempValue = readFile<double>(filePathString + inputSuffix);
1074             }
1075             catch (const std::system_error& e)
1076             {
1077                 lg2::debug(
1078                     "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1079                     "PATH", filePathString + inputSuffix, "ERROR",
1080                     e.code().value());
1081 
1082                 // if errno == EAGAIN(Resource temporarily unavailable) then set
1083                 // temp to 0, to avoid using old temp, and affecting FAN
1084                 // Control.
1085                 if (e.code().value() == EAGAIN)
1086                 {
1087                     tempValue = 0;
1088                 }
1089                 // else the errno would be something like
1090                 //     EBADF(Bad file descriptor)
1091                 // or ENOENT(No such file or directory)
1092                 else
1093                 {
1094                     continue;
1095                 }
1096             }
1097         }
1098 
1099         // If this object path already has a value, only overwite
1100         // it if the previous one was an NaN or a smaller value.
1101         auto existing = sensorData.find(sensorPath);
1102         if (existing != sensorData.end())
1103         {
1104             // Multiple sensors found for this FRU type
1105             if ((std::isnan(existing->second) && (tempValue == 0)) ||
1106                 ((existing->second == 0) && std::isnan(tempValue)))
1107             {
1108                 // One of the redundant sensors has failed (0xFF/nan), and the
1109                 // other sensor has no reading (0), so set the FRU to NaN to
1110                 // force fan increase
1111                 tempValue = std::numeric_limits<double>::quiet_NaN();
1112                 existing->second = tempValue;
1113             }
1114             if (std::isnan(existing->second) || (tempValue > existing->second))
1115             {
1116                 existing->second = tempValue;
1117             }
1118         }
1119         else
1120         {
1121             // First sensor for this FRU type
1122             sensorData[sensorPath] = tempValue;
1123         }
1124     }
1125 
1126     // Now publish the values on D-Bus.
1127     for (const auto& [objectPath, value] : sensorData)
1128     {
1129         dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
1130                                                     value * std::pow(10, -3));
1131 
1132         dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1133             objectPath, !std::isnan(value));
1134 
1135         if (existingSensors.find(objectPath) == existingSensors.end())
1136         {
1137             dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1138                 objectPath, {"all_sensors"});
1139         }
1140 
1141         existingSensors[objectPath] = occInstance;
1142     }
1143 }
1144 
1145 std::optional<std::string>
getPowerLabelFunctionID(const std::string & value)1146     Manager::getPowerLabelFunctionID(const std::string& value)
1147 {
1148     // If the value is "system", then the FunctionID is "system".
1149     if (value == "system")
1150     {
1151         return value;
1152     }
1153 
1154     // If the value is not "system", then the label value have 3 numbers, of
1155     // which we only care about the middle one:
1156     // <sensor id>_<function id>_<apss channel>
1157     // eg: The value is "0_10_5" , then the FunctionID is "10".
1158     if (value.find("_") == std::string::npos)
1159     {
1160         return std::nullopt;
1161     }
1162 
1163     auto powerLabelValue = value.substr((value.find("_") + 1));
1164 
1165     if (powerLabelValue.find("_") == std::string::npos)
1166     {
1167         return std::nullopt;
1168     }
1169 
1170     return powerLabelValue.substr(0, powerLabelValue.find("_"));
1171 }
1172 
readPowerSensors(const fs::path & path,uint32_t id)1173 void Manager::readPowerSensors(const fs::path& path, uint32_t id)
1174 {
1175     std::regex expr{"power\\d+_label$"}; // Example: power5_label
1176     for (auto& file : fs::directory_iterator(path))
1177     {
1178         if (!std::regex_search(file.path().string(), expr))
1179         {
1180             continue;
1181         }
1182 
1183         std::string labelValue;
1184         try
1185         {
1186             labelValue = readFile<std::string>(file.path());
1187         }
1188         catch (const std::system_error& e)
1189         {
1190             lg2::debug(
1191                 "readPowerSensors: Failed reading {PATH}, errno = {ERROR}",
1192                 "PATH", file.path().string(), "ERROR", e.code().value());
1193             continue;
1194         }
1195 
1196         auto functionID = getPowerLabelFunctionID(labelValue);
1197         if (functionID == std::nullopt)
1198         {
1199             continue;
1200         }
1201 
1202         const std::string& tempLabel = "label";
1203         const std::string filePathString = file.path().string().substr(
1204             0, file.path().string().length() - tempLabel.length());
1205 
1206         std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
1207 
1208         auto iter = powerSensorName.find(*functionID);
1209         if (iter == powerSensorName.end())
1210         {
1211             continue;
1212         }
1213         sensorPath.append(iter->second);
1214 
1215         double tempValue{0};
1216 
1217         try
1218         {
1219             tempValue = readFile<double>(filePathString + inputSuffix);
1220         }
1221         catch (const std::system_error& e)
1222         {
1223             lg2::debug(
1224                 "readPowerSensors: Failed reading {PATH}, errno = {ERROR}",
1225                 "PATH", filePathString + inputSuffix, "ERROR",
1226                 e.code().value());
1227             continue;
1228         }
1229 
1230         dbus::OccDBusSensors::getOccDBus().setUnit(
1231             sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");
1232 
1233         dbus::OccDBusSensors::getOccDBus().setValue(
1234             sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3));
1235 
1236         dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1237             sensorPath, true);
1238 
1239         if (existingSensors.find(sensorPath) == existingSensors.end())
1240         {
1241             std::vector<int> occs;
1242             std::vector<std::string> fTypeList = {"all_sensors"};
1243             if (iter->second == "total_power")
1244             {
1245                 // Total system power has its own chassis association
1246                 fTypeList.push_back("total_power");
1247             }
1248             dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1249                 sensorPath, fTypeList);
1250         }
1251 
1252         existingSensors[sensorPath] = id;
1253     }
1254     return;
1255 }
1256 
setSensorValueToNaN(uint32_t id) const1257 void Manager::setSensorValueToNaN(uint32_t id) const
1258 {
1259     for (const auto& [sensorPath, occId] : existingSensors)
1260     {
1261         if (occId == id)
1262         {
1263             dbus::OccDBusSensors::getOccDBus().setValue(
1264                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1265 
1266             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1267                 sensorPath, true);
1268         }
1269     }
1270     return;
1271 }
1272 
setSensorValueToNonFunctional(uint32_t id) const1273 void Manager::setSensorValueToNonFunctional(uint32_t id) const
1274 {
1275     for (const auto& [sensorPath, occId] : existingSensors)
1276     {
1277         if (occId == id)
1278         {
1279             dbus::OccDBusSensors::getOccDBus().setValue(
1280                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1281 
1282             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1283                 sensorPath, false);
1284         }
1285     }
1286     return;
1287 }
1288 
getSensorValues(std::unique_ptr<Status> & occ)1289 void Manager::getSensorValues(std::unique_ptr<Status>& occ)
1290 {
1291     static bool tracedError[8] = {0};
1292     const fs::path sensorPath = occ->getHwmonPath();
1293     const uint32_t id = occ->getOccInstanceID();
1294 
1295     if (fs::exists(sensorPath))
1296     {
1297         // Read temperature sensors
1298         readTempSensors(sensorPath, id);
1299 
1300         if (occ->isMasterOcc())
1301         {
1302             // Read power sensors
1303             readPowerSensors(sensorPath, id);
1304         }
1305         tracedError[id] = false;
1306     }
1307     else
1308     {
1309         if (!tracedError[id])
1310         {
1311             lg2::error(
1312                 "Manager::getSensorValues: OCC{INST} sensor path missing: {PATH}",
1313                 "INST", id, "PATH", sensorPath);
1314             tracedError[id] = true;
1315         }
1316     }
1317 
1318     return;
1319 }
1320 #endif
1321 
1322 // Read the altitude from DBus
readAltitude()1323 void Manager::readAltitude()
1324 {
1325     static bool traceAltitudeErr = true;
1326 
1327     utils::PropertyValue altitudeProperty{};
1328     try
1329     {
1330         altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE,
1331                                               ALTITUDE_PROP);
1332         auto sensorVal = std::get<double>(altitudeProperty);
1333         if (sensorVal < 0xFFFF)
1334         {
1335             if (sensorVal < 0)
1336             {
1337                 altitude = 0;
1338             }
1339             else
1340             {
1341                 // Round to nearest meter
1342                 altitude = uint16_t(sensorVal + 0.5);
1343             }
1344             lg2::debug("readAltitude: sensor={VALUE} ({ALT}m)", "VALUE",
1345                        sensorVal, "ALT", altitude);
1346             traceAltitudeErr = true;
1347         }
1348         else
1349         {
1350             if (traceAltitudeErr)
1351             {
1352                 traceAltitudeErr = false;
1353                 lg2::debug("Invalid altitude value: {ALT}", "ALT", sensorVal);
1354             }
1355         }
1356     }
1357     catch (const sdbusplus::exception_t& e)
1358     {
1359         if (traceAltitudeErr)
1360         {
1361             traceAltitudeErr = false;
1362             lg2::info("Unable to read Altitude: {ERROR}", "ERROR", e.what());
1363         }
1364         altitude = 0xFFFF; // not available
1365     }
1366 }
1367 
1368 // Callback function when ambient temperature changes
ambientCallback(sdbusplus::message_t & msg)1369 void Manager::ambientCallback(sdbusplus::message_t& msg)
1370 {
1371     double currentTemp = 0;
1372     uint8_t truncatedTemp = 0xFF;
1373     std::string msgSensor;
1374     std::map<std::string, std::variant<double>> msgData;
1375     msg.read(msgSensor, msgData);
1376 
1377     auto valPropMap = msgData.find(AMBIENT_PROP);
1378     if (valPropMap == msgData.end())
1379     {
1380         lg2::debug("ambientCallback: Unknown ambient property changed");
1381         return;
1382     }
1383     currentTemp = std::get<double>(valPropMap->second);
1384     if (std::isnan(currentTemp))
1385     {
1386         truncatedTemp = 0xFF;
1387     }
1388     else
1389     {
1390         if (currentTemp < 0)
1391         {
1392             truncatedTemp = 0;
1393         }
1394         else
1395         {
1396             // Round to nearest degree C
1397             truncatedTemp = uint8_t(currentTemp + 0.5);
1398         }
1399     }
1400 
1401     // If ambient changes, notify OCCs
1402     if (truncatedTemp != ambient)
1403     {
1404         lg2::debug("ambientCallback: Ambient change from {OLD} to {NEW}C",
1405                    "OLD", ambient, "NEW", currentTemp);
1406 
1407         ambient = truncatedTemp;
1408         if (altitude == 0xFFFF)
1409         {
1410             // No altitude yet, try reading again
1411             readAltitude();
1412         }
1413 
1414         lg2::debug("ambientCallback: Ambient: {TEMP}C, altitude: {ALT}m",
1415                    "TEMP", ambient, "ALT", altitude);
1416 #ifdef POWER10
1417         // Send ambient and altitude to all OCCs
1418         for (auto& obj : statusObjects)
1419         {
1420             if (obj->occActive())
1421             {
1422                 obj->sendAmbient(ambient, altitude);
1423             }
1424         }
1425 #endif // POWER10
1426     }
1427 }
1428 
1429 // return the current ambient and altitude readings
getAmbientData(bool & ambientValid,uint8_t & ambientTemp,uint16_t & altitudeValue) const1430 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1431                              uint16_t& altitudeValue) const
1432 {
1433     ambientValid = true;
1434     ambientTemp = ambient;
1435     altitudeValue = altitude;
1436 
1437     if (ambient == 0xFF)
1438     {
1439         ambientValid = false;
1440     }
1441 }
1442 
1443 #ifdef POWER10
1444 // Called when waitForAllOccsTimer expires
1445 // After the first OCC goes active, this timer will be started (60 seconds)
occsNotAllRunning()1446 void Manager::occsNotAllRunning()
1447 {
1448     if (resetInProgress)
1449     {
1450         lg2::warning(
1451             "occsNotAllRunning: Ignoring waitForAllOccsTimer because reset is in progress");
1452         return;
1453     }
1454     if (activeCount != statusObjects.size())
1455     {
1456         // Not all OCCs went active
1457         lg2::warning(
1458             "occsNotAllRunning: Active OCC count ({COUNT}) does not match expected count ({EXP})",
1459             "COUNT", activeCount, "EXP", statusObjects.size());
1460         // Procs may be garded, so may be expected
1461     }
1462 
1463     if (resetRequired)
1464     {
1465         initiateOccRequest(resetInstance);
1466 
1467         if (!waitForAllOccsTimer->isEnabled())
1468         {
1469             lg2::warning("occsNotAllRunning: Restarting waitForAllOccTimer");
1470             // restart occ wait timer
1471             waitForAllOccsTimer->restartOnce(60s);
1472         }
1473     }
1474     else
1475     {
1476         validateOccMaster();
1477     }
1478 }
1479 
1480 #ifdef PLDM
1481 // Called when throttlePldmTraceTimer expires.
1482 // If this timer expires, that indicates there are no OCC active sensor PDRs
1483 // found which will trigger pldm traces to be throttled.
1484 // The second time this timer expires, a PEL will get created.
throttlePldmTraceExpired()1485 void Manager::throttlePldmTraceExpired()
1486 {
1487     if (utils::isHostRunning())
1488     {
1489         if (!onPldmTimeoutCreatePel)
1490         {
1491             // Throttle traces
1492             pldmHandle->setTraceThrottle(true);
1493             // Restart timer to log a PEL when timer expires
1494             onPldmTimeoutCreatePel = true;
1495             throttlePldmTraceTimer->restartOnce(40min);
1496         }
1497         else
1498         {
1499             lg2::error(
1500                 "throttlePldmTraceExpired(): OCC active sensors still not available!");
1501             // Create PEL
1502             createPldmSensorPEL();
1503         }
1504     }
1505     else
1506     {
1507         // Make sure traces are not throttled
1508         pldmHandle->setTraceThrottle(false);
1509         lg2::info(
1510             "throttlePldmTraceExpired(): host it not running ignoring sensor timer");
1511     }
1512 }
1513 
createPldmSensorPEL()1514 void Manager::createPldmSensorPEL()
1515 {
1516     Error::Descriptor d = Error::Descriptor(MISSING_OCC_SENSORS_PATH);
1517     std::map<std::string, std::string> additionalData;
1518 
1519     additionalData.emplace("_PID", std::to_string(getpid()));
1520 
1521     lg2::info(
1522         "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs");
1523 
1524     auto& bus = utils::getBus();
1525 
1526     try
1527     {
1528         FFDCFiles ffdc;
1529         // Add occ-control journal traces to PEL FFDC
1530         auto occJournalFile =
1531             FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40);
1532 
1533         static constexpr auto loggingObjectPath =
1534             "/xyz/openbmc_project/logging";
1535         static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL";
1536         std::string service =
1537             utils::getService(loggingObjectPath, opLoggingInterface);
1538         auto method =
1539             bus.new_method_call(service.c_str(), loggingObjectPath,
1540                                 opLoggingInterface, "CreatePELWithFFDCFiles");
1541 
1542         // Set level to Warning (Predictive).
1543         auto level =
1544             sdbusplus::xyz::openbmc_project::Logging::server::convertForMessage(
1545                 sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level::
1546                     Warning);
1547 
1548         method.append(d.path, level, additionalData, ffdc);
1549         bus.call(method);
1550     }
1551     catch (const sdbusplus::exception_t& e)
1552     {
1553         lg2::error("Failed to create MISSING_OCC_SENSORS PEL: {ERROR}", "ERROR",
1554                    e.what());
1555     }
1556 }
1557 #endif // PLDM
1558 #endif // POWER10
1559 
1560 // Verify single master OCC and start presence monitor
validateOccMaster()1561 void Manager::validateOccMaster()
1562 {
1563     int masterInstance = -1;
1564     for (auto& obj : statusObjects)
1565     {
1566         auto instance = obj->getOccInstanceID();
1567 #ifdef POWER10
1568         if (!obj->occActive())
1569         {
1570             if (utils::isHostRunning())
1571             {
1572                 // Check if sensor was queued while waiting for discovery
1573                 auto match = queuedActiveState.find(instance);
1574                 if (match != queuedActiveState.end())
1575                 {
1576                     queuedActiveState.erase(match);
1577                     lg2::info("validateOccMaster: OCC{INST} is ACTIVE (queued)",
1578                               "INST", instance);
1579                     obj->occActive(true);
1580                 }
1581                 else
1582                 {
1583                     // OCC does not appear to be active yet, check active sensor
1584 #ifdef PLDM
1585                     pldmHandle->checkActiveSensor(instance);
1586 #endif
1587                     if (obj->occActive())
1588                     {
1589                         lg2::info(
1590                             "validateOccMaster: OCC{INST} is ACTIVE after reading sensor",
1591                             "INST", instance);
1592                     }
1593                 }
1594             }
1595             else
1596             {
1597                 lg2::warning(
1598                     "validateOccMaster: HOST is not running (OCC{INST})",
1599                     "INST", instance);
1600                 return;
1601             }
1602         }
1603 #endif // POWER10
1604 
1605         if (obj->isMasterOcc())
1606         {
1607             obj->addPresenceWatchMaster();
1608 
1609             if (masterInstance == -1)
1610             {
1611                 masterInstance = instance;
1612             }
1613             else
1614             {
1615                 lg2::error(
1616                     "validateOccMaster: Multiple OCC masters! ({MAST1} and {MAST2})",
1617                     "MAST1", masterInstance, "MAST2", instance);
1618                 // request reset
1619                 obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
1620             }
1621         }
1622     }
1623 
1624     if (masterInstance < 0)
1625     {
1626         lg2::error("validateOccMaster: Master OCC not found! (of {NUM} OCCs)",
1627                    "NUM", statusObjects.size());
1628         // request reset
1629         statusObjects.front()->deviceError(
1630             Error::Descriptor(PRESENCE_ERROR_PATH));
1631     }
1632     else
1633     {
1634         lg2::info("validateOccMaster: OCC{INST} is master of {COUNT} OCCs",
1635                   "INST", masterInstance, "COUNT", activeCount);
1636 #ifdef POWER10
1637         pmode->updateDbusSafeMode(false);
1638 #endif
1639     }
1640 }
1641 
updatePcapBounds() const1642 void Manager::updatePcapBounds() const
1643 {
1644     if (pcap)
1645     {
1646         pcap->updatePcapBounds();
1647     }
1648 }
1649 
1650 } // namespace occ
1651 } // namespace open_power
1652