1 #include "config.h"
2 
3 #include "occ_manager.hpp"
4 
5 #include "i2c_occ.hpp"
6 #include "occ_dbus.hpp"
7 #include "occ_errors.hpp"
8 #include "utils.hpp"
9 
10 #include <phosphor-logging/elog-errors.hpp>
11 #include <phosphor-logging/log.hpp>
12 #include <xyz/openbmc_project/Common/error.hpp>
13 
14 #include <chrono>
15 #include <cmath>
16 #include <filesystem>
17 #include <fstream>
18 #include <regex>
19 
20 namespace open_power
21 {
22 namespace occ
23 {
24 
// NOTE(review): presumably the fru_type value reported when a sensor has no
// FRU type; the use site is outside this part of the file - confirm.
constexpr uint32_t fruTypeNotAvailable = 0xFF;
// Suffixes of per-sensor sysfs files. fruTypeSuffix is appended to a sensor's
// file name prefix (the "..._label" path with "label" removed) in
// readTempSensors(); the others are presumably used the same way - confirm.
constexpr auto fruTypeSuffix = "fru_type";
constexpr auto faultSuffix = "fault";
constexpr auto inputSuffix = "input";
constexpr auto maxSuffix = "max";

// While this file exists, findAndCreateObjects() postpones OCC discovery
// and re-arms the discover timer.
const auto HOST_ON_FILE = "/run/openbmc/host@0-on";

using namespace phosphor::logging;
using namespace std::literals::chrono_literals;
35 
// Read a single value of type T from the file at 'path' using operator>>.
//
// Returns the parsed value.
// Throws std::system_error (generic category) if the file cannot be opened,
// the value cannot be extracted, or end-of-file is hit during extraction
// (eofbit is part of the exception mask, so a value that is not followed by
// whitespace/newline is treated as a failure). The error code is the errno
// left by the failing operation, or EIO (std::errc::io_error) when the
// failure did not set errno (for example a parse error), so callers never
// see a "success" code on a failed read.
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit |
                   std::ifstream::eofbit);
    T data;

    // Clear errno so a stale value from an unrelated earlier call is never
    // reported as the cause of a failure in this function.
    errno = 0;

    try
    {
        ifs.open(path);
        ifs >> data;
        ifs.close();
    }
    catch (const std::exception&)
    {
        // Capture errno immediately, before anything else can change it
        const auto err = errno;
        if (err == 0)
        {
            // The stream failed without an OS level error (e.g. the file
            // content could not be parsed as T)
            throw std::system_error(
                std::make_error_code(std::errc::io_error));
        }
        throw std::system_error(err, std::generic_category());
    }

    return data;
}
58 
// findAndCreateObjects():
// Takes care of getting the required objects created and
// finds the available devices/processors.
// (function is called every time the discoverTimer expires)
// - create the PowerMode object to control OCC modes
// - create statusObjects for each OCC device found
// - waits for OCC Active sensors PDRs to become available
// - restart discoverTimer if all data is not available yet
//
// Without POWER10 the function simply creates one set of objects per
// possible CPU (0 .. MAX_CPUS-1) and does not use the timer.
void Manager::findAndCreateObjects()
{
#ifndef POWER10
    for (auto id = 0; id < MAX_CPUS; ++id)
    {
        // Create one occ per cpu
        auto occ = std::string(OCC_NAME) + std::to_string(id);
        createObjects(occ);
    }
#else
    if (!pmode)
    {
        // Create the power mode object
        pmode = std::make_unique<powermode::PowerMode>(
            *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
    }

    // Discovery only proceeds once HOST_ON_FILE no longer exists; while it
    // is present the else branch below just re-arms the timer.
    if (!fs::exists(HOST_ON_FILE))
    {
        // Function-local static: the status objects are created only once
        // for the lifetime of the process.
        static bool statusObjCreated = false;
        if (!statusObjCreated)
        {
            // Create the OCCs based on the /dev/occX devices
            auto occs = findOCCsInDev();

            if (occs.empty() || (prevOCCSearch.size() != occs.size()))
            {
                // Something changed or no OCCs yet, try again in 10s.
                // Note on the first pass prevOCCSearch will be empty,
                // so there will be at least one delay to give things
                // a chance to settle.
                prevOCCSearch = occs;

                log<level::INFO>(
                    std::format(
                        "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})",
                        occs.size())
                        .c_str());

                discoverTimer->restartOnce(10s);
            }
            else
            {
                // All OCCs appear to be available, create status objects

                // createObjects requires OCC0 first.
                std::sort(occs.begin(), occs.end());

                log<level::INFO>(
                    std::format(
                        "Manager::findAndCreateObjects(): Creating {} OCC Status Objects",
                        occs.size())
                        .c_str());
                for (auto id : occs)
                {
                    createObjects(std::string(OCC_NAME) + std::to_string(id));
                }
                statusObjCreated = true;
                waitingForAllOccActiveSensors = true;

                // Find/update the processor path associated with each OCC
                for (auto& obj : statusObjects)
                {
                    obj->updateProcAssociation();
                }
            }
        }

        // Objects exist but the OCC Active sensors have not all been seen
        // yet: check them if the host is up, otherwise poll again in 30s.
        if (statusObjCreated && waitingForAllOccActiveSensors)
        {
            // Ensures each "waiting"/"running" transition is traced once
            static bool tracedHostWait = false;
            if (utils::isHostRunning())
            {
                if (tracedHostWait)
                {
                    log<level::INFO>(
                        "Manager::findAndCreateObjects(): Host is running");
                    tracedHostWait = false;
                }
                checkAllActiveSensors();
            }
            else
            {
                if (!tracedHostWait)
                {
                    log<level::INFO>(
                        "Manager::findAndCreateObjects(): Waiting for host to start");
                    tracedHostWait = true;
                }
                discoverTimer->restartOnce(30s);
#ifdef PLDM
                if (throttlePldmTraceTimer->isEnabled())
                {
                    // Host is no longer running, disable throttle timer and
                    // make sure traces are not throttled
                    log<level::INFO>(
                        "findAndCreateObjects(): disabling sensor timer");
                    throttlePldmTraceTimer->setEnabled(false);
                    pldmHandle->setTraceThrottle(false);
                }
#endif
            }
        }
    }
    else
    {
        // HOST_ON_FILE still exists: retry in 10s
        log<level::INFO>(
            std::format(
                "Manager::findAndCreateObjects(): Waiting for {} to complete...",
                HOST_ON_FILE)
                .c_str());
        discoverTimer->restartOnce(10s);
    }
#endif
}
182 
183 #ifdef POWER10
// Check if all occActive sensors are available
//
// While the host is running, walks the status objects looking for one that
// is neither active nor has received its PLDM sensor. Such an OCC is marked
// active if its state was queued in queuedActiveState; otherwise the walk
// stops and the discover timer is re-armed (10s) to try again. Once every
// sensor is available the discover timer (and, with PLDM, the trace throttle
// timer) is disabled and any pending reset request is initiated.
void Manager::checkAllActiveSensors()
{
    // Function-local statics keep their values between invocations
    static bool allActiveSensorAvailable = false;
    static bool tracedSensorWait = false;
    static bool waitingForHost = false;

    if (open_power::occ::utils::isHostRunning())
    {
        if (waitingForHost)
        {
            waitingForHost = false;
            log<level::INFO>("checkAllActiveSensors(): Host is now running");
        }

        // Start with the assumption that all are available
        allActiveSensorAvailable = true;
        for (auto& obj : statusObjects)
        {
            if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
            {
                auto instance = obj->getOccInstanceID();
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    log<level::INFO>(
                        std::format(
                            "checkAllActiveSensors(): OCC{} is ACTIVE (queued)",
                            instance)
                            .c_str());
                    obj->occActive(true);
                }
                else
                {
                    allActiveSensorAvailable = false;
                    if (!tracedSensorWait)
                    {
                        log<level::INFO>(
                            std::format(
                                "checkAllActiveSensors(): Waiting on OCC{} Active sensor",
                                instance)
                                .c_str());
                        tracedSensorWait = true;
#ifdef PLDM
                        // Make sure PLDM traces are not throttled
                        pldmHandle->setTraceThrottle(false);
                        // Start timer to throttle PLDM traces when timer
                        // expires
                        onPldmTimeoutCreatePel = false;
                        throttlePldmTraceTimer->restartOnce(5min);
#endif
                    }
#ifdef PLDM
                    // Ignore active sensor check if the OCCs are being reset
                    if (!resetInProgress)
                    {
                        pldmHandle->checkActiveSensor(obj->getOccInstanceID());
                    }
#endif
                    // Stop at the first missing sensor; the remaining OCCs
                    // are examined on a later invocation.
                    break;
                }
            }
        }
    }
    else
    {
        // Host not running: allActiveSensorAvailable keeps the value from
        // the previous invocation.
        if (!waitingForHost)
        {
            waitingForHost = true;
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for host to start");
#ifdef PLDM
            if (throttlePldmTraceTimer->isEnabled())
            {
                // Host is no longer running, disable throttle timer and
                // make sure traces are not throttled
                log<level::INFO>(
                    "checkAllActiveSensors(): disabling sensor timer");
                throttlePldmTraceTimer->setEnabled(false);
                pldmHandle->setTraceThrottle(false);
            }
#endif
        }
    }

    if (allActiveSensorAvailable)
    {
        // All sensors were found, disable the discovery timer
        if (discoverTimer->isEnabled())
        {
            discoverTimer->setEnabled(false);
        }
#ifdef PLDM
        if (throttlePldmTraceTimer->isEnabled())
        {
            // Disable throttle timer and make sure traces are not throttled
            throttlePldmTraceTimer->setEnabled(false);
            pldmHandle->setTraceThrottle(false);
        }
#endif
        if (waitingForAllOccActiveSensors)
        {
            log<level::INFO>(
                "checkAllActiveSensors(): OCC Active sensors are available");
            waitingForAllOccActiveSensors = false;

            // A reset that was requested while waiting is started now
            if (resetRequired)
            {
                initiateOccRequest(resetInstance);

                if (!waitForAllOccsTimer->isEnabled())
                {
                    log<level::WARNING>(
                        "occsNotAllRunning: Restarting waitForAllOccTimer");
                    // restart occ wait timer to check status after reset
                    // completes
                    waitForAllOccsTimer->restartOnce(60s);
                }
            }
        }
        queuedActiveState.clear();
        tracedSensorWait = false;
    }
    else
    {
        // Not all sensors were available, so keep waiting
        if (!tracedSensorWait)
        {
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
            tracedSensorWait = true;
        }
        discoverTimer->restartOnce(10s);
    }
}
321 #endif
322 
findOCCsInDev()323 std::vector<int> Manager::findOCCsInDev()
324 {
325     std::vector<int> occs;
326     std::regex expr{R"(occ(\d+)$)"};
327 
328     for (auto& file : fs::directory_iterator("/dev"))
329     {
330         std::smatch match;
331         std::string path{file.path().string()};
332         if (std::regex_search(path, match, expr))
333         {
334             auto num = std::stoi(match[1].str());
335 
336             // /dev numbering starts at 1, ours starts at 0.
337             occs.push_back(num - 1);
338         }
339     }
340 
341     return occs;
342 }
343 
cpuCreated(sdbusplus::message_t & msg)344 int Manager::cpuCreated(sdbusplus::message_t& msg)
345 {
346     namespace fs = std::filesystem;
347 
348     sdbusplus::message::object_path o;
349     msg.read(o);
350     fs::path cpuPath(std::string(std::move(o)));
351 
352     auto name = cpuPath.filename().string();
353     auto index = name.find(CPU_NAME);
354     name.replace(index, std::strlen(CPU_NAME), OCC_NAME);
355 
356     createObjects(name);
357 
358     return 0;
359 }
360 
createObjects(const std::string & occ)361 void Manager::createObjects(const std::string& occ)
362 {
363     auto path = fs::path(OCC_CONTROL_ROOT) / occ;
364 
365     statusObjects.emplace_back(std::make_unique<Status>(
366         event, path.c_str(), *this,
367 #ifdef POWER10
368         pmode,
369 #endif
370         std::bind(std::mem_fn(&Manager::statusCallBack), this,
371                   std::placeholders::_1, std::placeholders::_2)
372 #ifdef PLDM
373             ,
374         // Callback will set flag indicating reset needs to be done
375         // instead of immediately issuing a reset via PLDM.
376         std::bind(std::mem_fn(&Manager::resetOccRequest), this,
377                   std::placeholders::_1)
378 #endif
379             ));
380 
381     // Create the power cap monitor object
382     if (!pcap)
383     {
384         pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
385             *statusObjects.back());
386     }
387 
388     if (statusObjects.back()->isMasterOcc())
389     {
390         log<level::INFO>(
391             std::format("Manager::createObjects(): OCC{} is the master",
392                         statusObjects.back()->getOccInstanceID())
393                 .c_str());
394         _pollTimer->setEnabled(false);
395 
396 #ifdef POWER10
397         // Set the master OCC on the PowerMode object
398         pmode->setMasterOcc(path);
399 #endif
400     }
401 
402     passThroughObjects.emplace_back(std::make_unique<PassThrough>(
403         path.c_str()
404 #ifdef POWER10
405             ,
406         pmode
407 #endif
408         ));
409 }
410 
411 // If a reset is not already outstanding, set a flag to indicate that a reset is
412 // needed.
resetOccRequest(instanceID instance)413 void Manager::resetOccRequest(instanceID instance)
414 {
415     if (!resetRequired)
416     {
417         resetRequired = true;
418         resetInstance = instance;
419         log<level::ERR>(
420             std::format(
421                 "resetOccRequest: PM Complex reset was requested due to OCC{}",
422                 instance)
423                 .c_str());
424     }
425     else if (instance != resetInstance)
426     {
427         log<level::WARNING>(
428             std::format(
429                 "resetOccRequest: Ignoring PM Complex reset request for OCC{}, because reset already outstanding for OCC{}",
430                 instance, resetInstance)
431                 .c_str());
432     }
433 }
434 
435 // If a reset has not been started, initiate an OCC reset via PLDM
initiateOccRequest(instanceID instance)436 void Manager::initiateOccRequest(instanceID instance)
437 {
438     if (!resetInProgress)
439     {
440         resetInProgress = true;
441         resetInstance = instance;
442         log<level::ERR>(
443             std::format(
444                 "initiateOccRequest: Initiating PM Complex reset due to OCC{}",
445                 instance)
446                 .c_str());
447 #ifdef PLDM
448         pldmHandle->resetOCC(instance);
449 #endif
450         resetRequired = false;
451     }
452     else
453     {
454         log<level::WARNING>(
455             std::format(
456                 "initiateOccRequest: Ignoring PM Complex reset request for OCC{}, because reset already in process for OCC{}",
457                 instance, resetInstance)
458                 .c_str());
459     }
460 }
461 
// Callback bound into each Status object by createObjects(); invoked with
// the OCC instance and its new active state.
// - status == true:  counts the OCC as active (ignored while a reset is in
//   progress), manages waitForAllOccsTimer (POWER10), validates the master
//   OCC or initiates a pending reset once every status object is active,
//   and starts polling if the poll timer is not running.
// - status == false: un-counts the OCC; once none are active it clears
//   resetInProgress and stops the poll (and POWER10 wait) timers.
// Finally (POWER10) re-checks the OCC Active sensors if still waiting on
// them and the host is running.
void Manager::statusCallBack(instanceID instance, bool status)
{
    if (status == true)
    {
        if (resetInProgress)
        {
            log<level::INFO>(
                std::format(
                    "statusCallBack: Ignoring OCC{} activate because a reset has been initiated due to OCC{}",
                    instance, resetInstance)
                    .c_str());
            return;
        }

        // OCC went active
        ++activeCount;

#ifdef POWER10
        if (activeCount == 1)
        {
            // First OCC went active (allow some time for all OCCs to go active)
            waitForAllOccsTimer->restartOnce(60s);
        }
#endif

        if (activeCount == statusObjects.size())
        {
#ifdef POWER10
            // All OCCs are now running
            if (waitForAllOccsTimer->isEnabled())
            {
                // stop occ wait timer
                waitForAllOccsTimer->setEnabled(false);
            }

            // All OCCs have been found, check if we need a reset
            if (resetRequired)
            {
                initiateOccRequest(resetInstance);

                if (!waitForAllOccsTimer->isEnabled())
                {
                    log<level::WARNING>(
                        "occsNotAllRunning: Restarting waitForAllOccTimer");
                    // restart occ wait timer
                    waitForAllOccsTimer->restartOnce(60s);
                }
            }
            else
            {
                // Verify master OCC and start presence monitor
                validateOccMaster();
            }
#else
            // Verify master OCC and start presence monitor
            validateOccMaster();
#endif
        }

        // Start poll timer if not already started
        if (!_pollTimer->isEnabled())
        {
            log<level::INFO>(
                std::format("Manager: OCCs will be polled every {} seconds",
                            pollInterval)
                    .c_str());

            // Send poll and start OCC poll timer
            pollerTimerExpired();
        }
    }
    else
    {
        // OCC went away
        if (activeCount > 0)
        {
            --activeCount;
        }
        else
        {
            // Guard against decrementing below zero
            log<level::INFO>(
                std::format("OCC{} disabled, but currently no active OCCs",
                            instance)
                    .c_str());
        }

        if (activeCount == 0)
        {
            // No OCCs are running

            if (resetInProgress)
            {
                // All OCC active sensors are clear (reset should be in
                // progress)
                log<level::INFO>(
                    std::format(
                        "statusCallBack: Clearing resetInProgress (activeCount={}, OCC{}, status={})",
                        activeCount, instance, status)
                        .c_str());
                resetInProgress = false;
                // NOTE(review): 255 looks like a "no instance" marker -
                // confirm against the member's declaration.
                resetInstance = 255;
            }

            // Stop OCC poll timer
            if (_pollTimer->isEnabled())
            {
                log<level::INFO>(
                    "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
                _pollTimer->setEnabled(false);
            }

#ifdef POWER10
            // stop wait timer
            if (waitForAllOccsTimer->isEnabled())
            {
                waitForAllOccsTimer->setEnabled(false);
            }
#endif
        }
        else if (resetInProgress)
        {
            // Some OCCs are still active, so the reset is not finished yet
            log<level::INFO>(
                std::format(
                    "statusCallBack: Skipping clear of resetInProgress (activeCount={}, OCC{}, status={})",
                    activeCount, instance, status)
                    .c_str());
        }
#ifdef READ_OCC_SENSORS
        // Clear OCC sensors
        setSensorValueToNaN(instance);
#endif
    }

#ifdef POWER10
    if (waitingForAllOccActiveSensors)
    {
        if (utils::isHostRunning())
        {
            checkAllActiveSensors();
        }
    }
#endif
}
605 
606 #ifdef I2C_OCC
initStatusObjects()607 void Manager::initStatusObjects()
608 {
609     // Make sure we have a valid path string
610     static_assert(sizeof(DEV_PATH) != 0);
611 
612     auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
613     for (auto& name : deviceNames)
614     {
615         i2c_occ::i2cToDbus(name);
616         name = std::string(OCC_NAME) + '_' + name;
617         auto path = fs::path(OCC_CONTROL_ROOT) / name;
618         statusObjects.emplace_back(
619             std::make_unique<Status>(event, path.c_str(), *this));
620     }
621     // The first device is master occ
622     pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
623         *statusObjects.front());
624 #ifdef POWER10
625     pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
626                                                    powermode::PIPS_PATH);
627     // Set the master OCC on the PowerMode object
628     pmode->setMasterOcc(path);
629 #endif
630 }
631 #endif
632 
633 #ifdef PLDM
sbeTimeout(unsigned int instance)634 void Manager::sbeTimeout(unsigned int instance)
635 {
636     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
637                             [instance](const auto& obj) {
638                                 return instance == obj->getOccInstanceID();
639                             });
640 
641     if (obj != statusObjects.end() && (*obj)->occActive())
642     {
643         log<level::INFO>(
644             std::format("SBE timeout, requesting HRESET (OCC{})", instance)
645                 .c_str());
646 
647         setSBEState(instance, SBE_STATE_NOT_USABLE);
648 
649         pldmHandle->sendHRESET(instance);
650     }
651 }
652 
updateOCCActive(instanceID instance,bool status)653 bool Manager::updateOCCActive(instanceID instance, bool status)
654 {
655     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
656                             [instance](const auto& obj) {
657                                 return instance == obj->getOccInstanceID();
658                             });
659 
660     const bool hostRunning = open_power::occ::utils::isHostRunning();
661     if (obj != statusObjects.end())
662     {
663         if (!hostRunning && (status == true))
664         {
665             log<level::WARNING>(
666                 std::format(
667                     "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received",
668                     instance, status)
669                     .c_str());
670             (*obj)->setPldmSensorReceived(false);
671             if (!waitingForAllOccActiveSensors)
672             {
673                 log<level::INFO>(
674                     "updateOCCActive: Waiting for Host and all OCC Active Sensors");
675                 waitingForAllOccActiveSensors = true;
676             }
677 #ifdef POWER10
678             discoverTimer->restartOnce(30s);
679 #endif
680             return false;
681         }
682         else
683         {
684             (*obj)->setPldmSensorReceived(true);
685             return (*obj)->occActive(status);
686         }
687     }
688     else
689     {
690         if (hostRunning)
691         {
692             log<level::WARNING>(
693                 std::format(
694                     "updateOCCActive: No status object to update for OCC{} (active={})",
695                     instance, status)
696                     .c_str());
697         }
698         else
699         {
700             if (status == true)
701             {
702                 log<level::WARNING>(
703                     std::format(
704                         "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})",
705                         instance, status)
706                         .c_str());
707             }
708         }
709         if (status == true)
710         {
711             // OCC went active
712             queuedActiveState.insert(instance);
713         }
714         else
715         {
716             auto match = queuedActiveState.find(instance);
717             if (match != queuedActiveState.end())
718             {
719                 // OCC was disabled
720                 queuedActiveState.erase(match);
721             }
722         }
723         return false;
724     }
725 }
726 
727 // Called upon pldm event To set powermode Safe Mode State for system.
updateOccSafeMode(bool safeMode)728 void Manager::updateOccSafeMode(bool safeMode)
729 {
730 #ifdef POWER10
731     pmode->updateDbusSafeMode(safeMode);
732 #endif
733     // Update the processor throttle status on dbus
734     for (auto& obj : statusObjects)
735     {
736         obj->updateThrottle(safeMode, THROTTLED_SAFE);
737     }
738 }
739 
sbeHRESETResult(instanceID instance,bool success)740 void Manager::sbeHRESETResult(instanceID instance, bool success)
741 {
742     if (success)
743     {
744         log<level::INFO>(
745             std::format("HRESET succeeded (OCC{})", instance).c_str());
746 
747         setSBEState(instance, SBE_STATE_BOOTED);
748 
749         return;
750     }
751 
752     setSBEState(instance, SBE_STATE_FAILED);
753 
754     if (sbeCanDump(instance))
755     {
756         log<level::INFO>(
757             std::format("HRESET failed (OCC{}), triggering SBE dump", instance)
758                 .c_str());
759 
760         auto& bus = utils::getBus();
761         uint32_t src6 = instance << 16;
762         uint32_t logId =
763             FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
764                             src6, "SBE command timeout");
765 
766         try
767         {
768             constexpr auto interface = "xyz.openbmc_project.Dump.Create";
769             constexpr auto function = "CreateDump";
770 
771             std::string service =
772                 utils::getService(OP_DUMP_OBJ_PATH, interface);
773             auto method = bus.new_method_call(service.c_str(), OP_DUMP_OBJ_PATH,
774                                               interface, function);
775 
776             std::map<std::string, std::variant<std::string, uint64_t>>
777                 createParams{
778                     {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
779                      uint64_t(logId)},
780                     {"com.ibm.Dump.Create.CreateParameters.DumpType",
781                      "com.ibm.Dump.Create.DumpType.SBE"},
782                     {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
783                      uint64_t(instance)},
784                 };
785 
786             method.append(createParams);
787 
788             auto response = bus.call(method);
789         }
790         catch (const sdbusplus::exception_t& e)
791         {
792             constexpr auto ERROR_DUMP_DISABLED =
793                 "xyz.openbmc_project.Dump.Create.Error.Disabled";
794             if (e.name() == ERROR_DUMP_DISABLED)
795             {
796                 log<level::INFO>("Dump is disabled, skipping");
797             }
798             else
799             {
800                 log<level::ERR>("Dump failed");
801             }
802         }
803     }
804 
805     // SBE Reset failed, try PM Complex reset
806     log<level::ERR>("sbeHRESETResult: Forcing PM Complex reset");
807     resetOccRequest(instance);
808 }
809 
sbeCanDump(unsigned int instance)810 bool Manager::sbeCanDump(unsigned int instance)
811 {
812     struct pdbg_target* proc = getPdbgTarget(instance);
813 
814     if (!proc)
815     {
816         // allow the dump in the error case
817         return true;
818     }
819 
820     try
821     {
822         if (!openpower::phal::sbe::isDumpAllowed(proc))
823         {
824             return false;
825         }
826 
827         if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
828         {
829             return false;
830         }
831     }
832     catch (openpower::phal::exception::SbeError& e)
833     {
834         log<level::INFO>("Failed to query SBE state");
835     }
836 
837     // allow the dump in the error case
838     return true;
839 }
840 
setSBEState(unsigned int instance,enum sbe_state state)841 void Manager::setSBEState(unsigned int instance, enum sbe_state state)
842 {
843     struct pdbg_target* proc = getPdbgTarget(instance);
844 
845     if (!proc)
846     {
847         return;
848     }
849 
850     try
851     {
852         openpower::phal::sbe::setState(proc, state);
853     }
854     catch (const openpower::phal::exception::SbeError& e)
855     {
856         log<level::ERR>(
857             std::format("Failed to set SBE state: {}", e.what()).c_str());
858     }
859 }
860 
getPdbgTarget(unsigned int instance)861 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
862 {
863     if (!pdbgInitialized)
864     {
865         try
866         {
867             openpower::phal::pdbg::init();
868             pdbgInitialized = true;
869         }
870         catch (const openpower::phal::exception::PdbgError& e)
871         {
872             log<level::ERR>("pdbg initialization failed");
873             return nullptr;
874         }
875     }
876 
877     struct pdbg_target* proc = nullptr;
878     pdbg_for_each_class_target("proc", proc)
879     {
880         if (pdbg_target_index(proc) == instance)
881         {
882             return proc;
883         }
884     }
885 
886     log<level::ERR>("Failed to get pdbg target");
887     return nullptr;
888 }
889 #endif
890 
pollerTimerExpired()891 void Manager::pollerTimerExpired()
892 {
893     if (!_pollTimer)
894     {
895         log<level::ERR>("pollerTimerExpired() ERROR: Timer not defined");
896         return;
897     }
898 
899 #ifdef POWER10
900     if (resetRequired)
901     {
902         log<level::ERR>("pollerTimerExpired() - Initiating PM Complex reset");
903         initiateOccRequest(resetInstance);
904 
905         if (!waitForAllOccsTimer->isEnabled())
906         {
907             log<level::WARNING>(
908                 "pollerTimerExpired: Restarting waitForAllOccTimer");
909             // restart occ wait timer
910             waitForAllOccsTimer->restartOnce(60s);
911         }
912         return;
913     }
914 #endif
915 
916     for (auto& obj : statusObjects)
917     {
918         if (!obj->occActive())
919         {
920             // OCC is not running yet
921 #ifdef READ_OCC_SENSORS
922             auto id = obj->getOccInstanceID();
923             setSensorValueToNaN(id);
924 #endif
925             continue;
926         }
927 
928         // Read sysfs to force kernel to poll OCC
929         obj->readOccState();
930 
931 #ifdef READ_OCC_SENSORS
932         // Read occ sensor values
933         getSensorValues(obj);
934 #endif
935     }
936 
937     if (activeCount > 0)
938     {
939         // Restart OCC poll timer
940         _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
941     }
942     else
943     {
944         // No OCCs running, so poll timer will not be restarted
945         log<level::INFO>(
946             std::format(
947                 "Manager::pollerTimerExpired: poll timer will not be restarted")
948                 .c_str());
949     }
950 }
951 
952 #ifdef READ_OCC_SENSORS
// Reads the temperature sensors of one OCC and publishes them on D-Bus.
//
// Scans `path` for tempN_label files. For each one the sibling
// tempN_fru_type, tempN_fault, tempN_input (and, once per DVFS object path,
// tempN_max) files are read to work out the D-Bus object name and value.
//
// @param path         directory holding the tempN_* files
// @param occInstance  OCC instance number; used in the object names and
//                     recorded in existingSensors as the owner of each path
//
// A sensor whose fault file is non-zero is published as NaN, which also sets
// its operational status to false. Files that cannot be read are logged at
// DEBUG level and the sensor is skipped, with one exception: EAGAIN on the
// input file publishes 0.
void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
{
    // There may be more than one sensor with the same FRU type
    // and label so make two passes: the first to read the temps
    // from sysfs, and the second to put them on D-Bus after
    // resolving any conflicts.
    std::map<std::string, double> sensorData;

    std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
    for (auto& file : fs::directory_iterator(path))
    {
        if (!std::regex_search(file.path().string(), expr))
        {
            continue;
        }

        uint32_t labelValue{0};

        try
        {
            labelValue = readFile<uint32_t>(file.path());
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readTempSensors: Failed reading {}, errno = {}",
                            file.path().string(), e.code().value())
                    .c_str());
            continue;
        }

        // Strip the trailing "label" to get the common ".../tempN_" prefix
        // shared by the other files of this sensor.
        const std::string& tempLabel = "label";
        const std::string filePathString = file.path().string().substr(
            0, file.path().string().length() - tempLabel.length());

        uint32_t fruTypeValue{0};
        try
        {
            fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readTempSensors: Failed reading {}, errno = {}",
                            filePathString + fruTypeSuffix, e.code().value())
                    .c_str());
            continue;
        }

        // D-Bus object path of this sensor; the name is appended below
        std::string sensorPath =
            OCC_SENSORS_ROOT + std::string("/temperature/");

        // D-Bus object path of the matching DVFS temperature; stays empty
        // for sensor kinds that have none (e.g. VRM Vdd)
        std::string dvfsTempPath;

        if (fruTypeValue == VRMVdd)
        {
            sensorPath.append(
                "vrm_vdd" + std::to_string(occInstance) + "_temp");
        }
        else if (fruTypeValue == processorIoRing)
        {
            sensorPath.append(
                "proc" + std::to_string(occInstance) + "_ioring_temp");
            dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
                           std::to_string(occInstance) + "_ioring_dvfs_temp";
        }
        else
        {
            // The label packs the sensor type into its top byte and the
            // instance ID into its low 16 bits.
            uint16_t type = (labelValue & 0xFF000000) >> 24;
            uint16_t instanceID = labelValue & 0x0000FFFF;

            if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
            {
                if (fruTypeValue == fruTypeNotAvailable)
                {
                    // Not all DIMM related temps are available to read
                    // (no _input file in this case)
                    continue;
                }
                auto iter = dimmTempSensorName.find(fruTypeValue);
                if (iter == dimmTempSensorName.end())
                {
                    log<level::ERR>(
                        std::format(
                            "readTempSensors: Fru type error! fruTypeValue = {}) ",
                            fruTypeValue)
                            .c_str());
                    continue;
                }

                sensorPath.append(
                    "dimm" + std::to_string(instanceID) + iter->second);

                // NOTE(review): at() throws std::out_of_range if
                // dimmDVFSSensorName has no entry for this FRU type;
                // presumably it has the same keys as dimmTempSensorName -
                // TODO confirm.
                dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
                               dimmDVFSSensorName.at(fruTypeValue);
            }
            else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
            {
                if (fruTypeValue == processorCore)
                {
                    // The OCC reports small core temps, of which there are
                    // two per big core.  All current P10 systems are in big
                    // core mode, so use a big core name.
                    uint16_t coreNum = instanceID / 2;
                    uint16_t tempNum = instanceID % 2;
                    sensorPath.append("proc" + std::to_string(occInstance) +
                                      "_core" + std::to_string(coreNum) + "_" +
                                      std::to_string(tempNum) + "_temp");

                    dvfsTempPath =
                        std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
                        std::to_string(occInstance) + "_core_dvfs_temp";
                }
                else
                {
                    // Other CPU FRU types are not published
                    continue;
                }
            }
            else
            {
                // Unknown sensor type: nothing to publish
                continue;
            }
        }

        // The dvfs temp file only needs to be read once per chip per type.
        if (!dvfsTempPath.empty() &&
            !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
        {
            try
            {
                auto dvfsValue = readFile<double>(filePathString + maxSuffix);

                // Scaled by 10^-3 like the sensor values published below
                dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
                    dvfsTempPath, dvfsValue * std::pow(10, -3));
            }
            catch (const std::system_error& e)
            {
                // A missing DVFS value does not stop the sensor itself
                // from being processed.
                log<level::DEBUG>(
                    std::format(
                        "readTempSensors: Failed reading {}, errno = {}",
                        filePathString + maxSuffix, e.code().value())
                        .c_str());
            }
        }

        uint32_t faultValue{0};
        try
        {
            faultValue = readFile<uint32_t>(filePathString + faultSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readTempSensors: Failed reading {}, errno = {}",
                            filePathString + faultSuffix, e.code().value())
                    .c_str());
            continue;
        }

        double tempValue{0};
        // NOTE: if OCC sends back 0xFF, kernel sets this fault value to 1.
        if (faultValue != 0)
        {
            tempValue = std::numeric_limits<double>::quiet_NaN();
        }
        else
        {
            // Read the temperature
            try
            {
                tempValue = readFile<double>(filePathString + inputSuffix);
            }
            catch (const std::system_error& e)
            {
                log<level::DEBUG>(
                    std::format(
                        "readTempSensors: Failed reading {}, errno = {}",
                        filePathString + inputSuffix, e.code().value())
                        .c_str());

                // if errno == EAGAIN(Resource temporarily unavailable) then set
                // temp to 0, to avoid using old temp, and affecting FAN
                // Control.
                if (e.code().value() == EAGAIN)
                {
                    tempValue = 0;
                }
                // else the errno would be something like
                //     EBADF(Bad file descriptor)
                // or ENOENT(No such file or directory)
                else
                {
                    continue;
                }
            }
        }

        // If this object path already has a value, only overwrite
        // it if the previous one was an NaN or a smaller value.
        auto existing = sensorData.find(sensorPath);
        if (existing != sensorData.end())
        {
            // Multiple sensors found for this FRU type
            if ((std::isnan(existing->second) && (tempValue == 0)) ||
                ((existing->second == 0) && std::isnan(tempValue)))
            {
                // One of the redundant sensors has failed (0xFF/nan), and the
                // other sensor has no reading (0), so set the FRU to NaN to
                // force fan increase
                tempValue = std::numeric_limits<double>::quiet_NaN();
                existing->second = tempValue;
            }
            // Comparisons against NaN are false, so a NaN tempValue never
            // replaces a real reading here.
            if (std::isnan(existing->second) || (tempValue > existing->second))
            {
                existing->second = tempValue;
            }
        }
        else
        {
            // First sensor for this FRU type
            sensorData[sensorPath] = tempValue;
        }
    }

    // Now publish the values on D-Bus.
    for (const auto& [objectPath, value] : sensorData)
    {
        // Raw readings are scaled by 10^-3 before being published
        dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
                                                    value * std::pow(10, -3));

        // A NaN reading marks the sensor as not operational
        dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
            objectPath, !std::isnan(value));

        // The chassis association is only set the first time a path is seen
        if (existingSensors.find(objectPath) == existingSensors.end())
        {
            dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
                objectPath);
        }

        // Remember which OCC owns this sensor path
        existingSensors[objectPath] = occInstance;
    }
}
1195 
1196 std::optional<std::string>
getPowerLabelFunctionID(const std::string & value)1197     Manager::getPowerLabelFunctionID(const std::string& value)
1198 {
1199     // If the value is "system", then the FunctionID is "system".
1200     if (value == "system")
1201     {
1202         return value;
1203     }
1204 
1205     // If the value is not "system", then the label value have 3 numbers, of
1206     // which we only care about the middle one:
1207     // <sensor id>_<function id>_<apss channel>
1208     // eg: The value is "0_10_5" , then the FunctionID is "10".
1209     if (value.find("_") == std::string::npos)
1210     {
1211         return std::nullopt;
1212     }
1213 
1214     auto powerLabelValue = value.substr((value.find("_") + 1));
1215 
1216     if (powerLabelValue.find("_") == std::string::npos)
1217     {
1218         return std::nullopt;
1219     }
1220 
1221     return powerLabelValue.substr(0, powerLabelValue.find("_"));
1222 }
1223 
readPowerSensors(const fs::path & path,uint32_t id)1224 void Manager::readPowerSensors(const fs::path& path, uint32_t id)
1225 {
1226     std::regex expr{"power\\d+_label$"}; // Example: power5_label
1227     for (auto& file : fs::directory_iterator(path))
1228     {
1229         if (!std::regex_search(file.path().string(), expr))
1230         {
1231             continue;
1232         }
1233 
1234         std::string labelValue;
1235         try
1236         {
1237             labelValue = readFile<std::string>(file.path());
1238         }
1239         catch (const std::system_error& e)
1240         {
1241             log<level::DEBUG>(
1242                 std::format("readPowerSensors: Failed reading {}, errno = {}",
1243                             file.path().string(), e.code().value())
1244                     .c_str());
1245             continue;
1246         }
1247 
1248         auto functionID = getPowerLabelFunctionID(labelValue);
1249         if (functionID == std::nullopt)
1250         {
1251             continue;
1252         }
1253 
1254         const std::string& tempLabel = "label";
1255         const std::string filePathString = file.path().string().substr(
1256             0, file.path().string().length() - tempLabel.length());
1257 
1258         std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
1259 
1260         auto iter = powerSensorName.find(*functionID);
1261         if (iter == powerSensorName.end())
1262         {
1263             continue;
1264         }
1265         sensorPath.append(iter->second);
1266 
1267         double tempValue{0};
1268 
1269         try
1270         {
1271             tempValue = readFile<double>(filePathString + inputSuffix);
1272         }
1273         catch (const std::system_error& e)
1274         {
1275             log<level::DEBUG>(
1276                 std::format("readPowerSensors: Failed reading {}, errno = {}",
1277                             filePathString + inputSuffix, e.code().value())
1278                     .c_str());
1279             continue;
1280         }
1281 
1282         dbus::OccDBusSensors::getOccDBus().setUnit(
1283             sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");
1284 
1285         dbus::OccDBusSensors::getOccDBus().setValue(
1286             sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3));
1287 
1288         dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1289             sensorPath, true);
1290 
1291         if (existingSensors.find(sensorPath) == existingSensors.end())
1292         {
1293             dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1294                 sensorPath);
1295         }
1296 
1297         existingSensors[sensorPath] = id;
1298     }
1299     return;
1300 }
1301 
setSensorValueToNaN(uint32_t id) const1302 void Manager::setSensorValueToNaN(uint32_t id) const
1303 {
1304     for (const auto& [sensorPath, occId] : existingSensors)
1305     {
1306         if (occId == id)
1307         {
1308             dbus::OccDBusSensors::getOccDBus().setValue(
1309                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1310 
1311             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1312                 sensorPath, true);
1313         }
1314     }
1315     return;
1316 }
1317 
setSensorValueToNonFunctional(uint32_t id) const1318 void Manager::setSensorValueToNonFunctional(uint32_t id) const
1319 {
1320     for (const auto& [sensorPath, occId] : existingSensors)
1321     {
1322         if (occId == id)
1323         {
1324             dbus::OccDBusSensors::getOccDBus().setValue(
1325                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1326 
1327             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1328                 sensorPath, false);
1329         }
1330     }
1331     return;
1332 }
1333 
getSensorValues(std::unique_ptr<Status> & occ)1334 void Manager::getSensorValues(std::unique_ptr<Status>& occ)
1335 {
1336     static bool tracedError[8] = {0};
1337     const fs::path sensorPath = occ->getHwmonPath();
1338     const uint32_t id = occ->getOccInstanceID();
1339 
1340     if (fs::exists(sensorPath))
1341     {
1342         // Read temperature sensors
1343         readTempSensors(sensorPath, id);
1344 
1345         if (occ->isMasterOcc())
1346         {
1347             // Read power sensors
1348             readPowerSensors(sensorPath, id);
1349         }
1350         tracedError[id] = false;
1351     }
1352     else
1353     {
1354         if (!tracedError[id])
1355         {
1356             log<level::ERR>(
1357                 std::format(
1358                     "Manager::getSensorValues: OCC{} sensor path missing: {}",
1359                     id, sensorPath.c_str())
1360                     .c_str());
1361             tracedError[id] = true;
1362         }
1363     }
1364 
1365     return;
1366 }
1367 #endif
1368 
1369 // Read the altitude from DBus
readAltitude()1370 void Manager::readAltitude()
1371 {
1372     static bool traceAltitudeErr = true;
1373 
1374     utils::PropertyValue altitudeProperty{};
1375     try
1376     {
1377         altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE,
1378                                               ALTITUDE_PROP);
1379         auto sensorVal = std::get<double>(altitudeProperty);
1380         if (sensorVal < 0xFFFF)
1381         {
1382             if (sensorVal < 0)
1383             {
1384                 altitude = 0;
1385             }
1386             else
1387             {
1388                 // Round to nearest meter
1389                 altitude = uint16_t(sensorVal + 0.5);
1390             }
1391             log<level::DEBUG>(std::format("readAltitude: sensor={} ({}m)",
1392                                           sensorVal, altitude)
1393                                   .c_str());
1394             traceAltitudeErr = true;
1395         }
1396         else
1397         {
1398             if (traceAltitudeErr)
1399             {
1400                 traceAltitudeErr = false;
1401                 log<level::DEBUG>(
1402                     std::format("Invalid altitude value: {}", sensorVal)
1403                         .c_str());
1404             }
1405         }
1406     }
1407     catch (const sdbusplus::exception_t& e)
1408     {
1409         if (traceAltitudeErr)
1410         {
1411             traceAltitudeErr = false;
1412             log<level::INFO>(
1413                 std::format("Unable to read Altitude: {}", e.what()).c_str());
1414         }
1415         altitude = 0xFFFF; // not available
1416     }
1417 }
1418 
1419 // Callback function when ambient temperature changes
ambientCallback(sdbusplus::message_t & msg)1420 void Manager::ambientCallback(sdbusplus::message_t& msg)
1421 {
1422     double currentTemp = 0;
1423     uint8_t truncatedTemp = 0xFF;
1424     std::string msgSensor;
1425     std::map<std::string, std::variant<double>> msgData;
1426     msg.read(msgSensor, msgData);
1427 
1428     auto valPropMap = msgData.find(AMBIENT_PROP);
1429     if (valPropMap == msgData.end())
1430     {
1431         log<level::DEBUG>("ambientCallback: Unknown ambient property changed");
1432         return;
1433     }
1434     currentTemp = std::get<double>(valPropMap->second);
1435     if (std::isnan(currentTemp))
1436     {
1437         truncatedTemp = 0xFF;
1438     }
1439     else
1440     {
1441         if (currentTemp < 0)
1442         {
1443             truncatedTemp = 0;
1444         }
1445         else
1446         {
1447             // Round to nearest degree C
1448             truncatedTemp = uint8_t(currentTemp + 0.5);
1449         }
1450     }
1451 
1452     // If ambient changes, notify OCCs
1453     if (truncatedTemp != ambient)
1454     {
1455         log<level::DEBUG>(
1456             std::format("ambientCallback: Ambient change from {} to {}C",
1457                         ambient, currentTemp)
1458                 .c_str());
1459 
1460         ambient = truncatedTemp;
1461         if (altitude == 0xFFFF)
1462         {
1463             // No altitude yet, try reading again
1464             readAltitude();
1465         }
1466 
1467         log<level::DEBUG>(
1468             std::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient,
1469                         altitude)
1470                 .c_str());
1471 #ifdef POWER10
1472         // Send ambient and altitude to all OCCs
1473         for (auto& obj : statusObjects)
1474         {
1475             if (obj->occActive())
1476             {
1477                 obj->sendAmbient(ambient, altitude);
1478             }
1479         }
1480 #endif // POWER10
1481     }
1482 }
1483 
1484 // return the current ambient and altitude readings
getAmbientData(bool & ambientValid,uint8_t & ambientTemp,uint16_t & altitudeValue) const1485 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1486                              uint16_t& altitudeValue) const
1487 {
1488     ambientValid = true;
1489     ambientTemp = ambient;
1490     altitudeValue = altitude;
1491 
1492     if (ambient == 0xFF)
1493     {
1494         ambientValid = false;
1495     }
1496 }
1497 
1498 #ifdef POWER10
1499 // Called when waitForAllOccsTimer expires
1500 // After the first OCC goes active, this timer will be started (60 seconds)
occsNotAllRunning()1501 void Manager::occsNotAllRunning()
1502 {
1503     if (resetInProgress)
1504     {
1505         log<level::WARNING>(
1506             "occsNotAllRunning: Ignoring waitForAllOccsTimer because reset is in progress");
1507         return;
1508     }
1509     if (activeCount != statusObjects.size())
1510     {
1511         // Not all OCCs went active
1512         log<level::WARNING>(
1513             std::format(
1514                 "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})",
1515                 activeCount, statusObjects.size())
1516                 .c_str());
1517         // Procs may be garded, so may be expected
1518     }
1519 
1520     if (resetRequired)
1521     {
1522         initiateOccRequest(resetInstance);
1523 
1524         if (!waitForAllOccsTimer->isEnabled())
1525         {
1526             log<level::WARNING>(
1527                 "occsNotAllRunning: Restarting waitForAllOccTimer");
1528             // restart occ wait timer
1529             waitForAllOccsTimer->restartOnce(60s);
1530         }
1531     }
1532     else
1533     {
1534         validateOccMaster();
1535     }
1536 }
1537 
1538 #ifdef PLDM
1539 // Called when throttlePldmTraceTimer expires.
1540 // If this timer expires, that indicates there are no OCC active sensor PDRs
1541 // found which will trigger pldm traces to be throttled.
1542 // The second time this timer expires, a PEL will get created.
throttlePldmTraceExpired()1543 void Manager::throttlePldmTraceExpired()
1544 {
1545     if (utils::isHostRunning())
1546     {
1547         if (!onPldmTimeoutCreatePel)
1548         {
1549             // Throttle traces
1550             pldmHandle->setTraceThrottle(true);
1551             // Restart timer to log a PEL when timer expires
1552             onPldmTimeoutCreatePel = true;
1553             throttlePldmTraceTimer->restartOnce(40min);
1554         }
1555         else
1556         {
1557             log<level::ERR>(
1558                 "throttlePldmTraceExpired(): OCC active sensors still not available!");
1559             // Create PEL
1560             createPldmSensorPEL();
1561         }
1562     }
1563     else
1564     {
1565         // Make sure traces are not throttled
1566         pldmHandle->setTraceThrottle(false);
1567         log<level::INFO>(
1568             "throttlePldmTraceExpired(): host it not running ignoring sensor timer");
1569     }
1570 }
1571 
createPldmSensorPEL()1572 void Manager::createPldmSensorPEL()
1573 {
1574     Error::Descriptor d = Error::Descriptor(MISSING_OCC_SENSORS_PATH);
1575     std::map<std::string, std::string> additionalData;
1576 
1577     additionalData.emplace("_PID", std::to_string(getpid()));
1578 
1579     log<level::INFO>(
1580         std::format(
1581             "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs")
1582             .c_str());
1583 
1584     auto& bus = utils::getBus();
1585 
1586     try
1587     {
1588         FFDCFiles ffdc;
1589         // Add occ-control journal traces to PEL FFDC
1590         auto occJournalFile =
1591             FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40);
1592 
1593         static constexpr auto loggingObjectPath =
1594             "/xyz/openbmc_project/logging";
1595         static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL";
1596         std::string service =
1597             utils::getService(loggingObjectPath, opLoggingInterface);
1598         auto method =
1599             bus.new_method_call(service.c_str(), loggingObjectPath,
1600                                 opLoggingInterface, "CreatePELWithFFDCFiles");
1601 
1602         // Set level to Warning (Predictive).
1603         auto level =
1604             sdbusplus::xyz::openbmc_project::Logging::server::convertForMessage(
1605                 sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level::
1606                     Warning);
1607 
1608         method.append(d.path, level, additionalData, ffdc);
1609         bus.call(method);
1610     }
1611     catch (const sdbusplus::exception_t& e)
1612     {
1613         log<level::ERR>(
1614             std::format("Failed to create MISSING_OCC_SENSORS PEL: {}",
1615                         e.what())
1616                 .c_str());
1617     }
1618 }
1619 #endif // PLDM
1620 #endif // POWER10
1621 
1622 // Verify single master OCC and start presence monitor
validateOccMaster()1623 void Manager::validateOccMaster()
1624 {
1625     int masterInstance = -1;
1626     for (auto& obj : statusObjects)
1627     {
1628         auto instance = obj->getOccInstanceID();
1629 #ifdef POWER10
1630         if (!obj->occActive())
1631         {
1632             if (utils::isHostRunning())
1633             {
1634                 // Check if sensor was queued while waiting for discovery
1635                 auto match = queuedActiveState.find(instance);
1636                 if (match != queuedActiveState.end())
1637                 {
1638                     queuedActiveState.erase(match);
1639                     log<level::INFO>(
1640                         std::format(
1641                             "validateOccMaster: OCC{} is ACTIVE (queued)",
1642                             instance)
1643                             .c_str());
1644                     obj->occActive(true);
1645                 }
1646                 else
1647                 {
1648                     // OCC does not appear to be active yet, check active sensor
1649 #ifdef PLDM
1650                     pldmHandle->checkActiveSensor(instance);
1651 #endif
1652                     if (obj->occActive())
1653                     {
1654                         log<level::INFO>(
1655                             std::format(
1656                                 "validateOccMaster: OCC{} is ACTIVE after reading sensor",
1657                                 instance)
1658                                 .c_str());
1659                     }
1660                 }
1661             }
1662             else
1663             {
1664                 log<level::WARNING>(
1665                     std::format(
1666                         "validateOccMaster: HOST is not running (OCC{})",
1667                         instance)
1668                         .c_str());
1669                 return;
1670             }
1671         }
1672 #endif // POWER10
1673 
1674         if (obj->isMasterOcc())
1675         {
1676             obj->addPresenceWatchMaster();
1677 
1678             if (masterInstance == -1)
1679             {
1680                 masterInstance = instance;
1681             }
1682             else
1683             {
1684                 log<level::ERR>(
1685                     std::format(
1686                         "validateOccMaster: Multiple OCC masters! ({} and {})",
1687                         masterInstance, instance)
1688                         .c_str());
1689                 // request reset
1690                 obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
1691             }
1692         }
1693     }
1694 
1695     if (masterInstance < 0)
1696     {
1697         log<level::ERR>(
1698             std::format("validateOccMaster: Master OCC not found! (of {} OCCs)",
1699                         statusObjects.size())
1700                 .c_str());
1701         // request reset
1702         statusObjects.front()->deviceError(
1703             Error::Descriptor(PRESENCE_ERROR_PATH));
1704     }
1705     else
1706     {
1707         log<level::INFO>(
1708             std::format("validateOccMaster: OCC{} is master of {} OCCs",
1709                         masterInstance, activeCount)
1710                 .c_str());
1711 #ifdef POWER10
1712         pmode->updateDbusSafeMode(false);
1713 #endif
1714     }
1715 }
1716 
updatePcapBounds() const1717 void Manager::updatePcapBounds() const
1718 {
1719     if (pcap)
1720     {
1721         pcap->updatePcapBounds();
1722     }
1723 }
1724 
1725 } // namespace occ
1726 } // namespace open_power
1727