xref: /openbmc/openpower-occ-control/occ_manager.cpp (revision b89d619cadbdcea10ba773e4ac9dfdc2efd7dd37)
1 #include "config.h"
2 
3 #include "occ_manager.hpp"
4 
5 #include "i2c_occ.hpp"
6 #include "occ_dbus.hpp"
7 #include "occ_errors.hpp"
8 #include "utils.hpp"
9 
10 #include <phosphor-logging/elog-errors.hpp>
11 #include <phosphor-logging/lg2.hpp>
12 #include <xyz/openbmc_project/Common/error.hpp>
13 
14 #include <chrono>
15 #include <cmath>
16 #include <filesystem>
17 #include <fstream>
18 #include <regex>
19 
20 namespace open_power
21 {
22 namespace occ
23 {
24 
// FRU type value used to mark a sensor whose FRU type is not available.
constexpr uint32_t fruTypeNotAvailable = 0xFF;

// File-name suffixes appended to a sensor's hwmon file prefix.
constexpr auto fruTypeSuffix = "fru_type";
constexpr auto faultSuffix = "fault";
constexpr auto inputSuffix = "input";
constexpr auto maxSuffix = "max";

// While this file exists, findAndCreateObjects() postpones OCC discovery.
// constexpr (rather than plain const) to match the other string constants.
constexpr auto HOST_ON_FILE = "/run/openbmc/host@0-on";
32 
33 using namespace phosphor::logging;
34 using namespace std::literals::chrono_literals;
35 
/** @brief Read a single value of type T from a file.
 *
 *  @tparam T - type to extract with operator>>
 *  @param[in] path - file to read
 *
 *  @return the extracted value
 *
 *  @throws std::system_error if the file cannot be opened or the value
 *          cannot be extracted. The error code is errno when it is set,
 *          otherwise std::errc::io_error, so the code is never 0.
 */
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    // eofbit is deliberately NOT part of the exception mask: reaching
    // end-of-file right after a successfully extracted value (file without
    // a trailing newline) is not an error. An empty file still fails
    // because extraction sets failbit.
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
    T data;

    // Do not report a stale errno left behind by an unrelated earlier call
    errno = 0;

    try
    {
        ifs.open(path);
        ifs >> data;
        ifs.close();
    }
    catch (const std::exception&)
    {
        const auto err = errno;
        if (err != 0)
        {
            throw std::system_error(err, std::generic_category());
        }
        // Stream failure that did not set errno (e.g. unparsable content)
        throw std::system_error(std::make_error_code(std::errc::io_error));
    }

    return data;
}
58 
createPldmHandle()59 void Manager::createPldmHandle()
60 {
61 #ifdef PLDM
62     pldmHandle = std::make_unique<pldm::Interface>(
63         std::bind(std::mem_fn(&Manager::updateOCCActive), this,
64                   std::placeholders::_1, std::placeholders::_2),
65         std::bind(std::mem_fn(&Manager::sbeHRESETResult), this,
66                   std::placeholders::_1, std::placeholders::_2),
67         std::bind(std::mem_fn(&Manager::updateOccSafeMode), this,
68                   std::placeholders::_1),
69         event);
70 #endif
71 }
72 
73 // findAndCreateObjects():
74 // Takes care of getting the required objects created and
75 // finds the available devices/processors.
76 // (function is called everytime the discoverTimer expires)
77 // - create the PowerMode object to control OCC modes
78 // - create statusObjects for each OCC device found
79 // - waits for OCC Active sensors PDRs to become available
80 // - restart discoverTimer if all data is not available yet
findAndCreateObjects()81 void Manager::findAndCreateObjects()
82 {
83 #ifndef POWER10
84     for (auto id = 0; id < MAX_CPUS; ++id)
85     {
86         // Create one occ per cpu
87         auto occ = std::string(OCC_NAME) + std::to_string(id);
88         createObjects(occ);
89     }
90 #else
91     if (!pmode)
92     {
93         // Create the power mode object
94         pmode = std::make_unique<powermode::PowerMode>(
95             *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
96     }
97 
98     if (!fs::exists(HOST_ON_FILE))
99     {
100         static bool statusObjCreated = false;
101         if (!statusObjCreated)
102         {
103             // Create the OCCs based on on the /dev/occX devices
104             auto occs = findOCCsInDev();
105 
106             if (occs.empty() || (prevOCCSearch.size() != occs.size()))
107             {
108                 // Something changed or no OCCs yet, try again in 10s.
109                 // Note on the first pass prevOCCSearch will be empty,
110                 // so there will be at least one delay to give things
111                 // a chance to settle.
112                 prevOCCSearch = occs;
113 
114                 lg2::info(
115                     "Manager::findAndCreateObjects(): Waiting for OCCs (currently {QTY})",
116                     "QTY", occs.size());
117 
118                 discoverTimer->restartOnce(10s);
119             }
120             else
121             {
122                 // All OCCs appear to be available, create status objects
123 
124                 // createObjects requires OCC0 first.
125                 std::sort(occs.begin(), occs.end());
126 
127                 lg2::info(
128                     "Manager::findAndCreateObjects(): Creating {QTY} OCC Status Objects",
129                     "QTY", occs.size());
130                 for (auto id : occs)
131                 {
132                     createObjects(std::string(OCC_NAME) + std::to_string(id));
133                 }
134                 statusObjCreated = true;
135                 waitingForAllOccActiveSensors = true;
136 
137                 // Find/update the processor path associated with each OCC
138                 for (auto& obj : statusObjects)
139                 {
140                     obj->updateProcAssociation();
141                 }
142             }
143         }
144 
145         if (statusObjCreated && waitingForAllOccActiveSensors)
146         {
147             static bool tracedHostWait = false;
148             if (utils::isHostRunning())
149             {
150                 if (tracedHostWait)
151                 {
152                     lg2::info(
153                         "Manager::findAndCreateObjects(): Host is running");
154                     tracedHostWait = false;
155                 }
156                 checkAllActiveSensors();
157             }
158             else
159             {
160                 if (!tracedHostWait)
161                 {
162                     lg2::info(
163                         "Manager::findAndCreateObjects(): Waiting for host to start");
164                     tracedHostWait = true;
165                 }
166                 discoverTimer->restartOnce(30s);
167 #ifdef PLDM
168                 if (throttlePldmTraceTimer->isEnabled())
169                 {
170                     // Host is no longer running, disable throttle timer and
171                     // make sure traces are not throttled
172                     lg2::info("findAndCreateObjects(): disabling sensor timer");
173                     throttlePldmTraceTimer->setEnabled(false);
174                     pldmHandle->setTraceThrottle(false);
175                 }
176 #endif
177             }
178         }
179     }
180     else
181     {
182         lg2::info(
183             "Manager::findAndCreateObjects(): Waiting for {FILE} to complete...",
184             "FILE", HOST_ON_FILE);
185         discoverTimer->restartOnce(10s);
186     }
187 #endif
188 }
189 
#ifdef POWER10
// Check whether an OCC Active sensor has been seen for every status object.
// While any is missing the discoverTimer is re-armed (10s) so the check
// runs again; once all are present the discovery/throttle timers are
// stopped and a pending reset request (resetRequired) is initiated.
void Manager::checkAllActiveSensors()
{
    // State kept across calls
    static bool allSensorsAvailable = false;
    static bool sensorWaitTraced = false;
    static bool hostWaitTraced = false;

    if (!open_power::occ::utils::isHostRunning())
    {
        if (!hostWaitTraced)
        {
            hostWaitTraced = true;
            lg2::info("checkAllActiveSensors(): Waiting for host to start");
#ifdef PLDM
            if (throttlePldmTraceTimer->isEnabled())
            {
                // Host is no longer running, disable throttle timer and
                // make sure traces are not throttled
                lg2::info("checkAllActiveSensors(): disabling sensor timer");
                throttlePldmTraceTimer->setEnabled(false);
                pldmHandle->setTraceThrottle(false);
            }
#endif
        }
    }
    else
    {
        if (hostWaitTraced)
        {
            hostWaitTraced = false;
            lg2::info("checkAllActiveSensors(): Host is now running");
        }

        // Assume every sensor is available until one is found missing
        allSensorsAvailable = true;
        for (auto& occStatus : statusObjects)
        {
            if (occStatus->occActive() || occStatus->getPldmSensorReceived())
            {
                // Nothing to wait for on this OCC
                continue;
            }

            const auto instance = occStatus->getOccInstanceID();

            // Check if sensor was queued while waiting for discovery
            if (auto queued = queuedActiveState.find(instance);
                queued != queuedActiveState.end())
            {
                queuedActiveState.erase(queued);
                lg2::info(
                    "checkAllActiveSensors(): OCC{INST} is ACTIVE (queued)",
                    "INST", instance);
                occStatus->occActive(true);
                continue;
            }

            allSensorsAvailable = false;
            if (!sensorWaitTraced)
            {
                lg2::info(
                    "checkAllActiveSensors(): Waiting on OCC{INST} Active sensor",
                    "INST", instance);
                sensorWaitTraced = true;
#ifdef PLDM
                // Make sure PLDM traces are not throttled
                pldmHandle->setTraceThrottle(false);
                // Start timer to throttle PLDM traces when timer
                // expires
                onPldmTimeoutCreatePel = false;
                throttlePldmTraceTimer->restartOnce(5min);
#endif
            }
#ifdef PLDM
            // Ignore active sensor check if the OCCs are being reset
            if (!resetInProgress)
            {
                pldmHandle->checkActiveSensor(occStatus->getOccInstanceID());
            }
#endif
            // Stop at the first OCC whose sensor is still missing
            break;
        }
    }

    if (!allSensorsAvailable)
    {
        // Not all sensors were available, so keep waiting
        if (!sensorWaitTraced)
        {
            lg2::info(
                "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
            sensorWaitTraced = true;
        }
        discoverTimer->restartOnce(10s);
        return;
    }

    // All sensors were found, disable the discovery timer
    if (discoverTimer->isEnabled())
    {
        discoverTimer->setEnabled(false);
    }
#ifdef PLDM
    if (throttlePldmTraceTimer->isEnabled())
    {
        // Disable throttle timer and make sure traces are not throttled
        throttlePldmTraceTimer->setEnabled(false);
        pldmHandle->setTraceThrottle(false);
    }
#endif
    if (waitingForAllOccActiveSensors)
    {
        lg2::info("checkAllActiveSensors(): OCC Active sensors are available");
        waitingForAllOccActiveSensors = false;

        if (resetRequired)
        {
            initiateOccRequest(resetInstance);

            if (!waitForAllOccsTimer->isEnabled())
            {
                lg2::warning(
                    "occsNotAllRunning: Restarting waitForAllOccTimer");
                // restart occ wait timer to check status after reset
                // completes
                waitForAllOccsTimer->restartOnce(60s);
            }
        }
    }
    queuedActiveState.clear();
    sensorWaitTraced = false;
}
#endif
323 
findOCCsInDev()324 std::vector<int> Manager::findOCCsInDev()
325 {
326     std::vector<int> occs;
327     std::regex expr{R"(occ(\d+)$)"};
328 
329     for (auto& file : fs::directory_iterator("/dev"))
330     {
331         std::smatch match;
332         std::string path{file.path().string()};
333         if (std::regex_search(path, match, expr))
334         {
335             auto num = std::stoi(match[1].str());
336 
337             // /dev numbering starts at 1, ours starts at 0.
338             occs.push_back(num - 1);
339         }
340     }
341 
342     return occs;
343 }
344 
cpuCreated(sdbusplus::message_t & msg)345 int Manager::cpuCreated(sdbusplus::message_t& msg)
346 {
347     namespace fs = std::filesystem;
348 
349     sdbusplus::message::object_path o;
350     msg.read(o);
351     fs::path cpuPath(std::string(std::move(o)));
352 
353     auto name = cpuPath.filename().string();
354     auto index = name.find(CPU_NAME);
355     name.replace(index, std::strlen(CPU_NAME), OCC_NAME);
356 
357     createObjects(name);
358 
359     return 0;
360 }
361 
createObjects(const std::string & occ)362 void Manager::createObjects(const std::string& occ)
363 {
364     auto path = fs::path(OCC_CONTROL_ROOT) / occ;
365 
366     statusObjects.emplace_back(std::make_unique<Status>(
367         event, path.c_str(), *this,
368 #ifdef POWER10
369         pmode,
370 #endif
371         std::bind(std::mem_fn(&Manager::statusCallBack), this,
372                   std::placeholders::_1, std::placeholders::_2)
373 #ifdef PLDM
374             ,
375         // Callback will set flag indicating reset needs to be done
376         // instead of immediately issuing a reset via PLDM.
377         std::bind(std::mem_fn(&Manager::resetOccRequest), this,
378                   std::placeholders::_1)
379 #endif
380             ));
381 
382     // Create the power cap monitor object
383     if (!pcap)
384     {
385         pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
386             *statusObjects.back());
387     }
388 
389     if (statusObjects.back()->isMasterOcc())
390     {
391         lg2::info("Manager::createObjects(): OCC{INST} is the master", "INST",
392                   statusObjects.back()->getOccInstanceID());
393         _pollTimer->setEnabled(false);
394 
395 #ifdef POWER10
396         // Set the master OCC on the PowerMode object
397         pmode->setMasterOcc(path);
398 #endif
399     }
400 
401     passThroughObjects.emplace_back(std::make_unique<PassThrough>(
402         path.c_str()
403 #ifdef POWER10
404             ,
405         pmode
406 #endif
407         ));
408 }
409 
410 // If a reset is not already outstanding, set a flag to indicate that a reset is
411 // needed.
resetOccRequest(instanceID instance)412 void Manager::resetOccRequest(instanceID instance)
413 {
414     if (!resetRequired)
415     {
416         resetRequired = true;
417         resetInstance = instance;
418         lg2::error(
419             "resetOccRequest: PM Complex reset was requested due to OCC{INST}",
420             "INST", instance);
421     }
422     else if (instance != resetInstance)
423     {
424         lg2::warning(
425             "resetOccRequest: Ignoring PM Complex reset request for OCC{INST}, because reset already outstanding for OCC{RINST}",
426             "INST", instance, "RINST", resetInstance);
427     }
428 }
429 
430 // If a reset has not been started, initiate an OCC reset via PLDM
initiateOccRequest(instanceID instance)431 void Manager::initiateOccRequest(instanceID instance)
432 {
433     if (!resetInProgress)
434     {
435         resetInProgress = true;
436         resetInstance = instance;
437         lg2::error(
438             "initiateOccRequest: Initiating PM Complex reset due to OCC{INST}",
439             "INST", instance);
440 #ifdef PLDM
441         pldmHandle->resetOCC(instance);
442 #endif
443         resetRequired = false;
444     }
445     else
446     {
447         lg2::warning(
448             "initiateOccRequest: Ignoring PM Complex reset request for OCC{INST}, because reset already in process for OCC{RINST}",
449             "INST", instance, "RINST", resetInstance);
450     }
451 }
452 
statusCallBack(instanceID instance,bool status)453 void Manager::statusCallBack(instanceID instance, bool status)
454 {
455     if (status == true)
456     {
457         if (resetInProgress)
458         {
459             lg2::info(
460                 "statusCallBack: Ignoring OCC{INST} activate because a reset has been initiated due to OCC{RINST}",
461                 "INST", instance, "RINST", resetInstance);
462             return;
463         }
464 
465         // OCC went active
466         ++activeCount;
467 
468 #ifdef POWER10
469         if (activeCount == 1)
470         {
471             // First OCC went active (allow some time for all OCCs to go active)
472             waitForAllOccsTimer->restartOnce(60s);
473         }
474 #endif
475 
476         if (activeCount == statusObjects.size())
477         {
478 #ifdef POWER10
479             // All OCCs are now running
480             if (waitForAllOccsTimer->isEnabled())
481             {
482                 // stop occ wait timer
483                 waitForAllOccsTimer->setEnabled(false);
484             }
485 
486             // All OCCs have been found, check if we need a reset
487             if (resetRequired)
488             {
489                 initiateOccRequest(resetInstance);
490 
491                 if (!waitForAllOccsTimer->isEnabled())
492                 {
493                     lg2::warning(
494                         "occsNotAllRunning: Restarting waitForAllOccTimer");
495                     // restart occ wait timer
496                     waitForAllOccsTimer->restartOnce(60s);
497                 }
498             }
499             else
500             {
501                 // Verify master OCC and start presence monitor
502                 validateOccMaster();
503             }
504 #else
505             // Verify master OCC and start presence monitor
506             validateOccMaster();
507 #endif
508         }
509 
510         // Start poll timer if not already started
511         if (!_pollTimer->isEnabled())
512         {
513             lg2::info("Manager: OCCs will be polled every {TIME} seconds",
514                       "TIME", pollInterval);
515 
516             // Send poll and start OCC poll timer
517             pollerTimerExpired();
518         }
519     }
520     else
521     {
522         // OCC went away
523         if (activeCount > 0)
524         {
525             --activeCount;
526         }
527         else
528         {
529             lg2::info("OCC{INST} disabled, and no other OCCs are active",
530                       "INST", instance);
531         }
532 
533         if (activeCount == 0)
534         {
535             // No OCCs are running
536 
537             if (resetInProgress)
538             {
539                 // All OCC active sensors are clear (reset should be in
540                 // progress)
541                 lg2::info(
542                     "statusCallBack: Clearing resetInProgress (activeCount={COUNT}, OCC{INST}, status={STATUS})",
543                     "COUNT", activeCount, "INST", instance, "STATUS", status);
544                 resetInProgress = false;
545                 resetInstance = 255;
546             }
547 
548             // Stop OCC poll timer
549             if (_pollTimer->isEnabled())
550             {
551                 lg2::info(
552                     "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
553                 _pollTimer->setEnabled(false);
554             }
555 
556 #ifdef POWER10
557             // stop wait timer
558             if (waitForAllOccsTimer->isEnabled())
559             {
560                 waitForAllOccsTimer->setEnabled(false);
561             }
562 #endif
563         }
564         else if (resetInProgress)
565         {
566             lg2::info(
567                 "statusCallBack: Skipping clear of resetInProgress (activeCount={COUNT}, OCC{INST}, status={STATUS})",
568                 "COUNT", activeCount, "INST", instance, "STATUS", status);
569         }
570 #ifdef READ_OCC_SENSORS
571         // Clear OCC sensors
572         setSensorValueToNaN(instance);
573 #endif
574     }
575 
576 #ifdef POWER10
577     if (waitingForAllOccActiveSensors)
578     {
579         if (utils::isHostRunning())
580         {
581             checkAllActiveSensors();
582         }
583     }
584 #endif
585 }
586 
#ifdef I2C_OCC
// Create one Status object per OCC hwmon device found under DEV_PATH, then
// the power cap object (and, with POWER10, the PowerMode object) using the
// first device as the master OCC.
void Manager::initStatusObjects()
{
    // Make sure we have a valid path string
    static_assert(sizeof(DEV_PATH) != 0);

#ifdef POWER10
    // D-Bus path of the object that ends up at statusObjects.front().
    // Kept outside the loop: the loop-local path is out of scope by the
    // time the master OCC is set on the PowerMode object.
    fs::path masterPath;
#endif

    auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
    for (auto& name : deviceNames)
    {
        i2c_occ::i2cToDbus(name);
        name = std::string(OCC_NAME) + '_' + name;
        auto path = fs::path(OCC_CONTROL_ROOT) / name;
#ifdef POWER10
        if (statusObjects.empty())
        {
            masterPath = path;
        }
#endif
        statusObjects.emplace_back(
            std::make_unique<Status>(event, path.c_str(), *this));
    }

    // front() on an empty container is undefined behaviour, so the master
    // dependent setup is skipped when no device was found.
    const bool haveMaster = !statusObjects.empty();
    if (haveMaster)
    {
        // The first device is master occ
        pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
            *statusObjects.front());
    }
    else
    {
        lg2::error(
            "initStatusObjects: No OCC hwmon devices found, power cap object not created");
    }
#ifdef POWER10
    pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
                                                   powermode::PIPS_PATH);
    if (haveMaster)
    {
        // Set the master OCC on the PowerMode object
        pmode->setMasterOcc(masterPath);
    }
#endif
}
#endif
613 
614 #ifdef PLDM
sbeTimeout(unsigned int instance)615 void Manager::sbeTimeout(unsigned int instance)
616 {
617     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
618                             [instance](const auto& obj) {
619                                 return instance == obj->getOccInstanceID();
620                             });
621 
622     if (obj != statusObjects.end() && (*obj)->occActive())
623     {
624         lg2::info("SBE timeout, requesting HRESET (OCC{INST})", "INST",
625                   instance);
626 
627 #ifdef PHAL_SUPPORT
628         setSBEState(instance, SBE_STATE_NOT_USABLE);
629 #endif
630 
631         // Stop communication with this OCC
632         (*obj)->occActive(false);
633 
634         pldmHandle->sendHRESET(instance);
635     }
636 }
637 
updateOCCActive(instanceID instance,bool status)638 bool Manager::updateOCCActive(instanceID instance, bool status)
639 {
640     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
641                             [instance](const auto& obj) {
642                                 return instance == obj->getOccInstanceID();
643                             });
644 
645     const bool hostRunning = open_power::occ::utils::isHostRunning();
646     if (obj != statusObjects.end())
647     {
648         if (!hostRunning && (status == true))
649         {
650             lg2::warning(
651                 "updateOCCActive: Host is not running yet (OCC{INST} active={STAT}), clearing sensor received",
652                 "INST", instance, "STAT", status);
653             (*obj)->setPldmSensorReceived(false);
654             if (!waitingForAllOccActiveSensors)
655             {
656                 lg2::info(
657                     "updateOCCActive: Waiting for Host and all OCC Active Sensors");
658                 waitingForAllOccActiveSensors = true;
659             }
660 #ifdef POWER10
661             discoverTimer->restartOnce(30s);
662 #endif
663             return false;
664         }
665         else
666         {
667             (*obj)->setPldmSensorReceived(true);
668             return (*obj)->occActive(status);
669         }
670     }
671     else
672     {
673         if (hostRunning)
674         {
675             lg2::warning(
676                 "updateOCCActive: No status object to update for OCC{INST} (active={STAT})",
677                 "INST", instance, "STAT", status);
678         }
679         else
680         {
681             if (status == true)
682             {
683                 lg2::warning(
684                     "updateOCCActive: No status objects and Host is not running yet (OCC{INST} active={STAT})",
685                     "INST", instance, "STAT", status);
686             }
687         }
688         if (status == true)
689         {
690             // OCC went active
691             queuedActiveState.insert(instance);
692         }
693         else
694         {
695             auto match = queuedActiveState.find(instance);
696             if (match != queuedActiveState.end())
697             {
698                 // OCC was disabled
699                 queuedActiveState.erase(match);
700             }
701         }
702         return false;
703     }
704 }
705 
706 // Called upon pldm event To set powermode Safe Mode State for system.
updateOccSafeMode(bool safeMode)707 void Manager::updateOccSafeMode(bool safeMode)
708 {
709 #ifdef POWER10
710     pmode->updateDbusSafeMode(safeMode);
711 #endif
712     // Update the processor throttle status on dbus
713     for (auto& obj : statusObjects)
714     {
715         obj->updateThrottle(safeMode, THROTTLED_SAFE);
716     }
717 }
718 
sbeHRESETResult(instanceID instance,bool success)719 void Manager::sbeHRESETResult(instanceID instance, bool success)
720 {
721     if (success)
722     {
723         lg2::info("HRESET succeeded (OCC{INST})", "INST", instance);
724 
725 #ifdef PHAL_SUPPORT
726         setSBEState(instance, SBE_STATE_BOOTED);
727 #endif
728 
729         // Re-enable communication with this OCC
730         auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
731                                 [instance](const auto& obj) {
732                                     return instance == obj->getOccInstanceID();
733                                 });
734         if (obj != statusObjects.end() && (!(*obj)->occActive()))
735         {
736             (*obj)->occActive(true);
737         }
738 
739         return;
740     }
741 
742 #ifdef PHAL_SUPPORT
743     setSBEState(instance, SBE_STATE_FAILED);
744 
745     if (sbeCanDump(instance))
746     {
747         lg2::info("HRESET failed (OCC{INST}), triggering SBE dump", "INST",
748                   instance);
749 
750         auto& bus = utils::getBus();
751         uint32_t src6 = instance << 16;
752         uint32_t logId =
753             FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
754                             src6, "SBE command timeout");
755 
756         try
757         {
758             constexpr auto interface = "xyz.openbmc_project.Dump.Create";
759             constexpr auto function = "CreateDump";
760 
761             std::string service =
762                 utils::getService(OP_DUMP_OBJ_PATH, interface);
763             auto method = bus.new_method_call(service.c_str(), OP_DUMP_OBJ_PATH,
764                                               interface, function);
765 
766             std::map<std::string, std::variant<std::string, uint64_t>>
767                 createParams{
768                     {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
769                      uint64_t(logId)},
770                     {"com.ibm.Dump.Create.CreateParameters.DumpType",
771                      "com.ibm.Dump.Create.DumpType.SBE"},
772                     {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
773                      uint64_t(instance)},
774                 };
775 
776             method.append(createParams);
777 
778             auto response = bus.call(method);
779         }
780         catch (const sdbusplus::exception_t& e)
781         {
782             constexpr auto ERROR_DUMP_DISABLED =
783                 "xyz.openbmc_project.Dump.Create.Error.Disabled";
784             if (e.name() == ERROR_DUMP_DISABLED)
785             {
786                 lg2::info("Dump is disabled, skipping");
787             }
788             else
789             {
790                 lg2::error("Dump failed");
791             }
792         }
793     }
794 #endif
795 
796     // SBE Reset failed, try PM Complex reset
797     lg2::error("sbeHRESETResult: Forcing PM Complex reset");
798     resetOccRequest(instance);
799 }
800 
801 #ifdef PHAL_SUPPORT
sbeCanDump(unsigned int instance)802 bool Manager::sbeCanDump(unsigned int instance)
803 {
804     struct pdbg_target* proc = getPdbgTarget(instance);
805 
806     if (!proc)
807     {
808         // allow the dump in the error case
809         return true;
810     }
811 
812     try
813     {
814         if (!openpower::phal::sbe::isDumpAllowed(proc))
815         {
816             return false;
817         }
818 
819         if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
820         {
821             return false;
822         }
823     }
824     catch (openpower::phal::exception::SbeError& e)
825     {
826         lg2::info("Failed to query SBE state");
827     }
828 
829     // allow the dump in the error case
830     return true;
831 }
832 
setSBEState(unsigned int instance,enum sbe_state state)833 void Manager::setSBEState(unsigned int instance, enum sbe_state state)
834 {
835     struct pdbg_target* proc = getPdbgTarget(instance);
836 
837     if (!proc)
838     {
839         return;
840     }
841 
842     try
843     {
844         openpower::phal::sbe::setState(proc, state);
845     }
846     catch (const openpower::phal::exception::SbeError& e)
847     {
848         lg2::error("Failed to set SBE state: {ERROR}", "ERROR", e.what());
849     }
850 }
851 
getPdbgTarget(unsigned int instance)852 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
853 {
854     if (!pdbgInitialized)
855     {
856         try
857         {
858             openpower::phal::pdbg::init();
859             pdbgInitialized = true;
860         }
861         catch (const openpower::phal::exception::PdbgError& e)
862         {
863             lg2::error("pdbg initialization failed");
864             return nullptr;
865         }
866     }
867 
868     struct pdbg_target* proc = nullptr;
869     pdbg_for_each_class_target("proc", proc)
870     {
871         if (pdbg_target_index(proc) == instance)
872         {
873             return proc;
874         }
875     }
876 
877     lg2::error("Failed to get pdbg target");
878     return nullptr;
879 }
880 #endif
881 #endif
882 
pollerTimerExpired()883 void Manager::pollerTimerExpired()
884 {
885     if (!_pollTimer)
886     {
887         lg2::error("pollerTimerExpired() ERROR: Timer not defined");
888         return;
889     }
890 
891 #ifdef POWER10
892     if (resetRequired)
893     {
894         lg2::error("pollerTimerExpired() - Initiating PM Complex reset");
895         initiateOccRequest(resetInstance);
896 
897         if (!waitForAllOccsTimer->isEnabled())
898         {
899             lg2::warning("pollerTimerExpired: Restarting waitForAllOccTimer");
900             // restart occ wait timer
901             waitForAllOccsTimer->restartOnce(60s);
902         }
903         return;
904     }
905 #endif
906 
907     for (auto& obj : statusObjects)
908     {
909         if (!obj->occActive())
910         {
911             // OCC is not running yet
912 #ifdef READ_OCC_SENSORS
913             auto id = obj->getOccInstanceID();
914             setSensorValueToNaN(id);
915 #endif
916             continue;
917         }
918 
919         // Read sysfs to force kernel to poll OCC
920         obj->readOccState();
921 
922 #ifdef READ_OCC_SENSORS
923         // Read occ sensor values
924         getSensorValues(obj);
925 #endif
926     }
927 
928     if (activeCount > 0)
929     {
930         // Restart OCC poll timer
931         _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
932     }
933     else
934     {
935         // No OCCs running, so poll timer will not be restarted
936         lg2::info(
937             "Manager::pollerTimerExpired: poll timer will not be restarted");
938     }
939 }
940 
941 #ifdef READ_OCC_SENSORS
readTempSensors(const fs::path & path,uint32_t occInstance)942 void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
943 {
944     // There may be more than one sensor with the same FRU type
945     // and label so make two passes: the first to read the temps
946     // from sysfs, and the second to put them on D-Bus after
947     // resolving any conflicts.
948     std::map<std::string, double> sensorData;
949 
950     std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
951     for (auto& file : fs::directory_iterator(path))
952     {
953         if (!std::regex_search(file.path().string(), expr))
954         {
955             continue;
956         }
957 
958         uint32_t labelValue{0};
959 
960         try
961         {
962             labelValue = readFile<uint32_t>(file.path());
963         }
964         catch (const std::system_error& e)
965         {
966             lg2::debug(
967                 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
968                 "PATH", file.path().string(), "ERROR", e.code().value());
969             continue;
970         }
971 
972         const std::string& tempLabel = "label";
973         const std::string filePathString = file.path().string().substr(
974             0, file.path().string().length() - tempLabel.length());
975 
976         uint32_t fruTypeValue{0};
977         try
978         {
979             fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
980         }
981         catch (const std::system_error& e)
982         {
983             lg2::debug(
984                 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
985                 "PATH", filePathString + fruTypeSuffix, "ERROR",
986                 e.code().value());
987             continue;
988         }
989 
990         std::string sensorPath =
991             OCC_SENSORS_ROOT + std::string("/temperature/");
992 
993         std::string dvfsTempPath;
994 
995         if (fruTypeValue == VRMVdd)
996         {
997             sensorPath.append(
998                 "vrm_vdd" + std::to_string(occInstance) + "_temp");
999         }
1000         else if (fruTypeValue == processorIoRing)
1001         {
1002             sensorPath.append(
1003                 "proc" + std::to_string(occInstance) + "_ioring_temp");
1004             dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
1005                            std::to_string(occInstance) + "_ioring_dvfs_temp";
1006         }
1007         else
1008         {
1009             uint16_t type = (labelValue & 0xFF000000) >> 24;
1010             uint16_t instanceID = labelValue & 0x0000FFFF;
1011 
1012             if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
1013             {
1014                 if (fruTypeValue == fruTypeNotAvailable)
1015                 {
1016                     // Not all DIMM related temps are available to read
1017                     // (no _input file in this case)
1018                     continue;
1019                 }
1020                 auto iter = dimmTempSensorName.find(fruTypeValue);
1021                 if (iter == dimmTempSensorName.end())
1022                 {
1023                     lg2::error(
1024                         "readTempSensors: Fru type error! fruTypeValue = {FRU}) ",
1025                         "FRU", fruTypeValue);
1026                     continue;
1027                 }
1028 
1029                 sensorPath.append(
1030                     "dimm" + std::to_string(instanceID) + iter->second);
1031 
1032                 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
1033                                dimmDVFSSensorName.at(fruTypeValue);
1034             }
1035             else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
1036             {
1037                 if (fruTypeValue == processorCore)
1038                 {
1039                     // The OCC reports small core temps, of which there are
1040                     // two per big core.  All current P10 systems are in big
1041                     // core mode, so use a big core name.
1042                     uint16_t coreNum = instanceID / 2;
1043                     uint16_t tempNum = instanceID % 2;
1044                     sensorPath.append("proc" + std::to_string(occInstance) +
1045                                       "_core" + std::to_string(coreNum) + "_" +
1046                                       std::to_string(tempNum) + "_temp");
1047 
1048                     dvfsTempPath =
1049                         std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
1050                         std::to_string(occInstance) + "_core_dvfs_temp";
1051                 }
1052                 else
1053                 {
1054                     continue;
1055                 }
1056             }
1057             else
1058             {
1059                 continue;
1060             }
1061         }
1062 
1063         // The dvfs temp file only needs to be read once per chip per type.
1064         if (!dvfsTempPath.empty() &&
1065             !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
1066         {
1067             try
1068             {
1069                 auto dvfsValue = readFile<double>(filePathString + maxSuffix);
1070 
1071                 dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
1072                     dvfsTempPath, dvfsValue * std::pow(10, -3));
1073             }
1074             catch (const std::system_error& e)
1075             {
1076                 lg2::debug(
1077                     "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1078                     "PATH", filePathString + maxSuffix, "ERROR",
1079                     e.code().value());
1080             }
1081         }
1082 
1083         uint32_t faultValue{0};
1084         try
1085         {
1086             faultValue = readFile<uint32_t>(filePathString + faultSuffix);
1087         }
1088         catch (const std::system_error& e)
1089         {
1090             lg2::debug(
1091                 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1092                 "PATH", filePathString + faultSuffix, "ERROR",
1093                 e.code().value());
1094             continue;
1095         }
1096 
1097         double tempValue{0};
1098         // NOTE: if OCC sends back 0xFF, kernal sets this fault value to 1.
1099         if (faultValue != 0)
1100         {
1101             tempValue = std::numeric_limits<double>::quiet_NaN();
1102         }
1103         else
1104         {
1105             // Read the temperature
1106             try
1107             {
1108                 tempValue = readFile<double>(filePathString + inputSuffix);
1109             }
1110             catch (const std::system_error& e)
1111             {
1112                 lg2::debug(
1113                     "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1114                     "PATH", filePathString + inputSuffix, "ERROR",
1115                     e.code().value());
1116 
1117                 // if errno == EAGAIN(Resource temporarily unavailable) then set
1118                 // temp to 0, to avoid using old temp, and affecting FAN
1119                 // Control.
1120                 if (e.code().value() == EAGAIN)
1121                 {
1122                     tempValue = 0;
1123                 }
1124                 // else the errno would be something like
1125                 //     EBADF(Bad file descriptor)
1126                 // or ENOENT(No such file or directory)
1127                 else
1128                 {
1129                     continue;
1130                 }
1131             }
1132         }
1133 
1134         // If this object path already has a value, only overwite
1135         // it if the previous one was an NaN or a smaller value.
1136         auto existing = sensorData.find(sensorPath);
1137         if (existing != sensorData.end())
1138         {
1139             // Multiple sensors found for this FRU type
1140             if ((std::isnan(existing->second) && (tempValue == 0)) ||
1141                 ((existing->second == 0) && std::isnan(tempValue)))
1142             {
1143                 // One of the redundant sensors has failed (0xFF/nan), and the
1144                 // other sensor has no reading (0), so set the FRU to NaN to
1145                 // force fan increase
1146                 tempValue = std::numeric_limits<double>::quiet_NaN();
1147                 existing->second = tempValue;
1148             }
1149             if (std::isnan(existing->second) || (tempValue > existing->second))
1150             {
1151                 existing->second = tempValue;
1152             }
1153         }
1154         else
1155         {
1156             // First sensor for this FRU type
1157             sensorData[sensorPath] = tempValue;
1158         }
1159     }
1160 
1161     // Now publish the values on D-Bus.
1162     for (const auto& [objectPath, value] : sensorData)
1163     {
1164         dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
1165                                                     value * std::pow(10, -3));
1166 
1167         dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1168             objectPath, !std::isnan(value));
1169 
1170         if (existingSensors.find(objectPath) == existingSensors.end())
1171         {
1172             dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1173                 objectPath, {"all_sensors"});
1174         }
1175         existingSensors[objectPath] = occInstance;
1176     }
1177 }
1178 
getPowerLabelFunctionID(const std::string & value)1179 std::optional<std::string> Manager::getPowerLabelFunctionID(
1180     const std::string& value)
1181 {
1182     // If the value is "system", then the FunctionID is "system".
1183     if (value == "system")
1184     {
1185         return value;
1186     }
1187 
1188     // If the value is not "system", then the label value have 3 numbers, of
1189     // which we only care about the middle one:
1190     // <sensor id>_<function id>_<apss channel>
1191     // eg: The value is "0_10_5" , then the FunctionID is "10".
1192     if (value.find("_") == std::string::npos)
1193     {
1194         return std::nullopt;
1195     }
1196 
1197     auto powerLabelValue = value.substr((value.find("_") + 1));
1198 
1199     if (powerLabelValue.find("_") == std::string::npos)
1200     {
1201         return std::nullopt;
1202     }
1203 
1204     return powerLabelValue.substr(0, powerLabelValue.find("_"));
1205 }
1206 
readPowerSensors(const fs::path & path,uint32_t id)1207 void Manager::readPowerSensors(const fs::path& path, uint32_t id)
1208 {
1209     std::regex expr{"power\\d+_label$"}; // Example: power5_label
1210     for (auto& file : fs::directory_iterator(path))
1211     {
1212         if (!std::regex_search(file.path().string(), expr))
1213         {
1214             continue;
1215         }
1216 
1217         std::string labelValue;
1218         try
1219         {
1220             labelValue = readFile<std::string>(file.path());
1221         }
1222         catch (const std::system_error& e)
1223         {
1224             lg2::debug(
1225                 "readPowerSensors: Failed reading {PATH}, errno = {ERROR}",
1226                 "PATH", file.path().string(), "ERROR", e.code().value());
1227             continue;
1228         }
1229 
1230         auto functionID = getPowerLabelFunctionID(labelValue);
1231         if (functionID == std::nullopt)
1232         {
1233             continue;
1234         }
1235 
1236         const std::string& tempLabel = "label";
1237         const std::string filePathString = file.path().string().substr(
1238             0, file.path().string().length() - tempLabel.length());
1239 
1240         std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
1241 
1242         auto iter = powerSensorName.find(*functionID);
1243         if (iter == powerSensorName.end())
1244         {
1245             continue;
1246         }
1247         sensorPath.append(iter->second);
1248 
1249         double tempValue{0};
1250 
1251         try
1252         {
1253             tempValue = readFile<double>(filePathString + inputSuffix);
1254         }
1255         catch (const std::system_error& e)
1256         {
1257             lg2::debug(
1258                 "readPowerSensors: Failed reading {PATH}, errno = {ERROR}",
1259                 "PATH", filePathString + inputSuffix, "ERROR",
1260                 e.code().value());
1261             continue;
1262         }
1263 
1264         dbus::OccDBusSensors::getOccDBus().setUnit(
1265             sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");
1266 
1267         dbus::OccDBusSensors::getOccDBus().setValue(
1268             sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3));
1269 
1270         dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1271             sensorPath, true);
1272 
1273         if (existingSensors.find(sensorPath) == existingSensors.end())
1274         {
1275             std::vector<std::string> fTypeList = {"all_sensors"};
1276             if (iter->second == "total_power")
1277             {
1278                 // Set sensor purpose as TotalPower
1279                 dbus::OccDBusSensors::getOccDBus().setPurpose(
1280                     sensorPath,
1281                     "xyz.openbmc_project.Sensor.Purpose.SensorPurpose.TotalPower");
1282             }
1283             dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1284                 sensorPath, fTypeList);
1285         }
1286         existingSensors[sensorPath] = id;
1287     }
1288     return;
1289 }
1290 
readExtnSensors(const fs::path & path,uint32_t id)1291 void Manager::readExtnSensors(const fs::path& path, uint32_t id)
1292 {
1293     std::regex expr{"extn\\d+_label$"}; // Example: extn5_label
1294     for (auto& file : fs::directory_iterator(path))
1295     {
1296         if (!std::regex_search(file.path().string(), expr))
1297         {
1298             continue;
1299         }
1300 
1301         // Read in Label value of the sensor from file.
1302         std::string labelValue;
1303         try
1304         {
1305             labelValue = readFile<std::string>(file.path());
1306         }
1307         catch (const std::system_error& e)
1308         {
1309             lg2::debug(
1310                 "readExtnSensors:label Failed reading {PATH}, errno = {ERROR}",
1311                 "PATH", file.path().string(), "ERROR", e.code().value());
1312             continue;
1313         }
1314         const std::string& tempLabel = "label";
1315         const std::string filePathString = file.path().string().substr(
1316             0, file.path().string().length() - tempLabel.length());
1317 
1318         std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
1319 
1320         // Labels of EXTN sections from OCC interface Document
1321         //     have different formats.
1322         // 0x464d494e : FMIN            0x46444953 : FDIS
1323         // 0x46424153 : FBAS            0x46555400 : FUT
1324         // 0x464d4158 : FMAX            0x434c4950 : CLIP
1325         // 0x4d4f4445 : MODE            0x574f4643 : WOFC
1326         // 0x574f4649 : WOFI            0x5057524d : PWRM
1327         // 0x50575250 : PWRP            0x45525248 : ERRH
1328         // Label indicating byte 5 and 6 is the current (mem,proc) power in
1329         //      Watts.
1330         if ((labelValue == EXTN_LABEL_PWRM_MEMORY_POWER) ||
1331             (labelValue == EXTN_LABEL_PWRP_PROCESSOR_POWER))
1332         {
1333             // Build the dbus String for this chiplet power asset.
1334             if (labelValue == EXTN_LABEL_PWRP_PROCESSOR_POWER)
1335             {
1336                 labelValue = "_power";
1337             }
1338             else // else EXTN_LABEL_PWRM_MEMORY_POWER
1339             {
1340                 labelValue = "_mem_power";
1341             }
1342             sensorPath.append("chiplet" + std::to_string(id) + labelValue);
1343 
1344             // Read in data value of the sensor from file.
1345             // Read in as string due to different format of data in sensors.
1346             std::string extnValue;
1347             try
1348             {
1349                 extnValue = readFile<std::string>(filePathString + inputSuffix);
1350             }
1351             catch (const std::system_error& e)
1352             {
1353                 lg2::debug(
1354                     "readExtnSensors:value Failed reading {PATH}, errno = {ERROR}",
1355                     "PATH", filePathString + inputSuffix, "ERROR",
1356                     e.code().value());
1357                 continue;
1358             }
1359 
1360             // For Power field, Convert last 4 bytes of hex string into number
1361             //   value.
1362             std::stringstream ssData;
1363             ssData << std::hex << extnValue.substr(extnValue.length() - 4);
1364             uint16_t MyHexNumber;
1365             ssData >> MyHexNumber;
1366 
1367             // Convert output/DC power to input/AC power in Watts (round up)
1368             MyHexNumber =
1369                 std::round(((MyHexNumber / (PS_DERATING_FACTOR / 100.0))));
1370 
1371             dbus::OccDBusSensors::getOccDBus().setUnit(
1372                 sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");
1373 
1374             dbus::OccDBusSensors::getOccDBus().setValue(sensorPath,
1375                                                         MyHexNumber);
1376 
1377             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1378                 sensorPath, true);
1379 
1380             if (existingSensors.find(sensorPath) == existingSensors.end())
1381             {
1382                 dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1383                     sensorPath, {"all_sensors"});
1384             }
1385 
1386             existingSensors[sensorPath] = id;
1387         } // End Extended Power Sensors.
1388     } // End For loop on files for Extended Sensors.
1389     return;
1390 }
1391 
setSensorValueToNaN(uint32_t id) const1392 void Manager::setSensorValueToNaN(uint32_t id) const
1393 {
1394     for (const auto& [sensorPath, occId] : existingSensors)
1395     {
1396         if (occId == id)
1397         {
1398             dbus::OccDBusSensors::getOccDBus().setValue(
1399                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1400 
1401             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1402                 sensorPath, true);
1403         }
1404     }
1405     return;
1406 }
1407 
setSensorValueToNonFunctional(uint32_t id) const1408 void Manager::setSensorValueToNonFunctional(uint32_t id) const
1409 {
1410     for (const auto& [sensorPath, occId] : existingSensors)
1411     {
1412         if (occId == id)
1413         {
1414             dbus::OccDBusSensors::getOccDBus().setValue(
1415                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1416 
1417             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1418                 sensorPath, false);
1419         }
1420     }
1421     return;
1422 }
1423 
getSensorValues(std::unique_ptr<Status> & occ)1424 void Manager::getSensorValues(std::unique_ptr<Status>& occ)
1425 {
1426     static bool tracedError[8] = {0};
1427     const fs::path sensorPath = occ->getHwmonPath();
1428     const uint32_t id = occ->getOccInstanceID();
1429 
1430     if (fs::exists(sensorPath))
1431     {
1432         // Read temperature sensors
1433         readTempSensors(sensorPath, id);
1434         // Read Extended sensors
1435         readExtnSensors(sensorPath, id);
1436 
1437         if (occ->isMasterOcc())
1438         {
1439             // Read power sensors
1440             readPowerSensors(sensorPath, id);
1441         }
1442         tracedError[id] = false;
1443     }
1444     else
1445     {
1446         if (!tracedError[id])
1447         {
1448             lg2::error(
1449                 "Manager::getSensorValues: OCC{INST} sensor path missing: {PATH}",
1450                 "INST", id, "PATH", sensorPath);
1451             tracedError[id] = true;
1452         }
1453     }
1454 
1455     return;
1456 }
1457 #endif
1458 
1459 // Read the altitude from DBus
readAltitude()1460 void Manager::readAltitude()
1461 {
1462     static bool traceAltitudeErr = true;
1463 
1464     utils::PropertyValue altitudeProperty{};
1465     try
1466     {
1467         altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE,
1468                                               ALTITUDE_PROP);
1469         auto sensorVal = std::get<double>(altitudeProperty);
1470         if (sensorVal < 0xFFFF)
1471         {
1472             if (sensorVal < 0)
1473             {
1474                 altitude = 0;
1475             }
1476             else
1477             {
1478                 // Round to nearest meter
1479                 altitude = uint16_t(sensorVal + 0.5);
1480             }
1481             lg2::debug("readAltitude: sensor={VALUE} ({ALT}m)", "VALUE",
1482                        sensorVal, "ALT", altitude);
1483             traceAltitudeErr = true;
1484         }
1485         else
1486         {
1487             if (traceAltitudeErr)
1488             {
1489                 traceAltitudeErr = false;
1490                 lg2::debug("Invalid altitude value: {ALT}", "ALT", sensorVal);
1491             }
1492         }
1493     }
1494     catch (const sdbusplus::exception_t& e)
1495     {
1496         if (traceAltitudeErr)
1497         {
1498             traceAltitudeErr = false;
1499             lg2::info("Unable to read Altitude: {ERROR}", "ERROR", e.what());
1500         }
1501         altitude = 0xFFFF; // not available
1502     }
1503 }
1504 
1505 // Callback function when ambient temperature changes
ambientCallback(sdbusplus::message_t & msg)1506 void Manager::ambientCallback(sdbusplus::message_t& msg)
1507 {
1508     double currentTemp = 0;
1509     uint8_t truncatedTemp = 0xFF;
1510     std::string msgSensor;
1511     std::map<std::string, std::variant<double>> msgData;
1512     msg.read(msgSensor, msgData);
1513 
1514     auto valPropMap = msgData.find(AMBIENT_PROP);
1515     if (valPropMap == msgData.end())
1516     {
1517         lg2::debug("ambientCallback: Unknown ambient property changed");
1518         return;
1519     }
1520     currentTemp = std::get<double>(valPropMap->second);
1521     if (std::isnan(currentTemp))
1522     {
1523         truncatedTemp = 0xFF;
1524     }
1525     else
1526     {
1527         if (currentTemp < 0)
1528         {
1529             truncatedTemp = 0;
1530         }
1531         else
1532         {
1533             // Round to nearest degree C
1534             truncatedTemp = uint8_t(currentTemp + 0.5);
1535         }
1536     }
1537 
1538     // If ambient changes, notify OCCs
1539     if (truncatedTemp != ambient)
1540     {
1541         lg2::debug("ambientCallback: Ambient change from {OLD} to {NEW}C",
1542                    "OLD", ambient, "NEW", currentTemp);
1543 
1544         ambient = truncatedTemp;
1545         if (altitude == 0xFFFF)
1546         {
1547             // No altitude yet, try reading again
1548             readAltitude();
1549         }
1550 
1551         lg2::debug("ambientCallback: Ambient: {TEMP}C, altitude: {ALT}m",
1552                    "TEMP", ambient, "ALT", altitude);
1553 #ifdef POWER10
1554         // Send ambient and altitude to all OCCs
1555         for (auto& obj : statusObjects)
1556         {
1557             if (obj->occActive())
1558             {
1559                 obj->sendAmbient(ambient, altitude);
1560             }
1561         }
1562 #endif // POWER10
1563     }
1564 }
1565 
1566 // return the current ambient and altitude readings
getAmbientData(bool & ambientValid,uint8_t & ambientTemp,uint16_t & altitudeValue) const1567 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1568                              uint16_t& altitudeValue) const
1569 {
1570     ambientValid = true;
1571     ambientTemp = ambient;
1572     altitudeValue = altitude;
1573 
1574     if (ambient == 0xFF)
1575     {
1576         ambientValid = false;
1577     }
1578 }
1579 
1580 #ifdef POWER10
1581 // Called when waitForAllOccsTimer expires
1582 // After the first OCC goes active, this timer will be started (60 seconds)
occsNotAllRunning()1583 void Manager::occsNotAllRunning()
1584 {
1585     if (resetInProgress)
1586     {
1587         lg2::warning(
1588             "occsNotAllRunning: Ignoring waitForAllOccsTimer because reset is in progress");
1589         return;
1590     }
1591     if (activeCount != statusObjects.size())
1592     {
1593         // Not all OCCs went active
1594         lg2::warning(
1595             "occsNotAllRunning: Active OCC count ({COUNT}) does not match expected count ({EXP})",
1596             "COUNT", activeCount, "EXP", statusObjects.size());
1597         // Procs may be garded, so may be expected
1598     }
1599 
1600     if (resetRequired)
1601     {
1602         initiateOccRequest(resetInstance);
1603 
1604         if (!waitForAllOccsTimer->isEnabled())
1605         {
1606             lg2::warning("occsNotAllRunning: Restarting waitForAllOccTimer");
1607             // restart occ wait timer
1608             waitForAllOccsTimer->restartOnce(60s);
1609         }
1610     }
1611     else
1612     {
1613         validateOccMaster();
1614     }
1615 }
1616 
1617 #ifdef PLDM
1618 // Called when throttlePldmTraceTimer expires.
1619 // If this timer expires, that indicates there are no OCC active sensor PDRs
1620 // found which will trigger pldm traces to be throttled.
1621 // The second time this timer expires, a PEL will get created.
throttlePldmTraceExpired()1622 void Manager::throttlePldmTraceExpired()
1623 {
1624     if (utils::isHostRunning())
1625     {
1626         if (!onPldmTimeoutCreatePel)
1627         {
1628             // Throttle traces
1629             pldmHandle->setTraceThrottle(true);
1630             // Restart timer to log a PEL when timer expires
1631             onPldmTimeoutCreatePel = true;
1632             throttlePldmTraceTimer->restartOnce(40min);
1633         }
1634         else
1635         {
1636             lg2::error(
1637                 "throttlePldmTraceExpired(): OCC active sensors still not available!");
1638             // Create PEL
1639             createPldmSensorPEL();
1640         }
1641     }
1642     else
1643     {
1644         // Make sure traces are not throttled
1645         pldmHandle->setTraceThrottle(false);
1646         lg2::info(
1647             "throttlePldmTraceExpired(): host it not running ignoring sensor timer");
1648     }
1649 }
1650 
createPldmSensorPEL()1651 void Manager::createPldmSensorPEL()
1652 {
1653     Error::Descriptor d = Error::Descriptor(MISSING_OCC_SENSORS_PATH);
1654     std::map<std::string, std::string> additionalData;
1655 
1656     additionalData.emplace("_PID", std::to_string(getpid()));
1657 
1658     lg2::info(
1659         "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs");
1660 
1661     auto& bus = utils::getBus();
1662 
1663     try
1664     {
1665         FFDCFiles ffdc;
1666         // Add occ-control journal traces to PEL FFDC
1667         auto occJournalFile =
1668             FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40);
1669 
1670         static constexpr auto loggingObjectPath =
1671             "/xyz/openbmc_project/logging";
1672         static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL";
1673         std::string service =
1674             utils::getService(loggingObjectPath, opLoggingInterface);
1675         auto method =
1676             bus.new_method_call(service.c_str(), loggingObjectPath,
1677                                 opLoggingInterface, "CreatePELWithFFDCFiles");
1678 
1679         // Set level to Warning (Predictive).
1680         auto level =
1681             sdbusplus::xyz::openbmc_project::Logging::server::convertForMessage(
1682                 sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level::
1683                     Warning);
1684 
1685         method.append(d.path, level, additionalData, ffdc);
1686         bus.call(method);
1687     }
1688     catch (const sdbusplus::exception_t& e)
1689     {
1690         lg2::error("Failed to create MISSING_OCC_SENSORS PEL: {ERROR}", "ERROR",
1691                    e.what());
1692     }
1693 }
1694 #endif // PLDM
1695 #endif // POWER10
1696 
1697 // Verify single master OCC and start presence monitor
validateOccMaster()1698 void Manager::validateOccMaster()
1699 {
1700     int masterInstance = -1;
1701     for (auto& obj : statusObjects)
1702     {
1703         auto instance = obj->getOccInstanceID();
1704 #ifdef POWER10
1705         if (!obj->occActive())
1706         {
1707             if (utils::isHostRunning())
1708             {
1709                 // Check if sensor was queued while waiting for discovery
1710                 auto match = queuedActiveState.find(instance);
1711                 if (match != queuedActiveState.end())
1712                 {
1713                     queuedActiveState.erase(match);
1714                     lg2::info("validateOccMaster: OCC{INST} is ACTIVE (queued)",
1715                               "INST", instance);
1716                     obj->occActive(true);
1717                 }
1718                 else
1719                 {
1720                     // OCC does not appear to be active yet, check active sensor
1721 #ifdef PLDM
1722                     pldmHandle->checkActiveSensor(instance);
1723 #endif
1724                     if (obj->occActive())
1725                     {
1726                         lg2::info(
1727                             "validateOccMaster: OCC{INST} is ACTIVE after reading sensor",
1728                             "INST", instance);
1729                     }
1730                 }
1731             }
1732             else
1733             {
1734                 lg2::warning(
1735                     "validateOccMaster: HOST is not running (OCC{INST})",
1736                     "INST", instance);
1737                 return;
1738             }
1739         }
1740 #endif // POWER10
1741 
1742         if (obj->isMasterOcc())
1743         {
1744             obj->addPresenceWatchMaster();
1745 
1746             if (masterInstance == -1)
1747             {
1748                 masterInstance = instance;
1749             }
1750             else
1751             {
1752                 lg2::error(
1753                     "validateOccMaster: Multiple OCC masters! ({MAST1} and {MAST2})",
1754                     "MAST1", masterInstance, "MAST2", instance);
1755                 // request reset
1756                 obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
1757             }
1758         }
1759     }
1760 
1761     if (masterInstance < 0)
1762     {
1763         lg2::error("validateOccMaster: Master OCC not found! (of {NUM} OCCs)",
1764                    "NUM", statusObjects.size());
1765         // request reset
1766         statusObjects.front()->deviceError(
1767             Error::Descriptor(PRESENCE_ERROR_PATH));
1768     }
1769     else
1770     {
1771         lg2::info("validateOccMaster: OCC{INST} is master of {COUNT} OCCs",
1772                   "INST", masterInstance, "COUNT", activeCount);
1773 #ifdef POWER10
1774         pmode->updateDbusSafeMode(false);
1775 #endif
1776     }
1777 }
1778 
updatePcapBounds() const1779 void Manager::updatePcapBounds() const
1780 {
1781     if (pcap)
1782     {
1783         pcap->updatePcapBounds();
1784     }
1785 }
1786 
1787 } // namespace occ
1788 } // namespace open_power
1789