xref: /openbmc/openpower-occ-control/occ_manager.cpp (revision f788150656a182cb75c0b72af9f84530531773ef)
1 #include "config.h"
2 
3 #include "occ_manager.hpp"
4 
5 #include "i2c_occ.hpp"
6 #include "occ_dbus.hpp"
7 #include "occ_errors.hpp"
8 #include "utils.hpp"
9 
10 #include <phosphor-logging/elog-errors.hpp>
11 #include <phosphor-logging/lg2.hpp>
12 #include <xyz/openbmc_project/Common/error.hpp>
13 
14 #include <chrono>
15 #include <cmath>
16 #include <filesystem>
17 #include <fstream>
18 #include <regex>
19 
20 namespace open_power
21 {
22 namespace occ
23 {
24 
// Sentinel FRU type value. NOTE(review): presumably what a sensor reports
// when it has no FRU type -- confirm against the sensor-reading code.
constexpr uint32_t fruTypeNotAvailable = 0xFF;

// Suffixes appended to a sensor's sysfs file-name prefix to form the name of
// one of its attribute files (e.g. "<prefix>" + fruTypeSuffix).
constexpr auto fruTypeSuffix = "fru_type";
constexpr auto faultSuffix = "fault";
constexpr auto inputSuffix = "input";
constexpr auto maxSuffix = "max";

// While this file exists, findAndCreateObjects() postpones OCC discovery.
// constexpr for consistency with the other file-level constants.
constexpr auto HOST_ON_FILE = "/run/openbmc/host@0-on";
32 
33 using namespace phosphor::logging;
34 using namespace std::literals::chrono_literals;
35 
/**
 * @brief Read one value of type T from the beginning of a file.
 *
 * @tparam T        Type extractable from an input stream with operator>>.
 * @param[in] path  File to read.
 *
 * @return The extracted value.
 *
 * @throws std::system_error (generic category, carrying the errno value seen
 *         when the failure was caught) if the file cannot be opened or no
 *         value can be extracted from it.
 */
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    // eofbit is intentionally NOT part of the exception mask: extracting the
    // last value of a file that has no trailing newline sets eofbit even
    // though the extraction succeeded, and that must not be reported as an
    // error. An empty or unparsable file still sets failbit and throws.
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
    T data;

    try
    {
        ifs.open(path);
        ifs >> data;
        ifs.close();
    }
    catch (const std::exception&)
    {
        // Convert the stream failure into the errno-based error callers catch
        auto err = errno;
        throw std::system_error(err, std::generic_category());
    }

    return data;
}
58 
createPldmHandle()59 void Manager::createPldmHandle()
60 {
61 #ifdef PLDM
62     pldmHandle = std::make_unique<pldm::Interface>(
63         std::bind(std::mem_fn(&Manager::updateOCCActive), this,
64                   std::placeholders::_1, std::placeholders::_2),
65         std::bind(std::mem_fn(&Manager::sbeHRESETResult), this,
66                   std::placeholders::_1, std::placeholders::_2),
67         std::bind(std::mem_fn(&Manager::updateOccSafeMode), this,
68                   std::placeholders::_1),
69         std::bind(std::mem_fn(&Manager::hostPoweredOff), this), event);
70 #endif
71 }
72 
73 // findAndCreateObjects():
74 // Takes care of getting the required objects created and
75 // finds the available devices/processors.
76 // (function is called everytime the discoverTimer expires)
77 // - create the PowerMode object to control OCC modes
78 // - create statusObjects for each OCC device found
79 // - waits for OCC Active sensors PDRs to become available
80 // - restart discoverTimer if all data is not available yet
findAndCreateObjects()81 void Manager::findAndCreateObjects()
82 {
83 #ifndef POWER10
84     for (auto id = 0; id < MAX_CPUS; ++id)
85     {
86         // Create one occ per cpu
87         auto occ = std::string(OCC_NAME) + std::to_string(id);
88         createObjects(occ);
89     }
90 #else
91     if (!pmode)
92     {
93         // Create the power mode object
94         pmode = std::make_unique<powermode::PowerMode>(
95             *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
96     }
97 
98     if (!fs::exists(HOST_ON_FILE))
99     {
100         static bool statusObjCreated = false;
101         if (!statusObjCreated)
102         {
103             // Create the OCCs based on on the /dev/occX devices
104             auto occs = findOCCsInDev();
105 
106             if (occs.empty() || (prevOCCSearch.size() != occs.size()))
107             {
108                 // Something changed or no OCCs yet, try again in 10s.
109                 // Note on the first pass prevOCCSearch will be empty,
110                 // so there will be at least one delay to give things
111                 // a chance to settle.
112                 prevOCCSearch = occs;
113 
114                 lg2::info(
115                     "Manager::findAndCreateObjects(): Waiting for OCCs (currently {QTY})",
116                     "QTY", occs.size());
117 
118                 discoverTimer->restartOnce(10s);
119             }
120             else
121             {
122                 // All OCCs appear to be available, create status objects
123 
124                 // createObjects requires OCC0 first.
125                 std::sort(occs.begin(), occs.end());
126 
127                 lg2::info(
128                     "Manager::findAndCreateObjects(): Creating {QTY} OCC Status Objects",
129                     "QTY", occs.size());
130                 for (auto id : occs)
131                 {
132                     createObjects(std::string(OCC_NAME) + std::to_string(id));
133                 }
134                 statusObjCreated = true;
135                 waitingForAllOccActiveSensors = true;
136 
137                 // Find/update the processor path associated with each OCC
138                 for (auto& obj : statusObjects)
139                 {
140                     obj->updateProcAssociation();
141                 }
142             }
143         }
144 
145         if (statusObjCreated && waitingForAllOccActiveSensors)
146         {
147             static bool tracedHostWait = false;
148             if (utils::isHostRunning())
149             {
150                 if (tracedHostWait)
151                 {
152                     lg2::info(
153                         "Manager::findAndCreateObjects(): Host is running");
154                     tracedHostWait = false;
155                 }
156                 checkAllActiveSensors();
157             }
158             else
159             {
160                 if (!tracedHostWait)
161                 {
162                     lg2::info(
163                         "Manager::findAndCreateObjects(): Waiting for host to start");
164                     tracedHostWait = true;
165                 }
166                 discoverTimer->restartOnce(30s);
167 #ifdef PLDM
168                 if (throttlePldmTraceTimer->isEnabled())
169                 {
170                     // Host is no longer running, disable throttle timer and
171                     // make sure traces are not throttled
172                     lg2::info("findAndCreateObjects(): disabling sensor timer");
173                     throttlePldmTraceTimer->setEnabled(false);
174                     pldmHandle->setTraceThrottle(false);
175                 }
176 #endif
177             }
178         }
179     }
180     else
181     {
182         lg2::info(
183             "Manager::findAndCreateObjects(): Waiting for {FILE} to complete...",
184             "FILE", HOST_ON_FILE);
185         discoverTimer->restartOnce(10s);
186     }
187 #endif
188 }
189 
190 #ifdef POWER10
191 // Check if all occActive sensors are available
checkAllActiveSensors()192 void Manager::checkAllActiveSensors()
193 {
194     static bool allActiveSensorAvailable = false;
195     static bool tracedSensorWait = false;
196     static bool waitingForHost = false;
197 
198     if (open_power::occ::utils::isHostRunning())
199     {
200         if (waitingForHost)
201         {
202             waitingForHost = false;
203             lg2::info("checkAllActiveSensors(): Host is now running");
204         }
205 
206         // Start with the assumption that all are available
207         allActiveSensorAvailable = true;
208         for (auto& obj : statusObjects)
209         {
210             if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
211             {
212                 auto instance = obj->getOccInstanceID();
213                 // Check if sensor was queued while waiting for discovery
214                 auto match = queuedActiveState.find(instance);
215                 if (match != queuedActiveState.end())
216                 {
217                     queuedActiveState.erase(match);
218                     lg2::info(
219                         "checkAllActiveSensors(): OCC{INST} is ACTIVE (queued)",
220                         "INST", instance);
221                     obj->occActive(true);
222                 }
223                 else
224                 {
225                     allActiveSensorAvailable = false;
226                     if (!tracedSensorWait)
227                     {
228                         lg2::info(
229                             "checkAllActiveSensors(): Waiting on OCC{INST} Active sensor",
230                             "INST", instance);
231                         tracedSensorWait = true;
232 #ifdef PLDM
233                         // Make sure PLDM traces are not throttled
234                         pldmHandle->setTraceThrottle(false);
235                         // Start timer to throttle PLDM traces when timer
236                         // expires
237                         onPldmTimeoutCreatePel = false;
238                         throttlePldmTraceTimer->restartOnce(5min);
239 #endif
240                     }
241 #ifdef PLDM
242                     // Ignore active sensor check if the OCCs are being reset
243                     if (!resetInProgress)
244                     {
245                         pldmHandle->checkActiveSensor(obj->getOccInstanceID());
246                     }
247 #endif
248                     break;
249                 }
250             }
251         }
252     }
253     else
254     {
255         if (!waitingForHost)
256         {
257             waitingForHost = true;
258             lg2::info("checkAllActiveSensors(): Waiting for host to start");
259 #ifdef PLDM
260             if (throttlePldmTraceTimer->isEnabled())
261             {
262                 // Host is no longer running, disable throttle timer and
263                 // make sure traces are not throttled
264                 lg2::info("checkAllActiveSensors(): disabling sensor timer");
265                 throttlePldmTraceTimer->setEnabled(false);
266                 pldmHandle->setTraceThrottle(false);
267             }
268 #endif
269         }
270     }
271 
272     if (allActiveSensorAvailable)
273     {
274         // All sensors were found, disable the discovery timer
275         if (discoverTimer->isEnabled())
276         {
277             discoverTimer->setEnabled(false);
278         }
279 #ifdef PLDM
280         if (throttlePldmTraceTimer->isEnabled())
281         {
282             // Disable throttle timer and make sure traces are not throttled
283             throttlePldmTraceTimer->setEnabled(false);
284             pldmHandle->setTraceThrottle(false);
285         }
286 #endif
287         if (waitingForAllOccActiveSensors)
288         {
289             lg2::info(
290                 "checkAllActiveSensors(): OCC Active sensors are available");
291             waitingForAllOccActiveSensors = false;
292 
293             if (resetRequired)
294             {
295                 initiateOccRequest(resetInstance);
296 
297                 if (!waitForAllOccsTimer->isEnabled())
298                 {
299                     lg2::warning(
300                         "occsNotAllRunning: Restarting waitForAllOccTimer");
301                     // restart occ wait timer to check status after reset
302                     // completes
303                     waitForAllOccsTimer->restartOnce(60s);
304                 }
305             }
306         }
307         queuedActiveState.clear();
308         tracedSensorWait = false;
309     }
310     else
311     {
312         // Not all sensors were available, so keep waiting
313         if (!tracedSensorWait)
314         {
315             lg2::info(
316                 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
317             tracedSensorWait = true;
318         }
319         discoverTimer->restartOnce(10s);
320     }
321 }
322 #endif
323 
findOCCsInDev()324 std::vector<int> Manager::findOCCsInDev()
325 {
326     std::vector<int> occs;
327     std::regex expr{R"(occ(\d+)$)"};
328 
329     for (auto& file : fs::directory_iterator("/dev"))
330     {
331         std::smatch match;
332         std::string path{file.path().string()};
333         if (std::regex_search(path, match, expr))
334         {
335             auto num = std::stoi(match[1].str());
336 
337             // /dev numbering starts at 1, ours starts at 0.
338             occs.push_back(num - 1);
339         }
340     }
341 
342     return occs;
343 }
344 
cpuCreated(sdbusplus::message_t & msg)345 int Manager::cpuCreated(sdbusplus::message_t& msg)
346 {
347     namespace fs = std::filesystem;
348 
349     sdbusplus::message::object_path o;
350     msg.read(o);
351     fs::path cpuPath(std::string(std::move(o)));
352 
353     auto name = cpuPath.filename().string();
354     auto index = name.find(CPU_NAME);
355     name.replace(index, std::strlen(CPU_NAME), OCC_NAME);
356 
357     createObjects(name);
358 
359     return 0;
360 }
361 
createObjects(const std::string & occ)362 void Manager::createObjects(const std::string& occ)
363 {
364     auto path = fs::path(OCC_CONTROL_ROOT) / occ;
365 
366     statusObjects.emplace_back(std::make_unique<Status>(
367         event, path.c_str(), *this,
368 #ifdef POWER10
369         pmode,
370 #endif
371         std::bind(std::mem_fn(&Manager::statusCallBack), this,
372                   std::placeholders::_1, std::placeholders::_2)
373 #ifdef PLDM
374             ,
375         // Callback will set flag indicating reset needs to be done
376         // instead of immediately issuing a reset via PLDM.
377         std::bind(std::mem_fn(&Manager::resetOccRequest), this,
378                   std::placeholders::_1)
379 #endif
380             ));
381 
382     // Create the power cap monitor object
383     if (!pcap)
384     {
385         pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
386             *statusObjects.back());
387     }
388 
389     if (statusObjects.back()->isMasterOcc())
390     {
391         lg2::info("Manager::createObjects(): OCC{INST} is the master", "INST",
392                   statusObjects.back()->getOccInstanceID());
393         _pollTimer->setEnabled(false);
394 
395 #ifdef POWER10
396         // Set the master OCC on the PowerMode object
397         pmode->setMasterOcc(path);
398 #endif
399     }
400 
401     passThroughObjects.emplace_back(std::make_unique<PassThrough>(
402         path.c_str()
403 #ifdef POWER10
404             ,
405         pmode
406 #endif
407         ));
408 }
409 
410 // If a reset is not already outstanding, set a flag to indicate that a reset is
411 // needed.
resetOccRequest(instanceID instance)412 void Manager::resetOccRequest(instanceID instance)
413 {
414     if (!resetRequired)
415     {
416         resetRequired = true;
417         resetInstance = instance;
418         lg2::error(
419             "resetOccRequest: PM Complex reset was requested due to OCC{INST}",
420             "INST", instance);
421     }
422     else if (instance != resetInstance)
423     {
424         lg2::warning(
425             "resetOccRequest: Ignoring PM Complex reset request for OCC{INST}, because reset already outstanding for OCC{RINST}",
426             "INST", instance, "RINST", resetInstance);
427     }
428 }
429 
430 // If a reset has not been started, initiate an OCC reset via PLDM
initiateOccRequest(instanceID instance)431 void Manager::initiateOccRequest(instanceID instance)
432 {
433     if (!resetInProgress)
434     {
435         resetInProgress = true;
436         resetInstance = instance;
437         lg2::error(
438             "initiateOccRequest: Initiating PM Complex reset due to OCC{INST}",
439             "INST", instance);
440 
441         // Make sure ALL OCC comm stops to all OCCs before the reset
442         for (auto& obj : statusObjects)
443         {
444             if (obj->occActive())
445             {
446                 obj->occActive(false);
447             }
448         }
449 
450 #ifdef PLDM
451         pldmHandle->resetOCC(instance);
452 #endif
453         resetRequired = false;
454     }
455     else
456     {
457         lg2::warning(
458             "initiateOccRequest: Ignoring PM Complex reset request for OCC{INST}, because reset already in process for OCC{RINST}",
459             "INST", instance, "RINST", resetInstance);
460     }
461 }
462 
statusCallBack(instanceID instance,bool status)463 void Manager::statusCallBack(instanceID instance, bool status)
464 {
465     if (status == true)
466     {
467         if (resetInProgress)
468         {
469             lg2::info(
470                 "statusCallBack: Ignoring OCC{INST} activate because a reset has been initiated due to OCC{RINST}",
471                 "INST", instance, "RINST", resetInstance);
472             return;
473         }
474 
475         // OCC went active
476         ++activeCount;
477 
478 #ifdef POWER10
479         if (activeCount == 1)
480         {
481             // First OCC went active (allow some time for all OCCs to go active)
482             waitForAllOccsTimer->restartOnce(60s);
483         }
484 #endif
485 
486         if (activeCount == statusObjects.size())
487         {
488 #ifdef POWER10
489             // All OCCs are now running
490             if (waitForAllOccsTimer->isEnabled())
491             {
492                 // stop occ wait timer
493                 waitForAllOccsTimer->setEnabled(false);
494             }
495 
496             // All OCCs have been found, check if we need a reset
497             if (resetRequired)
498             {
499                 initiateOccRequest(resetInstance);
500 
501                 if (!waitForAllOccsTimer->isEnabled())
502                 {
503                     lg2::warning(
504                         "occsNotAllRunning: Restarting waitForAllOccTimer");
505                     // restart occ wait timer
506                     waitForAllOccsTimer->restartOnce(60s);
507                 }
508             }
509             else
510             {
511                 // Verify master OCC and start presence monitor
512                 validateOccMaster();
513             }
514 #else
515             // Verify master OCC and start presence monitor
516             validateOccMaster();
517 #endif
518         }
519 
520         // Start poll timer if not already started (since at least one OCC is
521         // running)
522         if (!_pollTimer->isEnabled())
523         {
524             // An OCC just went active, PM Complex is just coming online so
525             // clear any outstanding reset requests
526             if (resetRequired)
527             {
528                 resetRequired = false;
529                 lg2::error(
530                     "statusCallBack: clearing resetRequired (since OCC{INST} went active, resetInProgress={RIP})",
531                     "INST", instance, "RIP", resetInProgress);
532             }
533 
534             lg2::info("Manager: OCCs will be polled every {TIME} seconds",
535                       "TIME", pollInterval);
536 
537             // Send poll and start OCC poll timer
538             pollerTimerExpired();
539         }
540     }
541     else
542     {
543         // OCC went away
544         if (activeCount > 0)
545         {
546             --activeCount;
547         }
548         else
549         {
550             lg2::info("OCC{INST} disabled, and no other OCCs are active",
551                       "INST", instance);
552         }
553 
554         if (activeCount == 0)
555         {
556             // No OCCs are running
557 
558             if (resetInProgress)
559             {
560                 // All OCC active sensors are clear (reset should be in
561                 // progress)
562                 lg2::info(
563                     "statusCallBack: Clearing resetInProgress (activeCount={COUNT}, OCC{INST}, status={STATUS})",
564                     "COUNT", activeCount, "INST", instance, "STATUS", status);
565                 resetInProgress = false;
566                 resetInstance = 255;
567             }
568 
569             // Stop OCC poll timer
570             if (_pollTimer->isEnabled())
571             {
572                 lg2::info(
573                     "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
574                 _pollTimer->setEnabled(false);
575             }
576 
577 #ifdef POWER10
578             // stop wait timer
579             if (waitForAllOccsTimer->isEnabled())
580             {
581                 waitForAllOccsTimer->setEnabled(false);
582             }
583 #endif
584         }
585         else if (resetInProgress)
586         {
587             lg2::info(
588                 "statusCallBack: Skipping clear of resetInProgress (activeCount={COUNT}, OCC{INST}, status={STATUS})",
589                 "COUNT", activeCount, "INST", instance, "STATUS", status);
590         }
591 #ifdef READ_OCC_SENSORS
592         // Clear OCC sensors
593         setSensorValueToNaN(instance);
594 #endif
595     }
596 
597 #ifdef POWER10
598     if (waitingForAllOccActiveSensors)
599     {
600         if (utils::isHostRunning())
601         {
602             checkAllActiveSensors();
603         }
604     }
605 #endif
606 }
607 
608 #ifdef I2C_OCC
initStatusObjects()609 void Manager::initStatusObjects()
610 {
611     // Make sure we have a valid path string
612     static_assert(sizeof(DEV_PATH) != 0);
613 
614     auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
615     for (auto& name : deviceNames)
616     {
617         i2c_occ::i2cToDbus(name);
618         name = std::string(OCC_NAME) + '_' + name;
619         auto path = fs::path(OCC_CONTROL_ROOT) / name;
620         statusObjects.emplace_back(
621             std::make_unique<Status>(event, path.c_str(), *this));
622     }
623     // The first device is master occ
624     pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
625         *statusObjects.front());
626 #ifdef POWER10
627     pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
628                                                    powermode::PIPS_PATH);
629     // Set the master OCC on the PowerMode object
630     pmode->setMasterOcc(path);
631 #endif
632 }
633 #endif
634 
635 #ifdef PLDM
sbeTimeout(unsigned int instance)636 void Manager::sbeTimeout(unsigned int instance)
637 {
638     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
639                             [instance](const auto& obj) {
640                                 return instance == obj->getOccInstanceID();
641                             });
642 
643     if (obj != statusObjects.end() && (*obj)->occActive())
644     {
645         lg2::info("SBE timeout, requesting HRESET (OCC{INST})", "INST",
646                   instance);
647 
648 #ifdef PHAL_SUPPORT
649         setSBEState(instance, SBE_STATE_NOT_USABLE);
650 #endif
651 
652         // Stop communication with this OCC
653         (*obj)->occActive(false);
654 
655         pldmHandle->sendHRESET(instance);
656     }
657 }
658 
updateOCCActive(instanceID instance,bool status)659 bool Manager::updateOCCActive(instanceID instance, bool status)
660 {
661     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
662                             [instance](const auto& obj) {
663                                 return instance == obj->getOccInstanceID();
664                             });
665 
666     const bool hostRunning = open_power::occ::utils::isHostRunning();
667     if (obj != statusObjects.end())
668     {
669         if (!hostRunning && (status == true))
670         {
671             lg2::warning(
672                 "updateOCCActive: Host is not running yet (OCC{INST} active={STAT}), clearing sensor received",
673                 "INST", instance, "STAT", status);
674             (*obj)->setPldmSensorReceived(false);
675             if (!waitingForAllOccActiveSensors)
676             {
677                 lg2::info(
678                     "updateOCCActive: Waiting for Host and all OCC Active Sensors");
679                 waitingForAllOccActiveSensors = true;
680             }
681 #ifdef POWER10
682             discoverTimer->restartOnce(30s);
683 #endif
684             return false;
685         }
686         else
687         {
688             (*obj)->setPldmSensorReceived(true);
689             return (*obj)->occActive(status);
690         }
691     }
692     else
693     {
694         if (hostRunning)
695         {
696             lg2::warning(
697                 "updateOCCActive: No status object to update for OCC{INST} (active={STAT})",
698                 "INST", instance, "STAT", status);
699         }
700         else
701         {
702             if (status == true)
703             {
704                 lg2::warning(
705                     "updateOCCActive: No status objects and Host is not running yet (OCC{INST} active={STAT})",
706                     "INST", instance, "STAT", status);
707             }
708         }
709         if (status == true)
710         {
711             // OCC went active
712             queuedActiveState.insert(instance);
713         }
714         else
715         {
716             auto match = queuedActiveState.find(instance);
717             if (match != queuedActiveState.end())
718             {
719                 // OCC was disabled
720                 queuedActiveState.erase(match);
721             }
722         }
723         return false;
724     }
725 }
726 
727 // Called upon pldm event To set powermode Safe Mode State for system.
updateOccSafeMode(bool safeMode)728 void Manager::updateOccSafeMode(bool safeMode)
729 {
730 #ifdef POWER10
731     pmode->updateDbusSafeMode(safeMode);
732 #endif
733     // Update the processor throttle status on dbus
734     for (auto& obj : statusObjects)
735     {
736         obj->updateThrottle(safeMode, THROTTLED_SAFE);
737     }
738 }
739 
sbeHRESETResult(instanceID instance,bool success)740 void Manager::sbeHRESETResult(instanceID instance, bool success)
741 {
742     if (success)
743     {
744         lg2::info("HRESET succeeded (OCC{INST})", "INST", instance);
745 
746 #ifdef PHAL_SUPPORT
747         setSBEState(instance, SBE_STATE_BOOTED);
748 #endif
749 
750         // Re-enable communication with this OCC
751         auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
752                                 [instance](const auto& obj) {
753                                     return instance == obj->getOccInstanceID();
754                                 });
755         if (obj != statusObjects.end() && (!(*obj)->occActive()))
756         {
757             (*obj)->occActive(true);
758         }
759 
760         return;
761     }
762 
763 #ifdef PHAL_SUPPORT
764     setSBEState(instance, SBE_STATE_FAILED);
765 
766     if (sbeCanDump(instance))
767     {
768         lg2::info("HRESET failed (OCC{INST}), triggering SBE dump", "INST",
769                   instance);
770 
771         auto& bus = utils::getBus();
772         uint32_t src6 = instance << 16;
773         uint32_t logId =
774             FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
775                             src6, "SBE command timeout");
776 
777         try
778         {
779             constexpr auto interface = "xyz.openbmc_project.Dump.Create";
780             constexpr auto function = "CreateDump";
781 
782             std::string service =
783                 utils::getService(OP_DUMP_OBJ_PATH, interface);
784             auto method = bus.new_method_call(service.c_str(), OP_DUMP_OBJ_PATH,
785                                               interface, function);
786 
787             std::map<std::string, std::variant<std::string, uint64_t>>
788                 createParams{
789                     {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
790                      uint64_t(logId)},
791                     {"com.ibm.Dump.Create.CreateParameters.DumpType",
792                      "com.ibm.Dump.Create.DumpType.SBE"},
793                     {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
794                      uint64_t(instance)},
795                 };
796 
797             method.append(createParams);
798 
799             auto response = bus.call(method);
800         }
801         catch (const sdbusplus::exception_t& e)
802         {
803             constexpr auto ERROR_DUMP_DISABLED =
804                 "xyz.openbmc_project.Dump.Create.Error.Disabled";
805             if (e.name() == ERROR_DUMP_DISABLED)
806             {
807                 lg2::info("Dump is disabled, skipping");
808             }
809             else
810             {
811                 lg2::error("Dump failed");
812             }
813         }
814     }
815 #endif
816 
817     // SBE Reset failed, try PM Complex reset
818     lg2::error("sbeHRESETResult: Forcing PM Complex reset");
819     resetOccRequest(instance);
820 }
821 
822 #ifdef PHAL_SUPPORT
sbeCanDump(unsigned int instance)823 bool Manager::sbeCanDump(unsigned int instance)
824 {
825     struct pdbg_target* proc = getPdbgTarget(instance);
826 
827     if (!proc)
828     {
829         // allow the dump in the error case
830         return true;
831     }
832 
833     try
834     {
835         if (!openpower::phal::sbe::isDumpAllowed(proc))
836         {
837             return false;
838         }
839 
840         if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
841         {
842             return false;
843         }
844     }
845     catch (openpower::phal::exception::SbeError& e)
846     {
847         lg2::info("Failed to query SBE state");
848     }
849 
850     // allow the dump in the error case
851     return true;
852 }
853 
setSBEState(unsigned int instance,enum sbe_state state)854 void Manager::setSBEState(unsigned int instance, enum sbe_state state)
855 {
856     struct pdbg_target* proc = getPdbgTarget(instance);
857 
858     if (!proc)
859     {
860         return;
861     }
862 
863     try
864     {
865         openpower::phal::sbe::setState(proc, state);
866     }
867     catch (const openpower::phal::exception::SbeError& e)
868     {
869         lg2::error("Failed to set SBE state: {ERROR}", "ERROR", e.what());
870     }
871 }
872 
getPdbgTarget(unsigned int instance)873 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
874 {
875     if (!pdbgInitialized)
876     {
877         try
878         {
879             openpower::phal::pdbg::init();
880             pdbgInitialized = true;
881         }
882         catch (const openpower::phal::exception::PdbgError& e)
883         {
884             lg2::error("pdbg initialization failed");
885             return nullptr;
886         }
887     }
888 
889     struct pdbg_target* proc = nullptr;
890     pdbg_for_each_class_target("proc", proc)
891     {
892         if (pdbg_target_index(proc) == instance)
893         {
894             return proc;
895         }
896     }
897 
898     lg2::error("Failed to get pdbg target");
899     return nullptr;
900 }
901 #endif
902 #endif
903 
pollerTimerExpired()904 void Manager::pollerTimerExpired()
905 {
906     if (!_pollTimer)
907     {
908         lg2::error("pollerTimerExpired() ERROR: Timer not defined");
909         return;
910     }
911 
912 #ifdef POWER10
913     if (resetRequired)
914     {
915         lg2::error("pollerTimerExpired() - Initiating PM Complex reset");
916         initiateOccRequest(resetInstance);
917 
918         if (!waitForAllOccsTimer->isEnabled())
919         {
920             lg2::warning("pollerTimerExpired: Restarting waitForAllOccTimer");
921             // restart occ wait timer
922             waitForAllOccsTimer->restartOnce(60s);
923         }
924         return;
925     }
926 #endif
927 
928     for (auto& obj : statusObjects)
929     {
930         if (!obj->occActive())
931         {
932             // OCC is not running yet
933 #ifdef READ_OCC_SENSORS
934             auto id = obj->getOccInstanceID();
935             setSensorValueToNaN(id);
936 #endif
937             continue;
938         }
939 
940         // Read sysfs to force kernel to poll OCC
941         obj->readOccState();
942 
943 #ifdef READ_OCC_SENSORS
944         // Read occ sensor values
945         getSensorValues(obj);
946 #endif
947     }
948 
949     if (activeCount > 0)
950     {
951         // Restart OCC poll timer
952         _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
953     }
954     else
955     {
956         // No OCCs running, so poll timer will not be restarted
957         lg2::info(
958             "Manager::pollerTimerExpired: poll timer will not be restarted");
959     }
960 }
961 
962 #ifdef READ_OCC_SENSORS
readTempSensors(const fs::path & path,uint32_t occInstance)963 void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
964 {
965     // There may be more than one sensor with the same FRU type
966     // and label so make two passes: the first to read the temps
967     // from sysfs, and the second to put them on D-Bus after
968     // resolving any conflicts.
969     std::map<std::string, double> sensorData;
970 
971     std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
972     for (auto& file : fs::directory_iterator(path))
973     {
974         if (!std::regex_search(file.path().string(), expr))
975         {
976             continue;
977         }
978 
979         uint32_t labelValue{0};
980 
981         try
982         {
983             labelValue = readFile<uint32_t>(file.path());
984         }
985         catch (const std::system_error& e)
986         {
987             lg2::debug(
988                 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
989                 "PATH", file.path().string(), "ERROR", e.code().value());
990             continue;
991         }
992 
993         const std::string& tempLabel = "label";
994         const std::string filePathString = file.path().string().substr(
995             0, file.path().string().length() - tempLabel.length());
996 
997         uint32_t fruTypeValue{0};
998         try
999         {
1000             fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
1001         }
1002         catch (const std::system_error& e)
1003         {
1004             lg2::debug(
1005                 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1006                 "PATH", filePathString + fruTypeSuffix, "ERROR",
1007                 e.code().value());
1008             continue;
1009         }
1010 
1011         std::string sensorPath =
1012             OCC_SENSORS_ROOT + std::string("/temperature/");
1013 
1014         std::string dvfsTempPath;
1015 
1016         if (fruTypeValue == VRMVdd)
1017         {
1018             sensorPath.append(
1019                 "vrm_vdd" + std::to_string(occInstance) + "_temp");
1020         }
1021         else if (fruTypeValue == processorIoRing)
1022         {
1023             sensorPath.append(
1024                 "proc" + std::to_string(occInstance) + "_ioring_temp");
1025             dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
1026                            std::to_string(occInstance) + "_ioring_dvfs_temp";
1027         }
1028         else
1029         {
1030             uint16_t type = (labelValue & 0xFF000000) >> 24;
1031             uint16_t instanceID = labelValue & 0x0000FFFF;
1032 
1033             if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
1034             {
1035                 if (fruTypeValue == fruTypeNotAvailable)
1036                 {
1037                     // Not all DIMM related temps are available to read
1038                     // (no _input file in this case)
1039                     continue;
1040                 }
1041                 auto iter = dimmTempSensorName.find(fruTypeValue);
1042                 if (iter == dimmTempSensorName.end())
1043                 {
1044                     lg2::error(
1045                         "readTempSensors: Fru type error! fruTypeValue = {FRU}) ",
1046                         "FRU", fruTypeValue);
1047                     continue;
1048                 }
1049 
1050                 sensorPath.append(
1051                     "dimm" + std::to_string(instanceID) + iter->second);
1052 
1053                 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
1054                                dimmDVFSSensorName.at(fruTypeValue);
1055             }
1056             else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
1057             {
1058                 if (fruTypeValue == processorCore)
1059                 {
1060                     // The OCC reports small core temps, of which there are
1061                     // two per big core.  All current P10 systems are in big
1062                     // core mode, so use a big core name.
1063                     uint16_t coreNum = instanceID / 2;
1064                     uint16_t tempNum = instanceID % 2;
1065                     sensorPath.append("proc" + std::to_string(occInstance) +
1066                                       "_core" + std::to_string(coreNum) + "_" +
1067                                       std::to_string(tempNum) + "_temp");
1068 
1069                     dvfsTempPath =
1070                         std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
1071                         std::to_string(occInstance) + "_core_dvfs_temp";
1072                 }
1073                 else
1074                 {
1075                     continue;
1076                 }
1077             }
1078             else
1079             {
1080                 continue;
1081             }
1082         }
1083 
1084         // The dvfs temp file only needs to be read once per chip per type.
1085         if (!dvfsTempPath.empty() &&
1086             !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
1087         {
1088             try
1089             {
1090                 auto dvfsValue = readFile<double>(filePathString + maxSuffix);
1091 
1092                 dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
1093                     dvfsTempPath, dvfsValue * std::pow(10, -3));
1094             }
1095             catch (const std::system_error& e)
1096             {
1097                 lg2::debug(
1098                     "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1099                     "PATH", filePathString + maxSuffix, "ERROR",
1100                     e.code().value());
1101             }
1102         }
1103 
1104         uint32_t faultValue{0};
1105         try
1106         {
1107             faultValue = readFile<uint32_t>(filePathString + faultSuffix);
1108         }
1109         catch (const std::system_error& e)
1110         {
1111             lg2::debug(
1112                 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1113                 "PATH", filePathString + faultSuffix, "ERROR",
1114                 e.code().value());
1115             continue;
1116         }
1117 
1118         double tempValue{0};
1119         // NOTE: if OCC sends back 0xFF, kernal sets this fault value to 1.
1120         if (faultValue != 0)
1121         {
1122             tempValue = std::numeric_limits<double>::quiet_NaN();
1123         }
1124         else
1125         {
1126             // Read the temperature
1127             try
1128             {
1129                 tempValue = readFile<double>(filePathString + inputSuffix);
1130             }
1131             catch (const std::system_error& e)
1132             {
1133                 lg2::debug(
1134                     "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1135                     "PATH", filePathString + inputSuffix, "ERROR",
1136                     e.code().value());
1137 
1138                 // if errno == EAGAIN(Resource temporarily unavailable) then set
1139                 // temp to 0, to avoid using old temp, and affecting FAN
1140                 // Control.
1141                 if (e.code().value() == EAGAIN)
1142                 {
1143                     tempValue = 0;
1144                 }
1145                 // else the errno would be something like
1146                 //     EBADF(Bad file descriptor)
1147                 // or ENOENT(No such file or directory)
1148                 else
1149                 {
1150                     continue;
1151                 }
1152             }
1153         }
1154 
1155         // If this object path already has a value, only overwite
1156         // it if the previous one was an NaN or a smaller value.
1157         auto existing = sensorData.find(sensorPath);
1158         if (existing != sensorData.end())
1159         {
1160             // Multiple sensors found for this FRU type
1161             if ((std::isnan(existing->second) && (tempValue == 0)) ||
1162                 ((existing->second == 0) && std::isnan(tempValue)))
1163             {
1164                 // One of the redundant sensors has failed (0xFF/nan), and the
1165                 // other sensor has no reading (0), so set the FRU to NaN to
1166                 // force fan increase
1167                 tempValue = std::numeric_limits<double>::quiet_NaN();
1168                 existing->second = tempValue;
1169             }
1170             if (std::isnan(existing->second) || (tempValue > existing->second))
1171             {
1172                 existing->second = tempValue;
1173             }
1174         }
1175         else
1176         {
1177             // First sensor for this FRU type
1178             sensorData[sensorPath] = tempValue;
1179         }
1180     }
1181 
1182     // Now publish the values on D-Bus.
1183     for (const auto& [objectPath, value] : sensorData)
1184     {
1185         dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
1186                                                     value * std::pow(10, -3));
1187 
1188         dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1189             objectPath, !std::isnan(value));
1190 
1191         if (existingSensors.find(objectPath) == existingSensors.end())
1192         {
1193             dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1194                 objectPath, {"all_sensors"});
1195         }
1196         existingSensors[objectPath] = occInstance;
1197     }
1198 }
1199 
getPowerLabelFunctionID(const std::string & value)1200 std::optional<std::string> Manager::getPowerLabelFunctionID(
1201     const std::string& value)
1202 {
1203     // If the value is "system", then the FunctionID is "system".
1204     if (value == "system")
1205     {
1206         return value;
1207     }
1208 
1209     // If the value is not "system", then the label value have 3 numbers, of
1210     // which we only care about the middle one:
1211     // <sensor id>_<function id>_<apss channel>
1212     // eg: The value is "0_10_5" , then the FunctionID is "10".
1213     if (value.find("_") == std::string::npos)
1214     {
1215         return std::nullopt;
1216     }
1217 
1218     auto powerLabelValue = value.substr((value.find("_") + 1));
1219 
1220     if (powerLabelValue.find("_") == std::string::npos)
1221     {
1222         return std::nullopt;
1223     }
1224 
1225     return powerLabelValue.substr(0, powerLabelValue.find("_"));
1226 }
1227 
readPowerSensors(const fs::path & path,uint32_t id)1228 void Manager::readPowerSensors(const fs::path& path, uint32_t id)
1229 {
1230     std::regex expr{"power\\d+_label$"}; // Example: power5_label
1231     for (auto& file : fs::directory_iterator(path))
1232     {
1233         if (!std::regex_search(file.path().string(), expr))
1234         {
1235             continue;
1236         }
1237 
1238         std::string labelValue;
1239         try
1240         {
1241             labelValue = readFile<std::string>(file.path());
1242         }
1243         catch (const std::system_error& e)
1244         {
1245             lg2::debug(
1246                 "readPowerSensors: Failed reading {PATH}, errno = {ERROR}",
1247                 "PATH", file.path().string(), "ERROR", e.code().value());
1248             continue;
1249         }
1250 
1251         auto functionID = getPowerLabelFunctionID(labelValue);
1252         if (functionID == std::nullopt)
1253         {
1254             continue;
1255         }
1256 
1257         const std::string& tempLabel = "label";
1258         const std::string filePathString = file.path().string().substr(
1259             0, file.path().string().length() - tempLabel.length());
1260 
1261         std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
1262 
1263         auto iter = powerSensorName.find(*functionID);
1264         if (iter == powerSensorName.end())
1265         {
1266             continue;
1267         }
1268         sensorPath.append(iter->second);
1269 
1270         double tempValue{0};
1271 
1272         try
1273         {
1274             tempValue = readFile<double>(filePathString + inputSuffix);
1275         }
1276         catch (const std::system_error& e)
1277         {
1278             lg2::debug(
1279                 "readPowerSensors: Failed reading {PATH}, errno = {ERROR}",
1280                 "PATH", filePathString + inputSuffix, "ERROR",
1281                 e.code().value());
1282             continue;
1283         }
1284 
1285         dbus::OccDBusSensors::getOccDBus().setUnit(
1286             sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");
1287 
1288         dbus::OccDBusSensors::getOccDBus().setValue(
1289             sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3));
1290 
1291         dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1292             sensorPath, true);
1293 
1294         if (existingSensors.find(sensorPath) == existingSensors.end())
1295         {
1296             std::vector<std::string> fTypeList = {"all_sensors"};
1297             if (iter->second == "total_power")
1298             {
1299                 // Set sensor purpose as TotalPower
1300                 dbus::OccDBusSensors::getOccDBus().setPurpose(
1301                     sensorPath,
1302                     "xyz.openbmc_project.Sensor.Purpose.SensorPurpose.TotalPower");
1303             }
1304             dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1305                 sensorPath, fTypeList);
1306         }
1307         existingSensors[sensorPath] = id;
1308     }
1309     return;
1310 }
1311 
readExtnSensors(const fs::path & path,uint32_t id)1312 void Manager::readExtnSensors(const fs::path& path, uint32_t id)
1313 {
1314     std::regex expr{"extn\\d+_label$"}; // Example: extn5_label
1315     for (auto& file : fs::directory_iterator(path))
1316     {
1317         if (!std::regex_search(file.path().string(), expr))
1318         {
1319             continue;
1320         }
1321 
1322         // Read in Label value of the sensor from file.
1323         std::string labelValue;
1324         try
1325         {
1326             labelValue = readFile<std::string>(file.path());
1327         }
1328         catch (const std::system_error& e)
1329         {
1330             lg2::debug(
1331                 "readExtnSensors:label Failed reading {PATH}, errno = {ERROR}",
1332                 "PATH", file.path().string(), "ERROR", e.code().value());
1333             continue;
1334         }
1335         const std::string& tempLabel = "label";
1336         const std::string filePathString = file.path().string().substr(
1337             0, file.path().string().length() - tempLabel.length());
1338 
1339         std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
1340 
1341         // Labels of EXTN sections from OCC interface Document
1342         //     have different formats.
1343         // 0x464d494e : FMIN            0x46444953 : FDIS
1344         // 0x46424153 : FBAS            0x46555400 : FUT
1345         // 0x464d4158 : FMAX            0x434c4950 : CLIP
1346         // 0x4d4f4445 : MODE            0x574f4643 : WOFC
1347         // 0x574f4649 : WOFI            0x5057524d : PWRM
1348         // 0x50575250 : PWRP            0x45525248 : ERRH
1349         // Label indicating byte 5 and 6 is the current (mem,proc) power in
1350         //      Watts.
1351         if ((labelValue == EXTN_LABEL_PWRM_MEMORY_POWER) ||
1352             (labelValue == EXTN_LABEL_PWRP_PROCESSOR_POWER))
1353         {
1354             // Build the dbus String for this chiplet power asset.
1355             if (labelValue == EXTN_LABEL_PWRP_PROCESSOR_POWER)
1356             {
1357                 labelValue = "_power";
1358             }
1359             else // else EXTN_LABEL_PWRM_MEMORY_POWER
1360             {
1361                 labelValue = "_mem_power";
1362             }
1363             sensorPath.append("chiplet" + std::to_string(id) + labelValue);
1364 
1365             // Read in data value of the sensor from file.
1366             // Read in as string due to different format of data in sensors.
1367             std::string extnValue;
1368             try
1369             {
1370                 extnValue = readFile<std::string>(filePathString + inputSuffix);
1371             }
1372             catch (const std::system_error& e)
1373             {
1374                 lg2::debug(
1375                     "readExtnSensors:value Failed reading {PATH}, errno = {ERROR}",
1376                     "PATH", filePathString + inputSuffix, "ERROR",
1377                     e.code().value());
1378                 continue;
1379             }
1380 
1381             // For Power field, Convert last 4 bytes of hex string into number
1382             //   value.
1383             std::stringstream ssData;
1384             ssData << std::hex << extnValue.substr(extnValue.length() - 4);
1385             uint16_t MyHexNumber;
1386             ssData >> MyHexNumber;
1387 
1388             // Convert output/DC power to input/AC power in Watts (round up)
1389             MyHexNumber =
1390                 std::round(((MyHexNumber / (PS_DERATING_FACTOR / 100.0))));
1391 
1392             dbus::OccDBusSensors::getOccDBus().setUnit(
1393                 sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");
1394 
1395             dbus::OccDBusSensors::getOccDBus().setValue(sensorPath,
1396                                                         MyHexNumber);
1397 
1398             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1399                 sensorPath, true);
1400 
1401             if (existingSensors.find(sensorPath) == existingSensors.end())
1402             {
1403                 dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1404                     sensorPath, {"all_sensors"});
1405             }
1406 
1407             existingSensors[sensorPath] = id;
1408         } // End Extended Power Sensors.
1409     } // End For loop on files for Extended Sensors.
1410     return;
1411 }
1412 
setSensorValueToNaN(uint32_t id) const1413 void Manager::setSensorValueToNaN(uint32_t id) const
1414 {
1415     for (const auto& [sensorPath, occId] : existingSensors)
1416     {
1417         if (occId == id)
1418         {
1419             dbus::OccDBusSensors::getOccDBus().setValue(
1420                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1421 
1422             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1423                 sensorPath, true);
1424         }
1425     }
1426     return;
1427 }
1428 
setSensorValueToNonFunctional(uint32_t id) const1429 void Manager::setSensorValueToNonFunctional(uint32_t id) const
1430 {
1431     for (const auto& [sensorPath, occId] : existingSensors)
1432     {
1433         if (occId == id)
1434         {
1435             dbus::OccDBusSensors::getOccDBus().setValue(
1436                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1437 
1438             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1439                 sensorPath, false);
1440         }
1441     }
1442     return;
1443 }
1444 
getSensorValues(std::unique_ptr<Status> & occ)1445 void Manager::getSensorValues(std::unique_ptr<Status>& occ)
1446 {
1447     static bool tracedError[8] = {0};
1448     const fs::path sensorPath = occ->getHwmonPath();
1449     const uint32_t id = occ->getOccInstanceID();
1450 
1451     if (fs::exists(sensorPath))
1452     {
1453         // Read temperature sensors
1454         readTempSensors(sensorPath, id);
1455         // Read Extended sensors
1456         readExtnSensors(sensorPath, id);
1457 
1458         if (occ->isMasterOcc())
1459         {
1460             // Read power sensors
1461             readPowerSensors(sensorPath, id);
1462         }
1463         tracedError[id] = false;
1464     }
1465     else
1466     {
1467         if (!tracedError[id])
1468         {
1469             lg2::error(
1470                 "Manager::getSensorValues: OCC{INST} sensor path missing: {PATH}",
1471                 "INST", id, "PATH", sensorPath);
1472             tracedError[id] = true;
1473         }
1474     }
1475 
1476     return;
1477 }
1478 #endif
1479 
1480 // Read the altitude from DBus
readAltitude()1481 void Manager::readAltitude()
1482 {
1483     static bool traceAltitudeErr = true;
1484 
1485     utils::PropertyValue altitudeProperty{};
1486     try
1487     {
1488         altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE,
1489                                               ALTITUDE_PROP);
1490         auto sensorVal = std::get<double>(altitudeProperty);
1491         if (sensorVal < 0xFFFF)
1492         {
1493             if (sensorVal < 0)
1494             {
1495                 altitude = 0;
1496             }
1497             else
1498             {
1499                 // Round to nearest meter
1500                 altitude = uint16_t(sensorVal + 0.5);
1501             }
1502             lg2::debug("readAltitude: sensor={VALUE} ({ALT}m)", "VALUE",
1503                        sensorVal, "ALT", altitude);
1504             traceAltitudeErr = true;
1505         }
1506         else
1507         {
1508             if (traceAltitudeErr)
1509             {
1510                 traceAltitudeErr = false;
1511                 lg2::debug("Invalid altitude value: {ALT}", "ALT", sensorVal);
1512             }
1513         }
1514     }
1515     catch (const sdbusplus::exception_t& e)
1516     {
1517         if (traceAltitudeErr)
1518         {
1519             traceAltitudeErr = false;
1520             lg2::info("Unable to read Altitude: {ERROR}", "ERROR", e.what());
1521         }
1522         altitude = 0xFFFF; // not available
1523     }
1524 }
1525 
1526 // Callback function when ambient temperature changes
ambientCallback(sdbusplus::message_t & msg)1527 void Manager::ambientCallback(sdbusplus::message_t& msg)
1528 {
1529     double currentTemp = 0;
1530     uint8_t truncatedTemp = 0xFF;
1531     std::string msgSensor;
1532     std::map<std::string, std::variant<double>> msgData;
1533     msg.read(msgSensor, msgData);
1534 
1535     auto valPropMap = msgData.find(AMBIENT_PROP);
1536     if (valPropMap == msgData.end())
1537     {
1538         lg2::debug("ambientCallback: Unknown ambient property changed");
1539         return;
1540     }
1541     currentTemp = std::get<double>(valPropMap->second);
1542     if (std::isnan(currentTemp))
1543     {
1544         truncatedTemp = 0xFF;
1545     }
1546     else
1547     {
1548         if (currentTemp < 0)
1549         {
1550             truncatedTemp = 0;
1551         }
1552         else
1553         {
1554             // Round to nearest degree C
1555             truncatedTemp = uint8_t(currentTemp + 0.5);
1556         }
1557     }
1558 
1559     // If ambient changes, notify OCCs
1560     if (truncatedTemp != ambient)
1561     {
1562         lg2::debug("ambientCallback: Ambient change from {OLD} to {NEW}C",
1563                    "OLD", ambient, "NEW", currentTemp);
1564 
1565         ambient = truncatedTemp;
1566         if (altitude == 0xFFFF)
1567         {
1568             // No altitude yet, try reading again
1569             readAltitude();
1570         }
1571 
1572         lg2::debug("ambientCallback: Ambient: {TEMP}C, altitude: {ALT}m",
1573                    "TEMP", ambient, "ALT", altitude);
1574 #ifdef POWER10
1575         // Send ambient and altitude to all OCCs
1576         for (auto& obj : statusObjects)
1577         {
1578             if (obj->occActive())
1579             {
1580                 obj->sendAmbient(ambient, altitude);
1581             }
1582         }
1583 #endif // POWER10
1584     }
1585 }
1586 
1587 // return the current ambient and altitude readings
getAmbientData(bool & ambientValid,uint8_t & ambientTemp,uint16_t & altitudeValue) const1588 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1589                              uint16_t& altitudeValue) const
1590 {
1591     ambientValid = true;
1592     ambientTemp = ambient;
1593     altitudeValue = altitude;
1594 
1595     if (ambient == 0xFF)
1596     {
1597         ambientValid = false;
1598     }
1599 }
1600 
1601 #ifdef POWER10
1602 // Called when waitForAllOccsTimer expires
1603 // After the first OCC goes active, this timer will be started (60 seconds)
occsNotAllRunning()1604 void Manager::occsNotAllRunning()
1605 {
1606     if (resetInProgress)
1607     {
1608         lg2::warning(
1609             "occsNotAllRunning: Ignoring waitForAllOccsTimer because reset is in progress");
1610         return;
1611     }
1612     if (activeCount != statusObjects.size())
1613     {
1614         // Not all OCCs went active
1615         lg2::warning(
1616             "occsNotAllRunning: Active OCC count ({COUNT}) does not match expected count ({EXP})",
1617             "COUNT", activeCount, "EXP", statusObjects.size());
1618         // Procs may be garded, so may be expected
1619     }
1620 
1621     if (resetRequired)
1622     {
1623         initiateOccRequest(resetInstance);
1624 
1625         if (!waitForAllOccsTimer->isEnabled())
1626         {
1627             lg2::warning("occsNotAllRunning: Restarting waitForAllOccTimer");
1628             // restart occ wait timer
1629             waitForAllOccsTimer->restartOnce(60s);
1630         }
1631     }
1632     else
1633     {
1634         validateOccMaster();
1635     }
1636 }
1637 
1638 #ifdef PLDM
1639 // Called when throttlePldmTraceTimer expires.
1640 // If this timer expires, that indicates there are no OCC active sensor PDRs
1641 // found which will trigger pldm traces to be throttled.
1642 // The second time this timer expires, a PEL will get created.
throttlePldmTraceExpired()1643 void Manager::throttlePldmTraceExpired()
1644 {
1645     if (utils::isHostRunning())
1646     {
1647         if (!onPldmTimeoutCreatePel)
1648         {
1649             // Throttle traces
1650             pldmHandle->setTraceThrottle(true);
1651             // Restart timer to log a PEL when timer expires
1652             onPldmTimeoutCreatePel = true;
1653             throttlePldmTraceTimer->restartOnce(40min);
1654         }
1655         else
1656         {
1657             lg2::error(
1658                 "throttlePldmTraceExpired(): OCC active sensors still not available!");
1659             // Create PEL
1660             createPldmSensorPEL();
1661         }
1662     }
1663     else
1664     {
1665         // Make sure traces are not throttled
1666         pldmHandle->setTraceThrottle(false);
1667         lg2::info(
1668             "throttlePldmTraceExpired(): host it not running ignoring sensor timer");
1669     }
1670 }
1671 
createPldmSensorPEL()1672 void Manager::createPldmSensorPEL()
1673 {
1674     Error::Descriptor d = Error::Descriptor(MISSING_OCC_SENSORS_PATH);
1675     std::map<std::string, std::string> additionalData;
1676 
1677     additionalData.emplace("_PID", std::to_string(getpid()));
1678 
1679     lg2::info(
1680         "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs");
1681 
1682     auto& bus = utils::getBus();
1683 
1684     try
1685     {
1686         FFDCFiles ffdc;
1687         // Add occ-control journal traces to PEL FFDC
1688         auto occJournalFile =
1689             FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40);
1690 
1691         static constexpr auto loggingObjectPath =
1692             "/xyz/openbmc_project/logging";
1693         static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL";
1694         std::string service =
1695             utils::getService(loggingObjectPath, opLoggingInterface);
1696         auto method =
1697             bus.new_method_call(service.c_str(), loggingObjectPath,
1698                                 opLoggingInterface, "CreatePELWithFFDCFiles");
1699 
1700         // Set level to Warning (Predictive).
1701         auto level =
1702             sdbusplus::xyz::openbmc_project::Logging::server::convertForMessage(
1703                 sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level::
1704                     Warning);
1705 
1706         method.append(d.path, level, additionalData, ffdc);
1707         bus.call(method);
1708     }
1709     catch (const sdbusplus::exception_t& e)
1710     {
1711         lg2::error("Failed to create MISSING_OCC_SENSORS PEL: {ERROR}", "ERROR",
1712                    e.what());
1713     }
1714 }
1715 #endif // PLDM
1716 #endif // POWER10
1717 
1718 // Verify single master OCC and start presence monitor
validateOccMaster()1719 void Manager::validateOccMaster()
1720 {
1721     int masterInstance = -1;
1722     for (auto& obj : statusObjects)
1723     {
1724         auto instance = obj->getOccInstanceID();
1725 #ifdef POWER10
1726         if (!obj->occActive())
1727         {
1728             if (utils::isHostRunning())
1729             {
1730                 // Check if sensor was queued while waiting for discovery
1731                 auto match = queuedActiveState.find(instance);
1732                 if (match != queuedActiveState.end())
1733                 {
1734                     queuedActiveState.erase(match);
1735                     lg2::info("validateOccMaster: OCC{INST} is ACTIVE (queued)",
1736                               "INST", instance);
1737                     obj->occActive(true);
1738                 }
1739                 else
1740                 {
1741                     // OCC does not appear to be active yet, check active sensor
1742 #ifdef PLDM
1743                     pldmHandle->checkActiveSensor(instance);
1744 #endif
1745                     if (obj->occActive())
1746                     {
1747                         lg2::info(
1748                             "validateOccMaster: OCC{INST} is ACTIVE after reading sensor",
1749                             "INST", instance);
1750                     }
1751                 }
1752             }
1753             else
1754             {
1755                 lg2::warning(
1756                     "validateOccMaster: HOST is not running (OCC{INST})",
1757                     "INST", instance);
1758                 return;
1759             }
1760         }
1761 #endif // POWER10
1762 
1763         if (obj->isMasterOcc())
1764         {
1765             obj->addPresenceWatchMaster();
1766 
1767             if (masterInstance == -1)
1768             {
1769                 masterInstance = instance;
1770             }
1771             else
1772             {
1773                 lg2::error(
1774                     "validateOccMaster: Multiple OCC masters! ({MAST1} and {MAST2})",
1775                     "MAST1", masterInstance, "MAST2", instance);
1776                 // request reset
1777                 obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
1778             }
1779         }
1780     }
1781 
1782     if (masterInstance < 0)
1783     {
1784         lg2::error("validateOccMaster: Master OCC not found! (of {NUM} OCCs)",
1785                    "NUM", statusObjects.size());
1786         // request reset
1787         statusObjects.front()->deviceError(
1788             Error::Descriptor(PRESENCE_ERROR_PATH));
1789     }
1790     else
1791     {
1792         lg2::info("validateOccMaster: OCC{INST} is master of {COUNT} OCCs",
1793                   "INST", masterInstance, "COUNT", activeCount);
1794 #ifdef POWER10
1795         pmode->updateDbusSafeMode(false);
1796 #endif
1797     }
1798 }
1799 
updatePcapBounds() const1800 void Manager::updatePcapBounds() const
1801 {
1802     if (pcap)
1803     {
1804         pcap->updatePcapBounds();
1805     }
1806 }
1807 
1808 // Clean up any variables since the OCC is no longer running.
1809 // Called when pldm receives an event indicating host is powered off.
hostPoweredOff()1810 void Manager::hostPoweredOff()
1811 {
1812     if (resetRequired)
1813     {
1814         lg2::info("hostPoweredOff: Clearing resetRequired for OCC{INST}",
1815                   "INST", resetInstance);
1816         resetRequired = false;
1817     }
1818     if (resetInProgress)
1819     {
1820         lg2::info("hostPoweredOff: Clearing resetInProgress for OCC{INST}",
1821                   "INST", resetInstance);
1822         resetInProgress = false;
1823     }
1824     resetInstance = 255;
1825 }
1826 
1827 } // namespace occ
1828 } // namespace open_power
1829