xref: /openbmc/openpower-occ-control/occ_manager.cpp (revision 7bd6d148e4f204266fe8ffd3df8aa93cf0284cd0)
1 #include "config.h"
2 
3 #include "occ_manager.hpp"
4 
5 #include "i2c_occ.hpp"
6 #include "occ_dbus.hpp"
7 #include "utils.hpp"
8 
9 #include <phosphor-logging/elog-errors.hpp>
10 #include <phosphor-logging/log.hpp>
11 #include <xyz/openbmc_project/Common/error.hpp>
12 
13 #include <chrono>
14 #include <cmath>
15 #include <filesystem>
16 #include <fstream>
17 #include <regex>
18 
19 namespace open_power
20 {
21 namespace occ
22 {
23 
// Value read from a sensor's fru_type file when no FRU type is available
constexpr uint32_t fruTypeNotAvailable = 0xFF;

// Suffixes appended to a sensor file prefix (for example "temp5_") to form
// the name of the attribute file that gets read
constexpr auto fruTypeSuffix = "fru_type";
constexpr auto faultSuffix = "fault";
constexpr auto inputSuffix = "input";
constexpr auto maxSuffix = "max";

// While this file exists, findAndCreateObjects() postpones OCC discovery
constexpr auto HOST_ON_FILE = "/run/openbmc/host@0-on";
31 
32 using namespace phosphor::logging;
33 using namespace std::literals::chrono_literals;
34 
/**
 * @brief Read a single formatted value of type T from a file.
 *
 * The stream is configured to throw on failbit, badbit and eofbit, so a
 * missing file, an unparsable value, or reaching end-of-file while
 * extracting are all reported as errors.
 *
 * @param[in] path - file to read
 * @return the value extracted with operator>>
 * @throws std::system_error carrying the errno observed when the stream
 *         operation failed (generic category)
 */
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream stream;
    stream.exceptions(std::ifstream::eofbit | std::ifstream::badbit |
                      std::ifstream::failbit);
    T value;

    try
    {
        stream.open(path);
        stream >> value;
        stream.close();
    }
    catch (const std::exception&)
    {
        // Translate any stream failure into a system_error built from errno
        const auto savedErrno = errno;
        throw std::system_error(savedErrno, std::generic_category());
    }

    return value;
}
57 
58 void Manager::findAndCreateObjects()
59 {
60 #ifndef POWER10
61     for (auto id = 0; id < MAX_CPUS; ++id)
62     {
63         // Create one occ per cpu
64         auto occ = std::string(OCC_NAME) + std::to_string(id);
65         createObjects(occ);
66     }
67 #else
68     if (!pmode)
69     {
70         // Create the power mode object
71         pmode = std::make_unique<powermode::PowerMode>(
72             *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
73     }
74 
75     if (!fs::exists(HOST_ON_FILE))
76     {
77         static bool statusObjCreated = false;
78         if (!statusObjCreated)
79         {
80             // Create the OCCs based on on the /dev/occX devices
81             auto occs = findOCCsInDev();
82 
83             if (occs.empty() || (prevOCCSearch.size() != occs.size()))
84             {
85                 // Something changed or no OCCs yet, try again in 10s.
86                 // Note on the first pass prevOCCSearch will be empty,
87                 // so there will be at least one delay to give things
88                 // a chance to settle.
89                 prevOCCSearch = occs;
90 
91                 log<level::INFO>(
92                     fmt::format(
93                         "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})",
94                         occs.size())
95                         .c_str());
96 
97                 discoverTimer->restartOnce(10s);
98             }
99             else
100             {
101                 // All OCCs appear to be available, create status objects
102 
103                 // createObjects requires OCC0 first.
104                 std::sort(occs.begin(), occs.end());
105 
106                 log<level::INFO>(
107                     fmt::format(
108                         "Manager::findAndCreateObjects(): Creating {} OCC Status Objects",
109                         occs.size())
110                         .c_str());
111                 for (auto id : occs)
112                 {
113                     createObjects(std::string(OCC_NAME) + std::to_string(id));
114                 }
115                 statusObjCreated = true;
116                 waitingForAllOccActiveSensors = true;
117             }
118         }
119 
120         if (statusObjCreated && waitingForAllOccActiveSensors)
121         {
122             static bool tracedHostWait = false;
123             if (utils::isHostRunning())
124             {
125                 if (tracedHostWait)
126                 {
127                     log<level::INFO>(
128                         "Manager::findAndCreateObjects(): Host is running");
129                     tracedHostWait = false;
130                 }
131                 checkAllActiveSensors();
132             }
133             else
134             {
135                 if (!tracedHostWait)
136                 {
137                     log<level::INFO>(
138                         "Manager::findAndCreateObjects(): Waiting for host to start");
139                     tracedHostWait = true;
140                 }
141                 discoverTimer->restartOnce(30s);
142             }
143         }
144     }
145     else
146     {
147         log<level::INFO>(
148             fmt::format(
149                 "Manager::findAndCreateObjects(): Waiting for {} to complete...",
150                 HOST_ON_FILE)
151                 .c_str());
152         discoverTimer->restartOnce(10s);
153     }
154 #endif
155 }
156 
157 #ifdef POWER10
158 // Check if all occActive sensors are available
159 void Manager::checkAllActiveSensors()
160 {
161     static bool allActiveSensorAvailable = false;
162     static bool tracedSensorWait = false;
163 
164     // Start with the assumption that all are available
165     allActiveSensorAvailable = true;
166     for (auto& obj : statusObjects)
167     {
168         if (!obj->occActive())
169         {
170             if (!obj->getPldmSensorReceived())
171             {
172                 auto instance = obj->getOccInstanceID();
173                 // Check if sensor was queued while waiting for discovery
174                 auto match = queuedActiveState.find(instance);
175                 if (match != queuedActiveState.end())
176                 {
177                     queuedActiveState.erase(match);
178                     log<level::INFO>(
179                         fmt::format(
180                             "checkAllActiveSensors(): OCC{} is ACTIVE (queued)",
181                             instance)
182                             .c_str());
183                     obj->occActive(true);
184                 }
185                 else
186                 {
187                     allActiveSensorAvailable = false;
188                     if (!tracedSensorWait)
189                     {
190                         log<level::INFO>(
191                             fmt::format(
192                                 "checkAllActiveSensors(): Waiting on OCC{} Active sensor",
193                                 instance)
194                                 .c_str());
195                         tracedSensorWait = true;
196                     }
197                     pldmHandle->checkActiveSensor(obj->getOccInstanceID());
198                     break;
199                 }
200             }
201         }
202     }
203 
204     if (allActiveSensorAvailable)
205     {
206         // All sensors were found, disable the discovery timer
207         if (discoverTimer->isEnabled())
208         {
209             discoverTimer->setEnabled(false);
210         }
211 
212         if (waitingForAllOccActiveSensors)
213         {
214             log<level::INFO>(
215                 "checkAllActiveSensors(): OCC Active sensors are available");
216             waitingForAllOccActiveSensors = false;
217         }
218         queuedActiveState.clear();
219         tracedSensorWait = false;
220     }
221     else
222     {
223         // Not all sensors were available, so keep waiting
224         if (!tracedSensorWait)
225         {
226             log<level::INFO>(
227                 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
228             tracedSensorWait = true;
229         }
230         discoverTimer->restartOnce(10s);
231     }
232 }
233 #endif
234 
235 std::vector<int> Manager::findOCCsInDev()
236 {
237     std::vector<int> occs;
238     std::regex expr{R"(occ(\d+)$)"};
239 
240     for (auto& file : fs::directory_iterator("/dev"))
241     {
242         std::smatch match;
243         std::string path{file.path().string()};
244         if (std::regex_search(path, match, expr))
245         {
246             auto num = std::stoi(match[1].str());
247 
248             // /dev numbering starts at 1, ours starts at 0.
249             occs.push_back(num - 1);
250         }
251     }
252 
253     return occs;
254 }
255 
256 int Manager::cpuCreated(sdbusplus::message_t& msg)
257 {
258     namespace fs = std::filesystem;
259 
260     sdbusplus::message::object_path o;
261     msg.read(o);
262     fs::path cpuPath(std::string(std::move(o)));
263 
264     auto name = cpuPath.filename().string();
265     auto index = name.find(CPU_NAME);
266     name.replace(index, std::strlen(CPU_NAME), OCC_NAME);
267 
268     createObjects(name);
269 
270     return 0;
271 }
272 
273 void Manager::createObjects(const std::string& occ)
274 {
275     auto path = fs::path(OCC_CONTROL_ROOT) / occ;
276 
277     statusObjects.emplace_back(std::make_unique<Status>(
278         event, path.c_str(), *this,
279 #ifdef POWER10
280         pmode,
281 #endif
282         std::bind(std::mem_fn(&Manager::statusCallBack), this,
283                   std::placeholders::_1, std::placeholders::_2)
284 #ifdef PLDM
285             ,
286         std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(),
287                   std::placeholders::_1)
288 #endif
289             ));
290 
291     // Create the power cap monitor object
292     if (!pcap)
293     {
294         pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
295             *statusObjects.back());
296     }
297 
298     if (statusObjects.back()->isMasterOcc())
299     {
300         log<level::INFO>(
301             fmt::format("Manager::createObjects(): OCC{} is the master",
302                         statusObjects.back()->getOccInstanceID())
303                 .c_str());
304         _pollTimer->setEnabled(false);
305 
306 #ifdef POWER10
307         // Set the master OCC on the PowerMode object
308         pmode->setMasterOcc(path);
309 #endif
310     }
311 
312     passThroughObjects.emplace_back(std::make_unique<PassThrough>(path.c_str()
313 #ifdef POWER10
314                                                                       ,
315                                                                   pmode
316 #endif
317                                                                   ));
318 }
319 
320 void Manager::statusCallBack(instanceID instance, bool status)
321 {
322     using InternalFailure =
323         sdbusplus::xyz::openbmc_project::Common::Error::InternalFailure;
324 
325     // At this time, it won't happen but keeping it
326     // here just in case something changes in the future
327     if ((activeCount == 0) && (!status))
328     {
329         log<level::ERR>(
330             fmt::format("Invalid update on OCCActive with OCC{}", instance)
331                 .c_str());
332 
333         elog<InternalFailure>();
334     }
335 
336     if (status == true)
337     {
338         // OCC went active
339         ++activeCount;
340 
341 #ifdef POWER10
342         if (activeCount == 1)
343         {
344             // First OCC went active (allow some time for all OCCs to go active)
345             waitForAllOccsTimer->restartOnce(60s);
346         }
347 #endif
348 
349         if (activeCount == statusObjects.size())
350         {
351 #ifdef POWER10
352             // All OCCs are now running
353             if (waitForAllOccsTimer->isEnabled())
354             {
355                 // stop occ wait timer
356                 waitForAllOccsTimer->setEnabled(false);
357             }
358 #endif
359 
360             // Verify master OCC and start presence monitor
361             validateOccMaster();
362         }
363 
364         // Start poll timer if not already started
365         if (!_pollTimer->isEnabled())
366         {
367             log<level::INFO>(
368                 fmt::format("Manager: OCCs will be polled every {} seconds",
369                             pollInterval)
370                     .c_str());
371 
372             // Send poll and start OCC poll timer
373             pollerTimerExpired();
374         }
375     }
376     else
377     {
378         // OCC went away
379         --activeCount;
380 
381         if (activeCount == 0)
382         {
383             // No OCCs are running
384 
385             // Stop OCC poll timer
386             if (_pollTimer->isEnabled())
387             {
388                 log<level::INFO>(
389                     "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
390                 _pollTimer->setEnabled(false);
391             }
392 
393 #ifdef POWER10
394             // stop wait timer
395             if (waitForAllOccsTimer->isEnabled())
396             {
397                 waitForAllOccsTimer->setEnabled(false);
398             }
399 #endif
400         }
401 #ifdef READ_OCC_SENSORS
402         // Clear OCC sensors
403         setSensorValueToNaN(instance);
404 #endif
405     }
406 
407 #ifdef POWER10
408     if (waitingForAllOccActiveSensors)
409     {
410         if (utils::isHostRunning())
411         {
412             checkAllActiveSensors();
413         }
414     }
415 #endif
416 }
417 
418 #ifdef I2C_OCC
419 void Manager::initStatusObjects()
420 {
421     // Make sure we have a valid path string
422     static_assert(sizeof(DEV_PATH) != 0);
423 
424     auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
425     for (auto& name : deviceNames)
426     {
427         i2c_occ::i2cToDbus(name);
428         name = std::string(OCC_NAME) + '_' + name;
429         auto path = fs::path(OCC_CONTROL_ROOT) / name;
430         statusObjects.emplace_back(
431             std::make_unique<Status>(event, path.c_str(), *this));
432     }
433     // The first device is master occ
434     pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
435         *statusObjects.front());
436 #ifdef POWER10
437     pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
438                                                    powermode::PIPS_PATH);
439     // Set the master OCC on the PowerMode object
440     pmode->setMasterOcc(path);
441 #endif
442 }
443 #endif
444 
445 #ifdef PLDM
446 void Manager::sbeTimeout(unsigned int instance)
447 {
448     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
449                             [instance](const auto& obj) {
450                                 return instance == obj->getOccInstanceID();
451                             });
452 
453     if (obj != statusObjects.end() && (*obj)->occActive())
454     {
455         log<level::INFO>(
456             fmt::format("SBE timeout, requesting HRESET (OCC{})", instance)
457                 .c_str());
458 
459         setSBEState(instance, SBE_STATE_NOT_USABLE);
460 
461         pldmHandle->sendHRESET(instance);
462     }
463 }
464 
465 bool Manager::updateOCCActive(instanceID instance, bool status)
466 {
467     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
468                             [instance](const auto& obj) {
469                                 return instance == obj->getOccInstanceID();
470                             });
471 
472     if (obj != statusObjects.end())
473     {
474         (*obj)->setPldmSensorReceived(true);
475         return (*obj)->occActive(status);
476     }
477     else
478     {
479         log<level::WARNING>(
480             fmt::format(
481                 "Manager::updateOCCActive: No status object to update for OCC{} (active={})",
482                 instance, status)
483                 .c_str());
484         if (status == true)
485         {
486             // OCC went active
487             queuedActiveState.insert(instance);
488         }
489         else
490         {
491             auto match = queuedActiveState.find(instance);
492             if (match != queuedActiveState.end())
493             {
494                 // OCC was disabled
495                 queuedActiveState.erase(match);
496             }
497         }
498         return false;
499     }
500 }
501 
502 // Called upon pldm event To set powermode Safe Mode State for system.
503 void Manager::updateOccSafeMode(bool safeMode)
504 {
505 #ifdef POWER10
506     pmode->updateDbusSafeMode(safeMode);
507 #endif
508 }
509 
510 void Manager::sbeHRESETResult(instanceID instance, bool success)
511 {
512     if (success)
513     {
514         log<level::INFO>(
515             fmt::format("HRESET succeeded (OCC{})", instance).c_str());
516 
517         setSBEState(instance, SBE_STATE_BOOTED);
518 
519         return;
520     }
521 
522     setSBEState(instance, SBE_STATE_FAILED);
523 
524     if (sbeCanDump(instance))
525     {
526         log<level::INFO>(
527             fmt::format("HRESET failed (OCC{}), triggering SBE dump", instance)
528                 .c_str());
529 
530         auto& bus = utils::getBus();
531         uint32_t src6 = instance << 16;
532         uint32_t logId =
533             FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
534                             src6, "SBE command timeout");
535 
536         try
537         {
538             constexpr auto path = "/org/openpower/dump";
539             constexpr auto interface = "xyz.openbmc_project.Dump.Create";
540             constexpr auto function = "CreateDump";
541 
542             std::string service = utils::getService(path, interface);
543             auto method =
544                 bus.new_method_call(service.c_str(), path, interface, function);
545 
546             std::map<std::string, std::variant<std::string, uint64_t>>
547                 createParams{
548                     {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
549                      uint64_t(logId)},
550                     {"com.ibm.Dump.Create.CreateParameters.DumpType",
551                      "com.ibm.Dump.Create.DumpType.SBE"},
552                     {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
553                      uint64_t(instance)},
554                 };
555 
556             method.append(createParams);
557 
558             auto response = bus.call(method);
559         }
560         catch (const sdbusplus::exception_t& e)
561         {
562             constexpr auto ERROR_DUMP_DISABLED =
563                 "xyz.openbmc_project.Dump.Create.Error.Disabled";
564             if (e.name() == ERROR_DUMP_DISABLED)
565             {
566                 log<level::INFO>("Dump is disabled, skipping");
567             }
568             else
569             {
570                 log<level::ERR>("Dump failed");
571             }
572         }
573     }
574 }
575 
576 bool Manager::sbeCanDump(unsigned int instance)
577 {
578     struct pdbg_target* proc = getPdbgTarget(instance);
579 
580     if (!proc)
581     {
582         // allow the dump in the error case
583         return true;
584     }
585 
586     try
587     {
588         if (!openpower::phal::sbe::isDumpAllowed(proc))
589         {
590             return false;
591         }
592 
593         if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
594         {
595             return false;
596         }
597     }
598     catch (openpower::phal::exception::SbeError& e)
599     {
600         log<level::INFO>("Failed to query SBE state");
601     }
602 
603     // allow the dump in the error case
604     return true;
605 }
606 
607 void Manager::setSBEState(unsigned int instance, enum sbe_state state)
608 {
609     struct pdbg_target* proc = getPdbgTarget(instance);
610 
611     if (!proc)
612     {
613         return;
614     }
615 
616     try
617     {
618         openpower::phal::sbe::setState(proc, state);
619     }
620     catch (const openpower::phal::exception::SbeError& e)
621     {
622         log<level::ERR>("Failed to set SBE state");
623     }
624 }
625 
626 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
627 {
628     if (!pdbgInitialized)
629     {
630         try
631         {
632             openpower::phal::pdbg::init();
633             pdbgInitialized = true;
634         }
635         catch (const openpower::phal::exception::PdbgError& e)
636         {
637             log<level::ERR>("pdbg initialization failed");
638             return nullptr;
639         }
640     }
641 
642     struct pdbg_target* proc = nullptr;
643     pdbg_for_each_class_target("proc", proc)
644     {
645         if (pdbg_target_index(proc) == instance)
646         {
647             return proc;
648         }
649     }
650 
651     log<level::ERR>("Failed to get pdbg target");
652     return nullptr;
653 }
654 #endif
655 
656 void Manager::pollerTimerExpired()
657 {
658     if (!_pollTimer)
659     {
660         log<level::ERR>(
661             "Manager::pollerTimerExpired() ERROR: Timer not defined");
662         return;
663     }
664 
665     for (auto& obj : statusObjects)
666     {
667         if (!obj->occActive())
668         {
669             // OCC is not running yet
670 #ifdef READ_OCC_SENSORS
671             auto id = obj->getOccInstanceID();
672             setSensorValueToNaN(id);
673 #endif
674             continue;
675         }
676 
677         // Read sysfs to force kernel to poll OCC
678         obj->readOccState();
679 
680 #ifdef READ_OCC_SENSORS
681         // Read occ sensor values
682         getSensorValues(obj);
683 #endif
684     }
685 
686     if (activeCount > 0)
687     {
688         // Restart OCC poll timer
689         _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
690     }
691     else
692     {
693         // No OCCs running, so poll timer will not be restarted
694         log<level::INFO>(
695             fmt::format(
696                 "Manager::pollerTimerExpired: poll timer will not be restarted")
697                 .c_str());
698     }
699 }
700 
701 #ifdef READ_OCC_SENSORS
702 void Manager::readTempSensors(const fs::path& path, uint32_t id)
703 {
704     std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
705     for (auto& file : fs::directory_iterator(path))
706     {
707         if (!std::regex_search(file.path().string(), expr))
708         {
709             continue;
710         }
711 
712         uint32_t labelValue{0};
713 
714         try
715         {
716             labelValue = readFile<uint32_t>(file.path());
717         }
718         catch (const std::system_error& e)
719         {
720             log<level::DEBUG>(
721                 fmt::format("readTempSensors: Failed reading {}, errno = {}",
722                             file.path().string(), e.code().value())
723                     .c_str());
724             continue;
725         }
726 
727         const std::string& tempLabel = "label";
728         const std::string filePathString = file.path().string().substr(
729             0, file.path().string().length() - tempLabel.length());
730 
731         uint32_t fruTypeValue{0};
732         try
733         {
734             fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
735         }
736         catch (const std::system_error& e)
737         {
738             log<level::DEBUG>(
739                 fmt::format("readTempSensors: Failed reading {}, errno = {}",
740                             filePathString + fruTypeSuffix, e.code().value())
741                     .c_str());
742             continue;
743         }
744 
745         std::string sensorPath =
746             OCC_SENSORS_ROOT + std::string("/temperature/");
747 
748         std::string dvfsTempPath;
749 
750         if (fruTypeValue == VRMVdd)
751         {
752             sensorPath.append("vrm_vdd" + std::to_string(id) + "_temp");
753         }
754         else if (fruTypeValue == processorIoRing)
755         {
756             sensorPath.append("proc" + std::to_string(id) + "_ioring_temp");
757             dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
758                            std::to_string(id) + "_ioring_dvfs_temp";
759         }
760         else
761         {
762             uint16_t type = (labelValue & 0xFF000000) >> 24;
763             uint16_t instanceID = labelValue & 0x0000FFFF;
764 
765             if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
766             {
767                 if (fruTypeValue == fruTypeNotAvailable)
768                 {
769                     // Not all DIMM related temps are available to read
770                     // (no _input file in this case)
771                     continue;
772                 }
773                 auto iter = dimmTempSensorName.find(fruTypeValue);
774                 if (iter == dimmTempSensorName.end())
775                 {
776                     log<level::ERR>(
777                         fmt::format(
778                             "readTempSensors: Fru type error! fruTypeValue = {}) ",
779                             fruTypeValue)
780                             .c_str());
781                     continue;
782                 }
783 
784                 sensorPath.append("dimm" + std::to_string(instanceID) +
785                                   iter->second);
786             }
787             else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
788             {
789                 if (fruTypeValue == processorCore)
790                 {
791                     // The OCC reports small core temps, of which there are
792                     // two per big core.  All current P10 systems are in big
793                     // core mode, so use a big core name.
794                     uint16_t coreNum = instanceID / 2;
795                     uint16_t tempNum = instanceID % 2;
796                     sensorPath.append("proc" + std::to_string(id) + "_core" +
797                                       std::to_string(coreNum) + "_" +
798                                       std::to_string(tempNum) + "_temp");
799 
800                     dvfsTempPath = std::string{OCC_SENSORS_ROOT} +
801                                    "/temperature/proc" + std::to_string(id) +
802                                    "_core_dvfs_temp";
803                 }
804                 else
805                 {
806                     continue;
807                 }
808             }
809             else
810             {
811                 continue;
812             }
813         }
814 
815         // The dvfs temp file only needs to be read once per chip per type.
816         if (!dvfsTempPath.empty() &&
817             !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
818         {
819             try
820             {
821                 auto dvfsValue = readFile<double>(filePathString + maxSuffix);
822 
823                 dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
824                     dvfsTempPath, dvfsValue * std::pow(10, -3));
825             }
826             catch (const std::system_error& e)
827             {
828                 log<level::DEBUG>(
829                     fmt::format(
830                         "readTempSensors: Failed reading {}, errno = {}",
831                         filePathString + maxSuffix, e.code().value())
832                         .c_str());
833             }
834         }
835 
836         uint32_t faultValue{0};
837         try
838         {
839             faultValue = readFile<uint32_t>(filePathString + faultSuffix);
840         }
841         catch (const std::system_error& e)
842         {
843             log<level::DEBUG>(
844                 fmt::format("readTempSensors: Failed reading {}, errno = {}",
845                             filePathString + faultSuffix, e.code().value())
846                     .c_str());
847             continue;
848         }
849 
850         // NOTE: if OCC sends back 0xFF kernal sets this fault value to 1.
851         if (faultValue != 0)
852         {
853             dbus::OccDBusSensors::getOccDBus().setValue(
854                 sensorPath, std::numeric_limits<double>::quiet_NaN());
855 
856             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
857                                                                     false);
858 
859             continue;
860         }
861 
862         double tempValue{0};
863 
864         try
865         {
866             tempValue = readFile<double>(filePathString + inputSuffix);
867         }
868         catch (const std::system_error& e)
869         {
870             log<level::DEBUG>(
871                 fmt::format("readTempSensors: Failed reading {}, errno = {}",
872                             filePathString + inputSuffix, e.code().value())
873                     .c_str());
874 
875             // if errno == EAGAIN(Resource temporarily unavailable) then set
876             // temp to 0, to avoid using old temp, and affecting FAN Control.
877             if (e.code().value() == EAGAIN)
878             {
879                 tempValue = 0;
880             }
881             // else the errno would be something like
882             //     EBADF(Bad file descriptor)
883             // or ENOENT(No such file or directory)
884             else
885             {
886                 continue;
887             }
888         }
889 
890         dbus::OccDBusSensors::getOccDBus().setValue(
891             sensorPath, tempValue * std::pow(10, -3));
892 
893         dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
894                                                                 true);
895 
896         // At this point, the sensor will be created for sure.
897         if (existingSensors.find(sensorPath) == existingSensors.end())
898         {
899             dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
900                 sensorPath);
901         }
902 
903         existingSensors[sensorPath] = id;
904     }
905     return;
906 }
907 
908 std::optional<std::string>
909     Manager::getPowerLabelFunctionID(const std::string& value)
910 {
911     // If the value is "system", then the FunctionID is "system".
912     if (value == "system")
913     {
914         return value;
915     }
916 
917     // If the value is not "system", then the label value have 3 numbers, of
918     // which we only care about the middle one:
919     // <sensor id>_<function id>_<apss channel>
920     // eg: The value is "0_10_5" , then the FunctionID is "10".
921     if (value.find("_") == std::string::npos)
922     {
923         return std::nullopt;
924     }
925 
926     auto powerLabelValue = value.substr((value.find("_") + 1));
927 
928     if (powerLabelValue.find("_") == std::string::npos)
929     {
930         return std::nullopt;
931     }
932 
933     return powerLabelValue.substr(0, powerLabelValue.find("_"));
934 }
935 
936 void Manager::readPowerSensors(const fs::path& path, uint32_t id)
937 {
938     std::regex expr{"power\\d+_label$"}; // Example: power5_label
939     for (auto& file : fs::directory_iterator(path))
940     {
941         if (!std::regex_search(file.path().string(), expr))
942         {
943             continue;
944         }
945 
946         std::string labelValue;
947         try
948         {
949             labelValue = readFile<std::string>(file.path());
950         }
951         catch (const std::system_error& e)
952         {
953             log<level::DEBUG>(
954                 fmt::format("readPowerSensors: Failed reading {}, errno = {}",
955                             file.path().string(), e.code().value())
956                     .c_str());
957             continue;
958         }
959 
960         auto functionID = getPowerLabelFunctionID(labelValue);
961         if (functionID == std::nullopt)
962         {
963             continue;
964         }
965 
966         const std::string& tempLabel = "label";
967         const std::string filePathString = file.path().string().substr(
968             0, file.path().string().length() - tempLabel.length());
969 
970         std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
971 
972         auto iter = powerSensorName.find(*functionID);
973         if (iter == powerSensorName.end())
974         {
975             continue;
976         }
977         sensorPath.append(iter->second);
978 
979         double tempValue{0};
980 
981         try
982         {
983             tempValue = readFile<double>(filePathString + inputSuffix);
984         }
985         catch (const std::system_error& e)
986         {
987             log<level::DEBUG>(
988                 fmt::format("readPowerSensors: Failed reading {}, errno = {}",
989                             filePathString + inputSuffix, e.code().value())
990                     .c_str());
991             continue;
992         }
993 
994         dbus::OccDBusSensors::getOccDBus().setUnit(
995             sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");
996 
997         dbus::OccDBusSensors::getOccDBus().setValue(
998             sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3));
999 
1000         dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
1001                                                                 true);
1002 
1003         if (existingSensors.find(sensorPath) == existingSensors.end())
1004         {
1005             dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1006                 sensorPath);
1007         }
1008 
1009         existingSensors[sensorPath] = id;
1010     }
1011     return;
1012 }
1013 
1014 void Manager::setSensorValueToNaN(uint32_t id) const
1015 {
1016     for (const auto& [sensorPath, occId] : existingSensors)
1017     {
1018         if (occId == id)
1019         {
1020             dbus::OccDBusSensors::getOccDBus().setValue(
1021                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1022 
1023             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
1024                                                                     true);
1025         }
1026     }
1027     return;
1028 }
1029 
1030 void Manager::setSensorValueToNonFunctional(uint32_t id) const
1031 {
1032     for (const auto& [sensorPath, occId] : existingSensors)
1033     {
1034         if (occId == id)
1035         {
1036             dbus::OccDBusSensors::getOccDBus().setValue(
1037                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1038 
1039             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
1040                                                                     false);
1041         }
1042     }
1043     return;
1044 }
1045 
1046 void Manager::getSensorValues(std::unique_ptr<Status>& occ)
1047 {
1048     static bool tracedError[8] = {0};
1049     const fs::path sensorPath = occ->getHwmonPath();
1050     const uint32_t id = occ->getOccInstanceID();
1051 
1052     if (fs::exists(sensorPath))
1053     {
1054         // Read temperature sensors
1055         readTempSensors(sensorPath, id);
1056 
1057         if (occ->isMasterOcc())
1058         {
1059             // Read power sensors
1060             readPowerSensors(sensorPath, id);
1061         }
1062         tracedError[id] = false;
1063     }
1064     else
1065     {
1066         if (!tracedError[id])
1067         {
1068             log<level::ERR>(
1069                 fmt::format(
1070                     "Manager::getSensorValues: OCC{} sensor path missing: {}",
1071                     id, sensorPath.c_str())
1072                     .c_str());
1073             tracedError[id] = true;
1074         }
1075     }
1076 
1077     return;
1078 }
1079 #endif
1080 
1081 // Read the altitude from DBus
1082 void Manager::readAltitude()
1083 {
1084     static bool traceAltitudeErr = true;
1085 
1086     utils::PropertyValue altitudeProperty{};
1087     try
1088     {
1089         altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE,
1090                                               ALTITUDE_PROP);
1091         auto sensorVal = std::get<double>(altitudeProperty);
1092         if (sensorVal < 0xFFFF)
1093         {
1094             if (sensorVal < 0)
1095             {
1096                 altitude = 0;
1097             }
1098             else
1099             {
1100                 // Round to nearest meter
1101                 altitude = uint16_t(sensorVal + 0.5);
1102             }
1103             log<level::DEBUG>(fmt::format("readAltitude: sensor={} ({}m)",
1104                                           sensorVal, altitude)
1105                                   .c_str());
1106             traceAltitudeErr = true;
1107         }
1108         else
1109         {
1110             if (traceAltitudeErr)
1111             {
1112                 traceAltitudeErr = false;
1113                 log<level::DEBUG>(
1114                     fmt::format("Invalid altitude value: {}", sensorVal)
1115                         .c_str());
1116             }
1117         }
1118     }
1119     catch (const sdbusplus::exception_t& e)
1120     {
1121         if (traceAltitudeErr)
1122         {
1123             traceAltitudeErr = false;
1124             log<level::INFO>(
1125                 fmt::format("Unable to read Altitude: {}", e.what()).c_str());
1126         }
1127         altitude = 0xFFFF; // not available
1128     }
1129 }
1130 
1131 // Callback function when ambient temperature changes
1132 void Manager::ambientCallback(sdbusplus::message_t& msg)
1133 {
1134     double currentTemp = 0;
1135     uint8_t truncatedTemp = 0xFF;
1136     std::string msgSensor;
1137     std::map<std::string, std::variant<double>> msgData;
1138     msg.read(msgSensor, msgData);
1139 
1140     auto valPropMap = msgData.find(AMBIENT_PROP);
1141     if (valPropMap == msgData.end())
1142     {
1143         log<level::DEBUG>("ambientCallback: Unknown ambient property changed");
1144         return;
1145     }
1146     currentTemp = std::get<double>(valPropMap->second);
1147     if (std::isnan(currentTemp))
1148     {
1149         truncatedTemp = 0xFF;
1150     }
1151     else
1152     {
1153         if (currentTemp < 0)
1154         {
1155             truncatedTemp = 0;
1156         }
1157         else
1158         {
1159             // Round to nearest degree C
1160             truncatedTemp = uint8_t(currentTemp + 0.5);
1161         }
1162     }
1163 
1164     // If ambient changes, notify OCCs
1165     if (truncatedTemp != ambient)
1166     {
1167         log<level::DEBUG>(
1168             fmt::format("ambientCallback: Ambient change from {} to {}C",
1169                         ambient, currentTemp)
1170                 .c_str());
1171 
1172         ambient = truncatedTemp;
1173         if (altitude == 0xFFFF)
1174         {
1175             // No altitude yet, try reading again
1176             readAltitude();
1177         }
1178 
1179         log<level::DEBUG>(
1180             fmt::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient,
1181                         altitude)
1182                 .c_str());
1183 #ifdef POWER10
1184         // Send ambient and altitude to all OCCs
1185         for (auto& obj : statusObjects)
1186         {
1187             if (obj->occActive())
1188             {
1189                 obj->sendAmbient(ambient, altitude);
1190             }
1191         }
1192 #endif // POWER10
1193     }
1194 }
1195 
1196 // return the current ambient and altitude readings
1197 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1198                              uint16_t& altitudeValue) const
1199 {
1200     ambientValid = true;
1201     ambientTemp = ambient;
1202     altitudeValue = altitude;
1203 
1204     if (ambient == 0xFF)
1205     {
1206         ambientValid = false;
1207     }
1208 }
1209 
1210 #ifdef POWER10
1211 // Called when waitForAllOccsTimer expires
1212 // After the first OCC goes active, this timer will be started (60 seconds)
1213 void Manager::occsNotAllRunning()
1214 {
1215     if (activeCount != statusObjects.size())
1216     {
1217         // Not all OCCs went active
1218         log<level::WARNING>(
1219             fmt::format(
1220                 "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})",
1221                 activeCount, statusObjects.size())
1222                 .c_str());
1223         // Procs may be garded, so may be expected
1224     }
1225 
1226     validateOccMaster();
1227 }
1228 #endif // POWER10
1229 
1230 // Verify single master OCC and start presence monitor
1231 void Manager::validateOccMaster()
1232 {
1233     int masterInstance = -1;
1234     for (auto& obj : statusObjects)
1235     {
1236         auto instance = obj->getOccInstanceID();
1237 #ifdef POWER10
1238         if (!obj->occActive())
1239         {
1240             if (utils::isHostRunning())
1241             {
1242                 // Check if sensor was queued while waiting for discovery
1243                 auto match = queuedActiveState.find(instance);
1244                 if (match != queuedActiveState.end())
1245                 {
1246                     queuedActiveState.erase(match);
1247                     log<level::INFO>(
1248                         fmt::format(
1249                             "validateOccMaster: OCC{} is ACTIVE (queued)",
1250                             instance)
1251                             .c_str());
1252                     obj->occActive(true);
1253                 }
1254                 else
1255                 {
1256                     // OCC does not appear to be active yet, check active sensor
1257                     pldmHandle->checkActiveSensor(instance);
1258                     if (obj->occActive())
1259                     {
1260                         log<level::INFO>(
1261                             fmt::format(
1262                                 "validateOccMaster: OCC{} is ACTIVE after reading sensor",
1263                                 instance)
1264                                 .c_str());
1265                     }
1266                 }
1267             }
1268             else
1269             {
1270                 log<level::WARNING>(
1271                     fmt::format(
1272                         "validateOccMaster: HOST is not running (OCC{})",
1273                         instance)
1274                         .c_str());
1275                 return;
1276             }
1277         }
1278 #endif // POWER10
1279 
1280         if (obj->isMasterOcc())
1281         {
1282             obj->addPresenceWatchMaster();
1283 
1284             if (masterInstance == -1)
1285             {
1286                 masterInstance = instance;
1287             }
1288             else
1289             {
1290                 log<level::ERR>(
1291                     fmt::format(
1292                         "validateOccMaster: Multiple OCC masters! ({} and {})",
1293                         masterInstance, instance)
1294                         .c_str());
1295                 // request reset
1296                 obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
1297             }
1298         }
1299     }
1300 
1301     if (masterInstance < 0)
1302     {
1303         log<level::ERR>(
1304             fmt::format("validateOccMaster: Master OCC not found! (of {} OCCs)",
1305                         statusObjects.size())
1306                 .c_str());
1307         // request reset
1308         statusObjects.front()->deviceError(
1309             Error::Descriptor(PRESENCE_ERROR_PATH));
1310     }
1311     else
1312     {
1313         log<level::INFO>(
1314             fmt::format("validateOccMaster: OCC{} is master of {} OCCs",
1315                         masterInstance, activeCount)
1316                 .c_str());
1317 #ifdef POWER10
1318         pmode->updateDbusSafeMode(false);
1319 #endif
1320     }
1321 }
1322 
1323 void Manager::updatePcapBounds() const
1324 {
1325     if (pcap)
1326     {
1327         pcap->updatePcapBounds();
1328     }
1329 }
1330 
1331 } // namespace occ
1332 } // namespace open_power
1333