1 #include "config.h"
2 
3 #include "occ_manager.hpp"
4 
5 #include "i2c_occ.hpp"
6 #include "occ_dbus.hpp"
7 #include "utils.hpp"
8 
9 #include <phosphor-logging/elog-errors.hpp>
10 #include <phosphor-logging/log.hpp>
11 #include <xyz/openbmc_project/Common/error.hpp>
12 
13 #include <chrono>
14 #include <cmath>
15 #include <filesystem>
16 #include <fstream>
17 #include <regex>
18 
19 namespace open_power
20 {
21 namespace occ
22 {
23 
// fru_type value for which readTempSensors() skips a DIMM temperature sensor
// (that code notes there is no _input file to read in this case).
constexpr uint32_t fruTypeNotAvailable = 0xFF;
// Suffixes that readTempSensors() appends to a sensor's sysfs file prefix
// (the label file path with the trailing "label" removed) to name the file
// holding the corresponding attribute.
constexpr auto fruTypeSuffix = "fru_type";
constexpr auto faultSuffix = "fault";
constexpr auto inputSuffix = "input";
constexpr auto maxSuffix = "max";

// findAndCreateObjects() (POWER10 builds) only searches for OCCs while this
// file does not exist; while it exists the search is retried later.
const auto HOST_ON_FILE = "/run/openbmc/host@0-on";
31 
32 using namespace phosphor::logging;
33 using namespace std::literals::chrono_literals;
34 
/**
 * @brief Read a single value of type T from the start of a file.
 *
 * @tparam T  Type extracted with operator>> (for example uint32_t or double)
 * @param[in] path  File to read
 *
 * @return The value parsed from the file
 *
 * @throws std::system_error if the file cannot be opened, read or parsed.
 *         The error code is the errno left by the failing operation, or EIO
 *         when no OS error was recorded (for example unparsable content).
 */
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    // Reaching end-of-file directly after a successfully parsed value is not
    // an error (the file may simply lack a trailing newline), so eofbit is
    // deliberately not part of the exception mask.  An empty or unparsable
    // file still raises failbit.
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
    T data{};

    // Clear errno first so a failure below cannot report a stale code left
    // behind by some earlier, unrelated call.
    errno = 0;

    try
    {
        ifs.open(path);
        ifs >> data;
        ifs.close();
    }
    catch (const std::exception&)
    {
        const auto err = errno;
        // Never throw a system_error that carries code 0 ("Success").
        throw std::system_error((err != 0) ? err : EIO,
                                std::generic_category());
    }

    return data;
}
57 
/**
 * Find the OCCs in the system and create the objects that represent them.
 *
 * Without POWER10: calls createObjects() once for every id in [0, MAX_CPUS).
 *
 * With POWER10: creates the PowerMode object if it does not exist yet and
 * then, as long as HOST_ON_FILE does not exist, searches for OCC devices with
 * findOCCsInDev().  Objects are only created once a search returns a
 * non-empty result whose size equals that of the previous search; otherwise
 * discoverTimer is re-armed (presumably so that this function runs again -
 * the timer callback is not defined in this function).  Once the objects
 * exist, checkAllActiveSensors() is called as soon as the host is running.
 */
void Manager::findAndCreateObjects()
{
#ifndef POWER10
    for (auto id = 0; id < MAX_CPUS; ++id)
    {
        // Create one occ per cpu
        auto occ = std::string(OCC_NAME) + std::to_string(id);
        createObjects(occ);
    }
#else
    if (!pmode)
    {
        // Create the power mode object
        pmode = std::make_unique<powermode::PowerMode>(
            *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
    }

    if (!fs::exists(HOST_ON_FILE))
    {
        // Function-local static: remembers across calls that the status
        // objects have already been created so they are only created once.
        static bool statusObjCreated = false;
        if (!statusObjCreated)
        {
            // Create the OCCs based on the /dev/occX devices
            auto occs = findOCCsInDev();

            if (occs.empty() || (prevOCCSearch.size() != occs.size()))
            {
                // Something changed or no OCCs yet, try again in 10s.
                // Note on the first pass prevOCCSearch will be empty,
                // so there will be at least one delay to give things
                // a chance to settle.
                prevOCCSearch = occs;

                log<level::INFO>(
                    std::format(
                        "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})",
                        occs.size())
                        .c_str());

                discoverTimer->restartOnce(10s);
            }
            else
            {
                // All OCCs appear to be available, create status objects

                // createObjects requires OCC0 first.
                std::sort(occs.begin(), occs.end());

                log<level::INFO>(
                    std::format(
                        "Manager::findAndCreateObjects(): Creating {} OCC Status Objects",
                        occs.size())
                        .c_str());
                for (auto id : occs)
                {
                    createObjects(std::string(OCC_NAME) + std::to_string(id));
                }
                statusObjCreated = true;
                waitingForAllOccActiveSensors = true;

                // Find/update the processor path associated with each OCC
                for (auto& obj : statusObjects)
                {
                    obj->updateProcAssociation();
                }
            }
        }

        if (statusObjCreated && waitingForAllOccActiveSensors)
        {
            // Function-local static: ensures the "waiting for host" and
            // "host is running" traces are each logged only once per
            // transition rather than on every call.
            static bool tracedHostWait = false;
            if (utils::isHostRunning())
            {
                if (tracedHostWait)
                {
                    log<level::INFO>(
                        "Manager::findAndCreateObjects(): Host is running");
                    tracedHostWait = false;
                }
                checkAllActiveSensors();
            }
            else
            {
                if (!tracedHostWait)
                {
                    log<level::INFO>(
                        "Manager::findAndCreateObjects(): Waiting for host to start");
                    tracedHostWait = true;
                }
                // Host is not running yet, check again in 30s
                discoverTimer->restartOnce(30s);
            }
        }
    }
    else
    {
        // HOST_ON_FILE still exists: do not search yet, retry in 10s
        log<level::INFO>(
            std::format(
                "Manager::findAndCreateObjects(): Waiting for {} to complete...",
                HOST_ON_FILE)
                .c_str());
        discoverTimer->restartOnce(10s);
    }
#endif
}
162 
163 #ifdef POWER10
// Check if all occActive sensors are available.
//
// While the host is running, every status object that is neither active nor
// has received its PLDM sensor is examined: if an active state was queued for
// that instance in queuedActiveState the object is marked active, otherwise
// the scan stops at that object and (with PLDM) its sensor is requested.
// When nothing is missing, the discovery timer is disabled and
// waitingForAllOccActiveSensors is cleared; otherwise the discovery timer is
// re-armed for 10s.
void Manager::checkAllActiveSensors()
{
    // Function-local statics: state is kept between calls.
    // NOTE: allActiveSensorAvailable is only recomputed while the host is
    // running; when the host is not running the value left by the previous
    // call is what the check further below sees.
    static bool allActiveSensorAvailable = false;
    static bool tracedSensorWait = false;
    static bool waitingForHost = false;

    if (open_power::occ::utils::isHostRunning())
    {
        if (waitingForHost)
        {
            waitingForHost = false;
            log<level::INFO>("checkAllActiveSensors(): Host is now running");
        }

        // Start with the assumption that all are available
        allActiveSensorAvailable = true;
        for (auto& obj : statusObjects)
        {
            if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
            {
                auto instance = obj->getOccInstanceID();
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    log<level::INFO>(
                        std::format(
                            "checkAllActiveSensors(): OCC{} is ACTIVE (queued)",
                            instance)
                            .c_str());
                    obj->occActive(true);
                }
                else
                {
                    allActiveSensorAvailable = false;
                    if (!tracedSensorWait)
                    {
                        log<level::INFO>(
                            std::format(
                                "checkAllActiveSensors(): Waiting on OCC{} Active sensor",
                                instance)
                                .c_str());
                        tracedSensorWait = true;
                        // Make sure traces are not throttled
#ifdef PLDM
                        pldmHandle->setTraceThrottle(false);
                        // Start timer to throttle pldm traces when timer
                        // expires
                        throttleTraceTimer->restartOnce(5min);
#endif
                    }
#ifdef PLDM
                    pldmHandle->checkActiveSensor(obj->getOccInstanceID());
#endif
                    // Stop at the first missing sensor; the remaining
                    // objects are examined on a later call.
                    break;
                }
            }
        }
    }
    else
    {
        if (!waitingForHost)
        {
            waitingForHost = true;
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for host to start");
        }
    }

    if (allActiveSensorAvailable)
    {
        // All sensors were found, disable the discovery timer
        if (discoverTimer->isEnabled())
        {
            discoverTimer->setEnabled(false);
        }
#ifdef PLDM
        if (throttleTraceTimer->isEnabled())
        {
            // Disable throttle timer and make sure traces are not throttled
            throttleTraceTimer->setEnabled(false);
            pldmHandle->setTraceThrottle(false);
        }
#endif

        if (waitingForAllOccActiveSensors)
        {
            log<level::INFO>(
                "checkAllActiveSensors(): OCC Active sensors are available");
            waitingForAllOccActiveSensors = false;
        }
        // Any queued states that were not consumed above are discarded
        queuedActiveState.clear();
        tracedSensorWait = false;
    }
    else
    {
        // Not all sensors were available, so keep waiting
        if (!tracedSensorWait)
        {
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
            tracedSensorWait = true;
        }
        discoverTimer->restartOnce(10s);
    }
}
272 #endif
273 
274 std::vector<int> Manager::findOCCsInDev()
275 {
276     std::vector<int> occs;
277     std::regex expr{R"(occ(\d+)$)"};
278 
279     for (auto& file : fs::directory_iterator("/dev"))
280     {
281         std::smatch match;
282         std::string path{file.path().string()};
283         if (std::regex_search(path, match, expr))
284         {
285             auto num = std::stoi(match[1].str());
286 
287             // /dev numbering starts at 1, ours starts at 0.
288             occs.push_back(num - 1);
289         }
290     }
291 
292     return occs;
293 }
294 
295 int Manager::cpuCreated(sdbusplus::message_t& msg)
296 {
297     namespace fs = std::filesystem;
298 
299     sdbusplus::message::object_path o;
300     msg.read(o);
301     fs::path cpuPath(std::string(std::move(o)));
302 
303     auto name = cpuPath.filename().string();
304     auto index = name.find(CPU_NAME);
305     name.replace(index, std::strlen(CPU_NAME), OCC_NAME);
306 
307     createObjects(name);
308 
309     return 0;
310 }
311 
312 void Manager::createObjects(const std::string& occ)
313 {
314     auto path = fs::path(OCC_CONTROL_ROOT) / occ;
315 
316     statusObjects.emplace_back(std::make_unique<Status>(
317         event, path.c_str(), *this,
318 #ifdef POWER10
319         pmode,
320 #endif
321         std::bind(std::mem_fn(&Manager::statusCallBack), this,
322                   std::placeholders::_1, std::placeholders::_2)
323 #ifdef PLDM
324             ,
325         std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(),
326                   std::placeholders::_1)
327 #endif
328             ));
329 
330     // Create the power cap monitor object
331     if (!pcap)
332     {
333         pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
334             *statusObjects.back());
335     }
336 
337     if (statusObjects.back()->isMasterOcc())
338     {
339         log<level::INFO>(
340             std::format("Manager::createObjects(): OCC{} is the master",
341                         statusObjects.back()->getOccInstanceID())
342                 .c_str());
343         _pollTimer->setEnabled(false);
344 
345 #ifdef POWER10
346         // Set the master OCC on the PowerMode object
347         pmode->setMasterOcc(path);
348 #endif
349     }
350 
351     passThroughObjects.emplace_back(std::make_unique<PassThrough>(path.c_str()
352 #ifdef POWER10
353                                                                       ,
354                                                                   pmode
355 #endif
356                                                                   ));
357 }
358 
359 void Manager::statusCallBack(instanceID instance, bool status)
360 {
361     if (status == true)
362     {
363         // OCC went active
364         ++activeCount;
365 
366 #ifdef POWER10
367         if (activeCount == 1)
368         {
369             // First OCC went active (allow some time for all OCCs to go active)
370             waitForAllOccsTimer->restartOnce(60s);
371         }
372 #endif
373 
374         if (activeCount == statusObjects.size())
375         {
376 #ifdef POWER10
377             // All OCCs are now running
378             if (waitForAllOccsTimer->isEnabled())
379             {
380                 // stop occ wait timer
381                 waitForAllOccsTimer->setEnabled(false);
382             }
383 #endif
384 
385             // Verify master OCC and start presence monitor
386             validateOccMaster();
387         }
388 
389         // Start poll timer if not already started
390         if (!_pollTimer->isEnabled())
391         {
392             log<level::INFO>(
393                 std::format("Manager: OCCs will be polled every {} seconds",
394                             pollInterval)
395                     .c_str());
396 
397             // Send poll and start OCC poll timer
398             pollerTimerExpired();
399         }
400     }
401     else
402     {
403         // OCC went away
404         if (activeCount > 0)
405         {
406             --activeCount;
407         }
408         else
409         {
410             log<level::ERR>(
411                 std::format("OCC{} disabled, but currently no active OCCs",
412                             instance)
413                     .c_str());
414         }
415 
416         if (activeCount == 0)
417         {
418             // No OCCs are running
419 
420             // Stop OCC poll timer
421             if (_pollTimer->isEnabled())
422             {
423                 log<level::INFO>(
424                     "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
425                 _pollTimer->setEnabled(false);
426             }
427 
428 #ifdef POWER10
429             // stop wait timer
430             if (waitForAllOccsTimer->isEnabled())
431             {
432                 waitForAllOccsTimer->setEnabled(false);
433             }
434 #endif
435         }
436 #ifdef READ_OCC_SENSORS
437         // Clear OCC sensors
438         setSensorValueToNaN(instance);
439 #endif
440     }
441 
442 #ifdef POWER10
443     if (waitingForAllOccActiveSensors)
444     {
445         if (utils::isHostRunning())
446         {
447             checkAllActiveSensors();
448         }
449     }
450 #endif
451 }
452 
453 #ifdef I2C_OCC
454 void Manager::initStatusObjects()
455 {
456     // Make sure we have a valid path string
457     static_assert(sizeof(DEV_PATH) != 0);
458 
459     auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
460     for (auto& name : deviceNames)
461     {
462         i2c_occ::i2cToDbus(name);
463         name = std::string(OCC_NAME) + '_' + name;
464         auto path = fs::path(OCC_CONTROL_ROOT) / name;
465         statusObjects.emplace_back(
466             std::make_unique<Status>(event, path.c_str(), *this));
467     }
468     // The first device is master occ
469     pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
470         *statusObjects.front());
471 #ifdef POWER10
472     pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
473                                                    powermode::PIPS_PATH);
474     // Set the master OCC on the PowerMode object
475     pmode->setMasterOcc(path);
476 #endif
477 }
478 #endif
479 
480 #ifdef PLDM
481 void Manager::sbeTimeout(unsigned int instance)
482 {
483     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
484                             [instance](const auto& obj) {
485         return instance == obj->getOccInstanceID();
486     });
487 
488     if (obj != statusObjects.end() && (*obj)->occActive())
489     {
490         log<level::INFO>(
491             std::format("SBE timeout, requesting HRESET (OCC{})", instance)
492                 .c_str());
493 
494         setSBEState(instance, SBE_STATE_NOT_USABLE);
495 
496         pldmHandle->sendHRESET(instance);
497     }
498 }
499 
500 bool Manager::updateOCCActive(instanceID instance, bool status)
501 {
502     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
503                             [instance](const auto& obj) {
504         return instance == obj->getOccInstanceID();
505     });
506 
507     const bool hostRunning = open_power::occ::utils::isHostRunning();
508     if (obj != statusObjects.end())
509     {
510         if (!hostRunning && (status == true))
511         {
512             log<level::WARNING>(
513                 std::format(
514                     "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received",
515                     instance, status)
516                     .c_str());
517             (*obj)->setPldmSensorReceived(false);
518             if (!waitingForAllOccActiveSensors)
519             {
520                 log<level::INFO>(
521                     "updateOCCActive: Waiting for Host and all OCC Active Sensors");
522                 waitingForAllOccActiveSensors = true;
523             }
524 #ifdef POWER10
525             discoverTimer->restartOnce(30s);
526 #endif
527             return false;
528         }
529         else
530         {
531             log<level::INFO>(std::format("updateOCCActive: OCC{} active={}",
532                                          instance, status)
533                                  .c_str());
534             (*obj)->setPldmSensorReceived(true);
535             return (*obj)->occActive(status);
536         }
537     }
538     else
539     {
540         if (hostRunning)
541         {
542             log<level::WARNING>(
543                 std::format(
544                     "updateOCCActive: No status object to update for OCC{} (active={})",
545                     instance, status)
546                     .c_str());
547         }
548         else
549         {
550             if (status == true)
551             {
552                 log<level::WARNING>(
553                     std::format(
554                         "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})",
555                         instance, status)
556                         .c_str());
557             }
558         }
559         if (status == true)
560         {
561             // OCC went active
562             queuedActiveState.insert(instance);
563         }
564         else
565         {
566             auto match = queuedActiveState.find(instance);
567             if (match != queuedActiveState.end())
568             {
569                 // OCC was disabled
570                 queuedActiveState.erase(match);
571             }
572         }
573         return false;
574     }
575 }
576 
577 // Called upon pldm event To set powermode Safe Mode State for system.
578 void Manager::updateOccSafeMode(bool safeMode)
579 {
580 #ifdef POWER10
581     pmode->updateDbusSafeMode(safeMode);
582 #endif
583     // Update the processor throttle status on dbus
584     for (auto& obj : statusObjects)
585     {
586         obj->updateThrottle(safeMode, THROTTLED_SAFE);
587     }
588 }
589 
590 void Manager::sbeHRESETResult(instanceID instance, bool success)
591 {
592     if (success)
593     {
594         log<level::INFO>(
595             std::format("HRESET succeeded (OCC{})", instance).c_str());
596 
597         setSBEState(instance, SBE_STATE_BOOTED);
598 
599         return;
600     }
601 
602     setSBEState(instance, SBE_STATE_FAILED);
603 
604     if (sbeCanDump(instance))
605     {
606         log<level::INFO>(
607             std::format("HRESET failed (OCC{}), triggering SBE dump", instance)
608                 .c_str());
609 
610         auto& bus = utils::getBus();
611         uint32_t src6 = instance << 16;
612         uint32_t logId =
613             FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
614                             src6, "SBE command timeout");
615 
616         try
617         {
618             constexpr auto path = "/org/openpower/dump";
619             constexpr auto interface = "xyz.openbmc_project.Dump.Create";
620             constexpr auto function = "CreateDump";
621 
622             std::string service = utils::getService(path, interface);
623             auto method = bus.new_method_call(service.c_str(), path, interface,
624                                               function);
625 
626             std::map<std::string, std::variant<std::string, uint64_t>>
627                 createParams{
628                     {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
629                      uint64_t(logId)},
630                     {"com.ibm.Dump.Create.CreateParameters.DumpType",
631                      "com.ibm.Dump.Create.DumpType.SBE"},
632                     {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
633                      uint64_t(instance)},
634                 };
635 
636             method.append(createParams);
637 
638             auto response = bus.call(method);
639         }
640         catch (const sdbusplus::exception_t& e)
641         {
642             constexpr auto ERROR_DUMP_DISABLED =
643                 "xyz.openbmc_project.Dump.Create.Error.Disabled";
644             if (e.name() == ERROR_DUMP_DISABLED)
645             {
646                 log<level::INFO>("Dump is disabled, skipping");
647             }
648             else
649             {
650                 log<level::ERR>("Dump failed");
651             }
652         }
653     }
654 }
655 
656 bool Manager::sbeCanDump(unsigned int instance)
657 {
658     struct pdbg_target* proc = getPdbgTarget(instance);
659 
660     if (!proc)
661     {
662         // allow the dump in the error case
663         return true;
664     }
665 
666     try
667     {
668         if (!openpower::phal::sbe::isDumpAllowed(proc))
669         {
670             return false;
671         }
672 
673         if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
674         {
675             return false;
676         }
677     }
678     catch (openpower::phal::exception::SbeError& e)
679     {
680         log<level::INFO>("Failed to query SBE state");
681     }
682 
683     // allow the dump in the error case
684     return true;
685 }
686 
687 void Manager::setSBEState(unsigned int instance, enum sbe_state state)
688 {
689     struct pdbg_target* proc = getPdbgTarget(instance);
690 
691     if (!proc)
692     {
693         return;
694     }
695 
696     try
697     {
698         openpower::phal::sbe::setState(proc, state);
699     }
700     catch (const openpower::phal::exception::SbeError& e)
701     {
702         log<level::ERR>("Failed to set SBE state");
703     }
704 }
705 
706 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
707 {
708     if (!pdbgInitialized)
709     {
710         try
711         {
712             openpower::phal::pdbg::init();
713             pdbgInitialized = true;
714         }
715         catch (const openpower::phal::exception::PdbgError& e)
716         {
717             log<level::ERR>("pdbg initialization failed");
718             return nullptr;
719         }
720     }
721 
722     struct pdbg_target* proc = nullptr;
723     pdbg_for_each_class_target("proc", proc)
724     {
725         if (pdbg_target_index(proc) == instance)
726         {
727             return proc;
728         }
729     }
730 
731     log<level::ERR>("Failed to get pdbg target");
732     return nullptr;
733 }
734 #endif
735 
736 void Manager::pollerTimerExpired()
737 {
738     if (!_pollTimer)
739     {
740         log<level::ERR>(
741             "Manager::pollerTimerExpired() ERROR: Timer not defined");
742         return;
743     }
744 
745     for (auto& obj : statusObjects)
746     {
747         if (!obj->occActive())
748         {
749             // OCC is not running yet
750 #ifdef READ_OCC_SENSORS
751             auto id = obj->getOccInstanceID();
752             setSensorValueToNaN(id);
753 #endif
754             continue;
755         }
756 
757         // Read sysfs to force kernel to poll OCC
758         obj->readOccState();
759 
760 #ifdef READ_OCC_SENSORS
761         // Read occ sensor values
762         getSensorValues(obj);
763 #endif
764     }
765 
766     if (activeCount > 0)
767     {
768         // Restart OCC poll timer
769         _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
770     }
771     else
772     {
773         // No OCCs running, so poll timer will not be restarted
774         log<level::INFO>(
775             std::format(
776                 "Manager::pollerTimerExpired: poll timer will not be restarted")
777                 .c_str());
778     }
779 }
780 
781 #ifdef READ_OCC_SENSORS
782 void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
783 {
784     // There may be more than one sensor with the same FRU type
785     // and label so make two passes: the first to read the temps
786     // from sysfs, and the second to put them on D-Bus after
787     // resolving any conflicts.
788     std::map<std::string, double> sensorData;
789 
790     std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
791     for (auto& file : fs::directory_iterator(path))
792     {
793         if (!std::regex_search(file.path().string(), expr))
794         {
795             continue;
796         }
797 
798         uint32_t labelValue{0};
799 
800         try
801         {
802             labelValue = readFile<uint32_t>(file.path());
803         }
804         catch (const std::system_error& e)
805         {
806             log<level::DEBUG>(
807                 std::format("readTempSensors: Failed reading {}, errno = {}",
808                             file.path().string(), e.code().value())
809                     .c_str());
810             continue;
811         }
812 
813         const std::string& tempLabel = "label";
814         const std::string filePathString = file.path().string().substr(
815             0, file.path().string().length() - tempLabel.length());
816 
817         uint32_t fruTypeValue{0};
818         try
819         {
820             fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
821         }
822         catch (const std::system_error& e)
823         {
824             log<level::DEBUG>(
825                 std::format("readTempSensors: Failed reading {}, errno = {}",
826                             filePathString + fruTypeSuffix, e.code().value())
827                     .c_str());
828             continue;
829         }
830 
831         std::string sensorPath = OCC_SENSORS_ROOT +
832                                  std::string("/temperature/");
833 
834         std::string dvfsTempPath;
835 
836         if (fruTypeValue == VRMVdd)
837         {
838             sensorPath.append("vrm_vdd" + std::to_string(occInstance) +
839                               "_temp");
840         }
841         else if (fruTypeValue == processorIoRing)
842         {
843             sensorPath.append("proc" + std::to_string(occInstance) +
844                               "_ioring_temp");
845             dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
846                            std::to_string(occInstance) + "_ioring_dvfs_temp";
847         }
848         else
849         {
850             uint16_t type = (labelValue & 0xFF000000) >> 24;
851             uint16_t instanceID = labelValue & 0x0000FFFF;
852 
853             if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
854             {
855                 if (fruTypeValue == fruTypeNotAvailable)
856                 {
857                     // Not all DIMM related temps are available to read
858                     // (no _input file in this case)
859                     continue;
860                 }
861                 auto iter = dimmTempSensorName.find(fruTypeValue);
862                 if (iter == dimmTempSensorName.end())
863                 {
864                     log<level::ERR>(
865                         std::format(
866                             "readTempSensors: Fru type error! fruTypeValue = {}) ",
867                             fruTypeValue)
868                             .c_str());
869                     continue;
870                 }
871 
872                 sensorPath.append("dimm" + std::to_string(instanceID) +
873                                   iter->second);
874 
875                 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
876                                dimmDVFSSensorName.at(fruTypeValue);
877             }
878             else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
879             {
880                 if (fruTypeValue == processorCore)
881                 {
882                     // The OCC reports small core temps, of which there are
883                     // two per big core.  All current P10 systems are in big
884                     // core mode, so use a big core name.
885                     uint16_t coreNum = instanceID / 2;
886                     uint16_t tempNum = instanceID % 2;
887                     sensorPath.append("proc" + std::to_string(occInstance) +
888                                       "_core" + std::to_string(coreNum) + "_" +
889                                       std::to_string(tempNum) + "_temp");
890 
891                     dvfsTempPath =
892                         std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
893                         std::to_string(occInstance) + "_core_dvfs_temp";
894                 }
895                 else
896                 {
897                     continue;
898                 }
899             }
900             else
901             {
902                 continue;
903             }
904         }
905 
906         // The dvfs temp file only needs to be read once per chip per type.
907         if (!dvfsTempPath.empty() &&
908             !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
909         {
910             try
911             {
912                 auto dvfsValue = readFile<double>(filePathString + maxSuffix);
913 
914                 dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
915                     dvfsTempPath, dvfsValue * std::pow(10, -3));
916             }
917             catch (const std::system_error& e)
918             {
919                 log<level::DEBUG>(
920                     std::format(
921                         "readTempSensors: Failed reading {}, errno = {}",
922                         filePathString + maxSuffix, e.code().value())
923                         .c_str());
924             }
925         }
926 
927         uint32_t faultValue{0};
928         try
929         {
930             faultValue = readFile<uint32_t>(filePathString + faultSuffix);
931         }
932         catch (const std::system_error& e)
933         {
934             log<level::DEBUG>(
935                 std::format("readTempSensors: Failed reading {}, errno = {}",
936                             filePathString + faultSuffix, e.code().value())
937                     .c_str());
938             continue;
939         }
940 
941         double tempValue{0};
942         // NOTE: if OCC sends back 0xFF, kernal sets this fault value to 1.
943         if (faultValue != 0)
944         {
945             tempValue = std::numeric_limits<double>::quiet_NaN();
946         }
947         else
948         {
949             // Read the temperature
950             try
951             {
952                 tempValue = readFile<double>(filePathString + inputSuffix);
953             }
954             catch (const std::system_error& e)
955             {
956                 log<level::DEBUG>(
957                     std::format(
958                         "readTempSensors: Failed reading {}, errno = {}",
959                         filePathString + inputSuffix, e.code().value())
960                         .c_str());
961 
962                 // if errno == EAGAIN(Resource temporarily unavailable) then set
963                 // temp to 0, to avoid using old temp, and affecting FAN
964                 // Control.
965                 if (e.code().value() == EAGAIN)
966                 {
967                     tempValue = 0;
968                 }
969                 // else the errno would be something like
970                 //     EBADF(Bad file descriptor)
971                 // or ENOENT(No such file or directory)
972                 else
973                 {
974                     continue;
975                 }
976             }
977         }
978 
        // If this object path already has a value, only overwrite
        // it if the previous one was a NaN or a smaller value.
981         auto existing = sensorData.find(sensorPath);
982         if (existing != sensorData.end())
983         {
984             // Multiple sensors found for this FRU type
985             if ((std::isnan(existing->second) && (tempValue == 0)) ||
986                 ((existing->second == 0) && std::isnan(tempValue)))
987             {
988                 // One of the redundant sensors has failed (0xFF/nan), and the
989                 // other sensor has no reading (0), so set the FRU to NaN to
990                 // force fan increase
991                 tempValue = std::numeric_limits<double>::quiet_NaN();
992                 existing->second = tempValue;
993             }
994             if (std::isnan(existing->second) || (tempValue > existing->second))
995             {
996                 existing->second = tempValue;
997             }
998         }
999         else
1000         {
1001             // First sensor for this FRU type
1002             sensorData[sensorPath] = tempValue;
1003         }
1004     }
1005 
1006     // Now publish the values on D-Bus.
1007     for (const auto& [objectPath, value] : sensorData)
1008     {
1009         dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
1010                                                     value * std::pow(10, -3));
1011 
1012         dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1013             objectPath, !std::isnan(value));
1014 
1015         if (existingSensors.find(objectPath) == existingSensors.end())
1016         {
1017             dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1018                 objectPath);
1019         }
1020 
1021         existingSensors[objectPath] = occInstance;
1022     }
1023 }
1024 
1025 std::optional<std::string>
1026     Manager::getPowerLabelFunctionID(const std::string& value)
1027 {
1028     // If the value is "system", then the FunctionID is "system".
1029     if (value == "system")
1030     {
1031         return value;
1032     }
1033 
1034     // If the value is not "system", then the label value have 3 numbers, of
1035     // which we only care about the middle one:
1036     // <sensor id>_<function id>_<apss channel>
1037     // eg: The value is "0_10_5" , then the FunctionID is "10".
1038     if (value.find("_") == std::string::npos)
1039     {
1040         return std::nullopt;
1041     }
1042 
1043     auto powerLabelValue = value.substr((value.find("_") + 1));
1044 
1045     if (powerLabelValue.find("_") == std::string::npos)
1046     {
1047         return std::nullopt;
1048     }
1049 
1050     return powerLabelValue.substr(0, powerLabelValue.find("_"));
1051 }
1052 
/** Publish the power sensors found in an OCC hwmon directory on D-Bus.
 *
 *  Every "power<N>_label" file in the directory is read; labels whose
 *  function ID has an entry in powerSensorName get their matching
 *  "power<N>_input" value published under OCC_SENSORS_ROOT/power/<name>.
 *  Files that cannot be read, or labels that cannot be mapped, are skipped.
 *
 *  @param[in] path - directory holding the power label/input files
 *  @param[in] id   - OCC instance recorded as the owner of each sensor
 */
void Manager::readPowerSensors(const fs::path& path, uint32_t id)
{
    // Example match: power5_label
    const std::regex labelPattern{"power\\d+_label$"};
    // Trailing part of the label file name that gets swapped for inputSuffix
    const std::string labelSuffix{"label"};

    for (const auto& entry : fs::directory_iterator(path))
    {
        const std::string labelFile = entry.path().string();
        if (!std::regex_search(labelFile, labelPattern))
        {
            continue;
        }

        std::string label;
        try
        {
            label = readFile<std::string>(labelFile);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            labelFile, e.code().value())
                    .c_str());
            continue;
        }

        // Only labels that carry a known function ID map to a sensor name
        const auto functionID = getPowerLabelFunctionID(label);
        if (!functionID.has_value())
        {
            continue;
        }

        const auto nameEntry = powerSensorName.find(*functionID);
        if (nameEntry == powerSensorName.end())
        {
            continue;
        }

        std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
        sensorPath.append(nameEntry->second);

        // ".../power5_label" -> ".../power5_" + inputSuffix
        const std::string inputFile =
            labelFile.substr(0, labelFile.length() - labelSuffix.length()) +
            inputSuffix;

        double rawPower{0};
        try
        {
            rawPower = readFile<double>(inputFile);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            inputFile, e.code().value())
                    .c_str());
            continue;
        }

        auto&& occDBus = dbus::OccDBusSensors::getOccDBus();

        occDBus.setUnit(sensorPath,
                        "xyz.openbmc_project.Sensor.Value.Unit.Watts");

        // The raw reading is scaled down by 10^-6 before it is published
        occDBus.setValue(sensorPath,
                         rawPower * std::pow(10, -3) * std::pow(10, -3));

        occDBus.setOperationalStatus(sensorPath, true);

        // The chassis association is only set the first time a sensor is seen
        const bool firstSighting =
            (existingSensors.find(sensorPath) == existingSensors.end());
        if (firstSighting)
        {
            occDBus.setChassisAssociation(sensorPath);
        }

        existingSensors[sensorPath] = id;
    }
}
1130 
1131 void Manager::setSensorValueToNaN(uint32_t id) const
1132 {
1133     for (const auto& [sensorPath, occId] : existingSensors)
1134     {
1135         if (occId == id)
1136         {
1137             dbus::OccDBusSensors::getOccDBus().setValue(
1138                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1139 
1140             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
1141                                                                     true);
1142         }
1143     }
1144     return;
1145 }
1146 
1147 void Manager::setSensorValueToNonFunctional(uint32_t id) const
1148 {
1149     for (const auto& [sensorPath, occId] : existingSensors)
1150     {
1151         if (occId == id)
1152         {
1153             dbus::OccDBusSensors::getOccDBus().setValue(
1154                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1155 
1156             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
1157                                                                     false);
1158         }
1159     }
1160     return;
1161 }
1162 
/** Read the sensors of one OCC and publish them on D-Bus.
 *
 *  Temperature sensors are read for every OCC; power sensors are read only
 *  when the OCC reports itself as master. If the hwmon path does not exist
 *  an error is traced once, and tracing is re-armed as soon as the path
 *  shows up again.
 *
 *  @param[in] occ - status object of the OCC to read
 */
void Manager::getSensorValues(std::unique_ptr<Status>& occ)
{
    // Per-instance "missing path already traced" flags. The final element is
    // a shared overflow slot so that an unexpected instance id can never
    // index past the end of the array.
    constexpr uint32_t maxTrackedOccs = 8;
    static bool tracedError[maxTrackedOccs + 1] = {};

    const fs::path sensorPath = occ->getHwmonPath();
    const uint32_t id = occ->getOccInstanceID();
    const uint32_t traceSlot = (id < maxTrackedOccs) ? id : maxTrackedOccs;

    if (fs::exists(sensorPath))
    {
        // Read temperature sensors
        readTempSensors(sensorPath, id);

        if (occ->isMasterOcc())
        {
            // Read power sensors
            readPowerSensors(sensorPath, id);
        }
        // Path is present again, so allow the next failure to be traced
        tracedError[traceSlot] = false;
    }
    else
    {
        // Trace only the first failure to avoid flooding the journal
        if (!tracedError[traceSlot])
        {
            log<level::ERR>(
                std::format(
                    "Manager::getSensorValues: OCC{} sensor path missing: {}",
                    id, sensorPath.c_str())
                    .c_str());
            tracedError[traceSlot] = true;
        }
    }

    return;
}
1196 #endif
1197 
// Fetch the altitude property from D-Bus and cache it in `altitude`.
//
// Readings below 0xFFFF are stored rounded to the nearest meter, with
// negative readings stored as 0. Any other reading is reported once and the
// cached value is left unchanged. A D-Bus failure sets the cached value to
// 0xFFFF (not available).
void Manager::readAltitude()
{
    // True while the next problem should still be traced; cleared when a
    // problem is logged and re-armed by the next good reading.
    static bool traceAltitudeErr = true;

    utils::PropertyValue altitudeProperty{};
    try
    {
        altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE,
                                              ALTITUDE_PROP);
        const auto reading = std::get<double>(altitudeProperty);

        // Written as a negated "<" so that NaN is rejected as well
        if (!(reading < 0xFFFF))
        {
            if (traceAltitudeErr)
            {
                traceAltitudeErr = false;
                log<level::DEBUG>(
                    std::format("Invalid altitude value: {}", reading)
                        .c_str());
            }
            return;
        }

        // Clamp below sea level to 0, otherwise round to nearest meter
        altitude = (reading < 0) ? uint16_t(0) : uint16_t(reading + 0.5);

        log<level::DEBUG>(
            std::format("readAltitude: sensor={} ({}m)", reading, altitude)
                .c_str());
        traceAltitudeErr = true;
    }
    catch (const sdbusplus::exception_t& e)
    {
        if (traceAltitudeErr)
        {
            traceAltitudeErr = false;
            log<level::INFO>(
                std::format("Unable to read Altitude: {}", e.what()).c_str());
        }
        altitude = 0xFFFF; // not available
    }
}
1247 
// Callback function when ambient temperature changes
//
// Reads the changed properties from the message and, when AMBIENT_PROP is
// among them, converts the new reading to a whole number of degrees C:
//   - NaN, or any reading that would round to 0xFF or above, becomes 0xFF,
//     the value getAmbientData() reports as not valid
//   - negative readings become 0
//   - everything else is rounded to the nearest degree
// If the result differs from the cached ambient value, the cache is updated,
// the altitude is re-read when it is still unavailable (0xFFFF), and on
// POWER10 builds the new ambient/altitude pair is sent to every active OCC.
void Manager::ambientCallback(sdbusplus::message_t& msg)
{
    double currentTemp = 0;
    uint8_t truncatedTemp = 0xFF;
    std::string msgSensor;
    std::map<std::string, std::variant<double>> msgData;
    msg.read(msgSensor, msgData);

    auto valPropMap = msgData.find(AMBIENT_PROP);
    if (valPropMap == msgData.end())
    {
        log<level::DEBUG>("ambientCallback: Unknown ambient property changed");
        return;
    }
    currentTemp = std::get<double>(valPropMap->second);
    if (std::isnan(currentTemp) || ((currentTemp + 0.5) >= 0xFF))
    {
        // Not a number, or too large to fit below the 0xFF marker. Converting
        // an out-of-range double to uint8_t is undefined behaviour, so such
        // readings are reported as not available instead of being cast.
        truncatedTemp = 0xFF;
    }
    else if (currentTemp < 0)
    {
        truncatedTemp = 0;
    }
    else
    {
        // Round to nearest degree C
        truncatedTemp = static_cast<uint8_t>(currentTemp + 0.5);
    }

    // If ambient changes, notify OCCs
    if (truncatedTemp != ambient)
    {
        log<level::DEBUG>(
            std::format("ambientCallback: Ambient change from {} to {}C",
                        ambient, currentTemp)
                .c_str());

        ambient = truncatedTemp;
        if (altitude == 0xFFFF)
        {
            // No altitude yet, try reading again
            readAltitude();
        }

        log<level::DEBUG>(
            std::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient,
                        altitude)
                .c_str());
#ifdef POWER10
        // Send ambient and altitude to all OCCs
        for (auto& obj : statusObjects)
        {
            if (obj->occActive())
            {
                obj->sendAmbient(ambient, altitude);
            }
        }
#endif // POWER10
    }
}
1312 
1313 // return the current ambient and altitude readings
1314 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1315                              uint16_t& altitudeValue) const
1316 {
1317     ambientValid = true;
1318     ambientTemp = ambient;
1319     altitudeValue = altitude;
1320 
1321     if (ambient == 0xFF)
1322     {
1323         ambientValid = false;
1324     }
1325 }
1326 
1327 #ifdef POWER10
// Handler for expiry of waitForAllOccsTimer
// (the timer is started for 60 seconds once the first OCC goes active)
//
// Traces a warning when the active OCC count differs from the number of
// status objects, then validates the master OCC in either case.
void Manager::occsNotAllRunning()
{
    const auto expectedCount = statusObjects.size();
    if (activeCount != expectedCount)
    {
        // Some OCCs never went active.
        // Procs may be garded, so this may be expected.
        log<level::WARNING>(
            std::format(
                "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})",
                activeCount, expectedCount)
                .c_str());
    }

    validateOccMaster();
}
1345 
1346 #ifdef PLDM
1347 // Called when throttleTraceTimer expires.
1348 // If this timer expires, that indicates there is still no confirmed OCC status
1349 //   which will trigger pldm traces to be throttled.
1350 void Manager::throttleTraceExpired()
1351 {
1352     // Throttle traces
1353     pldmHandle->setTraceThrottle(true);
1354 }
1355 #endif // PLDM
1356 #endif // POWER10
1357 
// Verify single master OCC and start presence monitor
//
// Walks every status object. On POWER10 builds an OCC that is not yet active
// is first given a chance to become active (from a queued active state or,
// with PLDM, by checking its active sensor); if the host is not running the
// validation is abandoned. Each OCC that reports itself as master gets a
// presence watch; a second master, or no master at all, triggers a device
// error (reset request). When exactly one master is found the result is
// traced and, on POWER10 builds, safe mode is cleared on D-Bus.
void Manager::validateOccMaster()
{
    int masterInstance = -1;
    for (auto& obj : statusObjects)
    {
        auto instance = obj->getOccInstanceID();
#ifdef POWER10
        if (!obj->occActive())
        {
            if (utils::isHostRunning())
            {
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    log<level::INFO>(
                        std::format(
                            "validateOccMaster: OCC{} is ACTIVE (queued)",
                            instance)
                            .c_str());
                    obj->occActive(true);
                }
                else
                {
                    // OCC does not appear to be active yet, check active sensor
#ifdef PLDM
                    pldmHandle->checkActiveSensor(instance);
#endif
                    if (obj->occActive())
                    {
                        log<level::INFO>(
                            std::format(
                                "validateOccMaster: OCC{} is ACTIVE after reading sensor",
                                instance)
                                .c_str());
                    }
                }
            }
            else
            {
                log<level::WARNING>(
                    std::format(
                        "validateOccMaster: HOST is not running (OCC{})",
                        instance)
                        .c_str());
                return;
            }
        }
#endif // POWER10

        if (obj->isMasterOcc())
        {
            obj->addPresenceWatchMaster();

            if (masterInstance == -1)
            {
                masterInstance = instance;
            }
            else
            {
                log<level::ERR>(
                    std::format(
                        "validateOccMaster: Multiple OCC masters! ({} and {})",
                        masterInstance, instance)
                        .c_str());
                // request reset
                obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
            }
        }
    }

    if (masterInstance < 0)
    {
        log<level::ERR>(
            std::format("validateOccMaster: Master OCC not found! (of {} OCCs)",
                        statusObjects.size())
                .c_str());
        // request reset; front() must not be called on an empty container,
        // so there is nothing to reset when no status objects exist
        if (!statusObjects.empty())
        {
            statusObjects.front()->deviceError(
                Error::Descriptor(PRESENCE_ERROR_PATH));
        }
    }
    else
    {
        log<level::INFO>(
            std::format("validateOccMaster: OCC{} is master of {} OCCs",
                        masterInstance, activeCount)
                .c_str());
#ifdef POWER10
        pmode->updateDbusSafeMode(false);
#endif
    }
}
1452 
1453 void Manager::updatePcapBounds() const
1454 {
1455     if (pcap)
1456     {
1457         pcap->updatePcapBounds();
1458     }
1459 }
1460 
1461 } // namespace occ
1462 } // namespace open_power
1463