1 #include "config.h"
2 
3 #include "occ_manager.hpp"
4 
5 #include "i2c_occ.hpp"
6 #include "occ_dbus.hpp"
7 #include "utils.hpp"
8 
9 #include <phosphor-logging/elog-errors.hpp>
10 #include <phosphor-logging/log.hpp>
11 #include <xyz/openbmc_project/Common/error.hpp>
12 
13 #include <chrono>
14 #include <cmath>
15 #include <filesystem>
16 #include <fstream>
17 #include <regex>
18 
19 namespace open_power
20 {
21 namespace occ
22 {
23 
// fru_type value for which readTempSensors() skips a DIMM temperature
// (no _input file exists for such sensors).
constexpr uint32_t fruTypeNotAvailable = 0xFF;

// Suffixes appended to a sensor's sysfs file prefix (for example "temp5_")
// to form the names of the attribute files read by readTempSensors().
constexpr auto fruTypeSuffix = "fru_type";
constexpr auto faultSuffix = "fault";
constexpr auto inputSuffix = "input";
constexpr auto maxSuffix = "max";

// While this file exists, findAndCreateObjects() postpones OCC discovery.
constexpr auto HOST_ON_FILE = "/run/openbmc/host@0-on";
31 
32 using namespace phosphor::logging;
33 using namespace std::literals::chrono_literals;
34 
/**
 * @brief Read one value of type T from the start of a file.
 *
 * The value is extracted with operator>>. Failing to open the file, failing
 * to extract the value, or reaching end-of-file while reading are all
 * treated as errors.
 *
 * @tparam T - type of the value to extract
 * @param[in] path - file to read
 *
 * @return the extracted value
 *
 * @throws std::system_error (generic category) on any failure. The code is
 *         the errno left by the failing operation, or EIO when the failure
 *         did not set errno (for example unparsable contents).
 */
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit |
                   std::ifstream::eofbit);
    T data;

    // Start from a clean errno so that a stale value left by an unrelated
    // earlier call is never reported as the cause of a failure here.
    errno = 0;

    try
    {
        ifs.open(path);
        ifs >> data;
        ifs.close();
    }
    catch (const std::exception&)
    {
        const auto err = errno;
        // Callers inspect the error code (e.g. EAGAIN), so never report 0.
        throw std::system_error((err != 0) ? err : EIO,
                                std::generic_category());
    }

    return data;
}
57 
58 void Manager::findAndCreateObjects()
59 {
60 #ifndef POWER10
61     for (auto id = 0; id < MAX_CPUS; ++id)
62     {
63         // Create one occ per cpu
64         auto occ = std::string(OCC_NAME) + std::to_string(id);
65         createObjects(occ);
66     }
67 #else
68     if (!pmode)
69     {
70         // Create the power mode object
71         pmode = std::make_unique<powermode::PowerMode>(
72             *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
73     }
74 
75     if (!fs::exists(HOST_ON_FILE))
76     {
77         static bool statusObjCreated = false;
78         if (!statusObjCreated)
79         {
80             // Create the OCCs based on on the /dev/occX devices
81             auto occs = findOCCsInDev();
82 
83             if (occs.empty() || (prevOCCSearch.size() != occs.size()))
84             {
85                 // Something changed or no OCCs yet, try again in 10s.
86                 // Note on the first pass prevOCCSearch will be empty,
87                 // so there will be at least one delay to give things
88                 // a chance to settle.
89                 prevOCCSearch = occs;
90 
91                 log<level::INFO>(
92                     std::format(
93                         "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})",
94                         occs.size())
95                         .c_str());
96 
97                 discoverTimer->restartOnce(10s);
98             }
99             else
100             {
101                 // All OCCs appear to be available, create status objects
102 
103                 // createObjects requires OCC0 first.
104                 std::sort(occs.begin(), occs.end());
105 
106                 log<level::INFO>(
107                     std::format(
108                         "Manager::findAndCreateObjects(): Creating {} OCC Status Objects",
109                         occs.size())
110                         .c_str());
111                 for (auto id : occs)
112                 {
113                     createObjects(std::string(OCC_NAME) + std::to_string(id));
114                 }
115                 statusObjCreated = true;
116                 waitingForAllOccActiveSensors = true;
117 
118                 // Find/update the processor path associated with each OCC
119                 for (auto& obj : statusObjects)
120                 {
121                     obj->updateProcAssociation();
122                 }
123             }
124         }
125 
126         if (statusObjCreated && waitingForAllOccActiveSensors)
127         {
128             static bool tracedHostWait = false;
129             if (utils::isHostRunning())
130             {
131                 if (tracedHostWait)
132                 {
133                     log<level::INFO>(
134                         "Manager::findAndCreateObjects(): Host is running");
135                     tracedHostWait = false;
136                 }
137                 checkAllActiveSensors();
138             }
139             else
140             {
141                 if (!tracedHostWait)
142                 {
143                     log<level::INFO>(
144                         "Manager::findAndCreateObjects(): Waiting for host to start");
145                     tracedHostWait = true;
146                 }
147                 discoverTimer->restartOnce(30s);
148             }
149         }
150     }
151     else
152     {
153         log<level::INFO>(
154             std::format(
155                 "Manager::findAndCreateObjects(): Waiting for {} to complete...",
156                 HOST_ON_FILE)
157                 .c_str());
158         discoverTimer->restartOnce(10s);
159     }
160 #endif
161 }
162 
163 #ifdef POWER10
164 // Check if all occActive sensors are available
165 void Manager::checkAllActiveSensors()
166 {
167     static bool allActiveSensorAvailable = false;
168     static bool tracedSensorWait = false;
169     static bool waitingForHost = false;
170 
171     if (open_power::occ::utils::isHostRunning())
172     {
173         if (waitingForHost)
174         {
175             waitingForHost = false;
176             log<level::INFO>("checkAllActiveSensors(): Host is now running");
177         }
178 
179         // Start with the assumption that all are available
180         allActiveSensorAvailable = true;
181         for (auto& obj : statusObjects)
182         {
183             if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
184             {
185                 auto instance = obj->getOccInstanceID();
186                 // Check if sensor was queued while waiting for discovery
187                 auto match = queuedActiveState.find(instance);
188                 if (match != queuedActiveState.end())
189                 {
190                     queuedActiveState.erase(match);
191                     log<level::INFO>(
192                         std::format(
193                             "checkAllActiveSensors(): OCC{} is ACTIVE (queued)",
194                             instance)
195                             .c_str());
196                     obj->occActive(true);
197                 }
198                 else
199                 {
200                     allActiveSensorAvailable = false;
201                     if (!tracedSensorWait)
202                     {
203                         log<level::INFO>(
204                             std::format(
205                                 "checkAllActiveSensors(): Waiting on OCC{} Active sensor",
206                                 instance)
207                                 .c_str());
208                         tracedSensorWait = true;
209                     }
210                     pldmHandle->checkActiveSensor(obj->getOccInstanceID());
211                     break;
212                 }
213             }
214         }
215     }
216     else
217     {
218         if (!waitingForHost)
219         {
220             waitingForHost = true;
221             log<level::INFO>(
222                 "checkAllActiveSensors(): Waiting for host to start");
223         }
224     }
225 
226     if (allActiveSensorAvailable)
227     {
228         // All sensors were found, disable the discovery timer
229         if (discoverTimer->isEnabled())
230         {
231             discoverTimer->setEnabled(false);
232         }
233 
234         if (waitingForAllOccActiveSensors)
235         {
236             log<level::INFO>(
237                 "checkAllActiveSensors(): OCC Active sensors are available");
238             waitingForAllOccActiveSensors = false;
239         }
240         queuedActiveState.clear();
241         tracedSensorWait = false;
242     }
243     else
244     {
245         // Not all sensors were available, so keep waiting
246         if (!tracedSensorWait)
247         {
248             log<level::INFO>(
249                 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
250             tracedSensorWait = true;
251         }
252         discoverTimer->restartOnce(10s);
253     }
254 }
255 #endif
256 
257 std::vector<int> Manager::findOCCsInDev()
258 {
259     std::vector<int> occs;
260     std::regex expr{R"(occ(\d+)$)"};
261 
262     for (auto& file : fs::directory_iterator("/dev"))
263     {
264         std::smatch match;
265         std::string path{file.path().string()};
266         if (std::regex_search(path, match, expr))
267         {
268             auto num = std::stoi(match[1].str());
269 
270             // /dev numbering starts at 1, ours starts at 0.
271             occs.push_back(num - 1);
272         }
273     }
274 
275     return occs;
276 }
277 
278 int Manager::cpuCreated(sdbusplus::message_t& msg)
279 {
280     namespace fs = std::filesystem;
281 
282     sdbusplus::message::object_path o;
283     msg.read(o);
284     fs::path cpuPath(std::string(std::move(o)));
285 
286     auto name = cpuPath.filename().string();
287     auto index = name.find(CPU_NAME);
288     name.replace(index, std::strlen(CPU_NAME), OCC_NAME);
289 
290     createObjects(name);
291 
292     return 0;
293 }
294 
295 void Manager::createObjects(const std::string& occ)
296 {
297     auto path = fs::path(OCC_CONTROL_ROOT) / occ;
298 
299     statusObjects.emplace_back(std::make_unique<Status>(
300         event, path.c_str(), *this,
301 #ifdef POWER10
302         pmode,
303 #endif
304         std::bind(std::mem_fn(&Manager::statusCallBack), this,
305                   std::placeholders::_1, std::placeholders::_2)
306 #ifdef PLDM
307             ,
308         std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(),
309                   std::placeholders::_1)
310 #endif
311             ));
312 
313     // Create the power cap monitor object
314     if (!pcap)
315     {
316         pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
317             *statusObjects.back());
318     }
319 
320     if (statusObjects.back()->isMasterOcc())
321     {
322         log<level::INFO>(
323             std::format("Manager::createObjects(): OCC{} is the master",
324                         statusObjects.back()->getOccInstanceID())
325                 .c_str());
326         _pollTimer->setEnabled(false);
327 
328 #ifdef POWER10
329         // Set the master OCC on the PowerMode object
330         pmode->setMasterOcc(path);
331 #endif
332     }
333 
334     passThroughObjects.emplace_back(std::make_unique<PassThrough>(path.c_str()
335 #ifdef POWER10
336                                                                       ,
337                                                                   pmode
338 #endif
339                                                                   ));
340 }
341 
342 void Manager::statusCallBack(instanceID instance, bool status)
343 {
344     if (status == true)
345     {
346         // OCC went active
347         ++activeCount;
348 
349 #ifdef POWER10
350         if (activeCount == 1)
351         {
352             // First OCC went active (allow some time for all OCCs to go active)
353             waitForAllOccsTimer->restartOnce(60s);
354         }
355 #endif
356 
357         if (activeCount == statusObjects.size())
358         {
359 #ifdef POWER10
360             // All OCCs are now running
361             if (waitForAllOccsTimer->isEnabled())
362             {
363                 // stop occ wait timer
364                 waitForAllOccsTimer->setEnabled(false);
365             }
366 #endif
367 
368             // Verify master OCC and start presence monitor
369             validateOccMaster();
370         }
371 
372         // Start poll timer if not already started
373         if (!_pollTimer->isEnabled())
374         {
375             log<level::INFO>(
376                 std::format("Manager: OCCs will be polled every {} seconds",
377                             pollInterval)
378                     .c_str());
379 
380             // Send poll and start OCC poll timer
381             pollerTimerExpired();
382         }
383     }
384     else
385     {
386         // OCC went away
387         if (activeCount > 0)
388         {
389             --activeCount;
390         }
391         else
392         {
393             log<level::ERR>(
394                 std::format("OCC{} disabled, but currently no active OCCs",
395                             instance)
396                     .c_str());
397         }
398 
399         if (activeCount == 0)
400         {
401             // No OCCs are running
402 
403             // Stop OCC poll timer
404             if (_pollTimer->isEnabled())
405             {
406                 log<level::INFO>(
407                     "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
408                 _pollTimer->setEnabled(false);
409             }
410 
411 #ifdef POWER10
412             // stop wait timer
413             if (waitForAllOccsTimer->isEnabled())
414             {
415                 waitForAllOccsTimer->setEnabled(false);
416             }
417 #endif
418         }
419 #ifdef READ_OCC_SENSORS
420         // Clear OCC sensors
421         setSensorValueToNaN(instance);
422 #endif
423     }
424 
425 #ifdef POWER10
426     if (waitingForAllOccActiveSensors)
427     {
428         if (utils::isHostRunning())
429         {
430             checkAllActiveSensors();
431         }
432     }
433 #endif
434 }
435 
436 #ifdef I2C_OCC
437 void Manager::initStatusObjects()
438 {
439     // Make sure we have a valid path string
440     static_assert(sizeof(DEV_PATH) != 0);
441 
442     auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
443     for (auto& name : deviceNames)
444     {
445         i2c_occ::i2cToDbus(name);
446         name = std::string(OCC_NAME) + '_' + name;
447         auto path = fs::path(OCC_CONTROL_ROOT) / name;
448         statusObjects.emplace_back(
449             std::make_unique<Status>(event, path.c_str(), *this));
450     }
451     // The first device is master occ
452     pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
453         *statusObjects.front());
454 #ifdef POWER10
455     pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
456                                                    powermode::PIPS_PATH);
457     // Set the master OCC on the PowerMode object
458     pmode->setMasterOcc(path);
459 #endif
460 }
461 #endif
462 
463 #ifdef PLDM
464 void Manager::sbeTimeout(unsigned int instance)
465 {
466     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
467                             [instance](const auto& obj) {
468         return instance == obj->getOccInstanceID();
469     });
470 
471     if (obj != statusObjects.end() && (*obj)->occActive())
472     {
473         log<level::INFO>(
474             std::format("SBE timeout, requesting HRESET (OCC{})", instance)
475                 .c_str());
476 
477         setSBEState(instance, SBE_STATE_NOT_USABLE);
478 
479         pldmHandle->sendHRESET(instance);
480     }
481 }
482 
483 bool Manager::updateOCCActive(instanceID instance, bool status)
484 {
485     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
486                             [instance](const auto& obj) {
487         return instance == obj->getOccInstanceID();
488     });
489 
490     const bool hostRunning = open_power::occ::utils::isHostRunning();
491     if (obj != statusObjects.end())
492     {
493         if (!hostRunning && (status == true))
494         {
495             log<level::WARNING>(
496                 std::format(
497                     "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received",
498                     instance, status)
499                     .c_str());
500             (*obj)->setPldmSensorReceived(false);
501             if (!waitingForAllOccActiveSensors)
502             {
503                 log<level::INFO>(
504                     "updateOCCActive: Waiting for Host and all OCC Active Sensors");
505                 waitingForAllOccActiveSensors = true;
506             }
507             discoverTimer->restartOnce(30s);
508             return false;
509         }
510         else
511         {
512             log<level::INFO>(std::format("updateOCCActive: OCC{} active={}",
513                                          instance, status)
514                                  .c_str());
515             (*obj)->setPldmSensorReceived(true);
516             return (*obj)->occActive(status);
517         }
518     }
519     else
520     {
521         if (hostRunning)
522         {
523             log<level::WARNING>(
524                 std::format(
525                     "updateOCCActive: No status object to update for OCC{} (active={})",
526                     instance, status)
527                     .c_str());
528         }
529         else
530         {
531             if (status == true)
532             {
533                 log<level::WARNING>(
534                     std::format(
535                         "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})",
536                         instance, status)
537                         .c_str());
538             }
539         }
540         if (status == true)
541         {
542             // OCC went active
543             queuedActiveState.insert(instance);
544         }
545         else
546         {
547             auto match = queuedActiveState.find(instance);
548             if (match != queuedActiveState.end())
549             {
550                 // OCC was disabled
551                 queuedActiveState.erase(match);
552             }
553         }
554         return false;
555     }
556 }
557 
558 // Called upon pldm event To set powermode Safe Mode State for system.
559 void Manager::updateOccSafeMode(bool safeMode)
560 {
561 #ifdef POWER10
562     pmode->updateDbusSafeMode(safeMode);
563 #endif
564     // Update the processor throttle status on dbus
565     for (auto& obj : statusObjects)
566     {
567         obj->updateThrottle(safeMode, THROTTLED_SAFE);
568     }
569 }
570 
571 void Manager::sbeHRESETResult(instanceID instance, bool success)
572 {
573     if (success)
574     {
575         log<level::INFO>(
576             std::format("HRESET succeeded (OCC{})", instance).c_str());
577 
578         setSBEState(instance, SBE_STATE_BOOTED);
579 
580         return;
581     }
582 
583     setSBEState(instance, SBE_STATE_FAILED);
584 
585     if (sbeCanDump(instance))
586     {
587         log<level::INFO>(
588             std::format("HRESET failed (OCC{}), triggering SBE dump", instance)
589                 .c_str());
590 
591         auto& bus = utils::getBus();
592         uint32_t src6 = instance << 16;
593         uint32_t logId =
594             FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
595                             src6, "SBE command timeout");
596 
597         try
598         {
599             constexpr auto path = "/org/openpower/dump";
600             constexpr auto interface = "xyz.openbmc_project.Dump.Create";
601             constexpr auto function = "CreateDump";
602 
603             std::string service = utils::getService(path, interface);
604             auto method = bus.new_method_call(service.c_str(), path, interface,
605                                               function);
606 
607             std::map<std::string, std::variant<std::string, uint64_t>>
608                 createParams{
609                     {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
610                      uint64_t(logId)},
611                     {"com.ibm.Dump.Create.CreateParameters.DumpType",
612                      "com.ibm.Dump.Create.DumpType.SBE"},
613                     {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
614                      uint64_t(instance)},
615                 };
616 
617             method.append(createParams);
618 
619             auto response = bus.call(method);
620         }
621         catch (const sdbusplus::exception_t& e)
622         {
623             constexpr auto ERROR_DUMP_DISABLED =
624                 "xyz.openbmc_project.Dump.Create.Error.Disabled";
625             if (e.name() == ERROR_DUMP_DISABLED)
626             {
627                 log<level::INFO>("Dump is disabled, skipping");
628             }
629             else
630             {
631                 log<level::ERR>("Dump failed");
632             }
633         }
634     }
635 }
636 
637 bool Manager::sbeCanDump(unsigned int instance)
638 {
639     struct pdbg_target* proc = getPdbgTarget(instance);
640 
641     if (!proc)
642     {
643         // allow the dump in the error case
644         return true;
645     }
646 
647     try
648     {
649         if (!openpower::phal::sbe::isDumpAllowed(proc))
650         {
651             return false;
652         }
653 
654         if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
655         {
656             return false;
657         }
658     }
659     catch (openpower::phal::exception::SbeError& e)
660     {
661         log<level::INFO>("Failed to query SBE state");
662     }
663 
664     // allow the dump in the error case
665     return true;
666 }
667 
668 void Manager::setSBEState(unsigned int instance, enum sbe_state state)
669 {
670     struct pdbg_target* proc = getPdbgTarget(instance);
671 
672     if (!proc)
673     {
674         return;
675     }
676 
677     try
678     {
679         openpower::phal::sbe::setState(proc, state);
680     }
681     catch (const openpower::phal::exception::SbeError& e)
682     {
683         log<level::ERR>("Failed to set SBE state");
684     }
685 }
686 
687 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
688 {
689     if (!pdbgInitialized)
690     {
691         try
692         {
693             openpower::phal::pdbg::init();
694             pdbgInitialized = true;
695         }
696         catch (const openpower::phal::exception::PdbgError& e)
697         {
698             log<level::ERR>("pdbg initialization failed");
699             return nullptr;
700         }
701     }
702 
703     struct pdbg_target* proc = nullptr;
704     pdbg_for_each_class_target("proc", proc)
705     {
706         if (pdbg_target_index(proc) == instance)
707         {
708             return proc;
709         }
710     }
711 
712     log<level::ERR>("Failed to get pdbg target");
713     return nullptr;
714 }
715 #endif
716 
717 void Manager::pollerTimerExpired()
718 {
719     if (!_pollTimer)
720     {
721         log<level::ERR>(
722             "Manager::pollerTimerExpired() ERROR: Timer not defined");
723         return;
724     }
725 
726     for (auto& obj : statusObjects)
727     {
728         if (!obj->occActive())
729         {
730             // OCC is not running yet
731 #ifdef READ_OCC_SENSORS
732             auto id = obj->getOccInstanceID();
733             setSensorValueToNaN(id);
734 #endif
735             continue;
736         }
737 
738         // Read sysfs to force kernel to poll OCC
739         obj->readOccState();
740 
741 #ifdef READ_OCC_SENSORS
742         // Read occ sensor values
743         getSensorValues(obj);
744 #endif
745     }
746 
747     if (activeCount > 0)
748     {
749         // Restart OCC poll timer
750         _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
751     }
752     else
753     {
754         // No OCCs running, so poll timer will not be restarted
755         log<level::INFO>(
756             std::format(
757                 "Manager::pollerTimerExpired: poll timer will not be restarted")
758                 .c_str());
759     }
760 }
761 
762 #ifdef READ_OCC_SENSORS
763 void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
764 {
765     // There may be more than one sensor with the same FRU type
766     // and label so make two passes: the first to read the temps
767     // from sysfs, and the second to put them on D-Bus after
768     // resolving any conflicts.
769     std::map<std::string, double> sensorData;
770 
771     std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
772     for (auto& file : fs::directory_iterator(path))
773     {
774         if (!std::regex_search(file.path().string(), expr))
775         {
776             continue;
777         }
778 
779         uint32_t labelValue{0};
780 
781         try
782         {
783             labelValue = readFile<uint32_t>(file.path());
784         }
785         catch (const std::system_error& e)
786         {
787             log<level::DEBUG>(
788                 std::format("readTempSensors: Failed reading {}, errno = {}",
789                             file.path().string(), e.code().value())
790                     .c_str());
791             continue;
792         }
793 
794         const std::string& tempLabel = "label";
795         const std::string filePathString = file.path().string().substr(
796             0, file.path().string().length() - tempLabel.length());
797 
798         uint32_t fruTypeValue{0};
799         try
800         {
801             fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
802         }
803         catch (const std::system_error& e)
804         {
805             log<level::DEBUG>(
806                 std::format("readTempSensors: Failed reading {}, errno = {}",
807                             filePathString + fruTypeSuffix, e.code().value())
808                     .c_str());
809             continue;
810         }
811 
812         std::string sensorPath = OCC_SENSORS_ROOT +
813                                  std::string("/temperature/");
814 
815         std::string dvfsTempPath;
816 
817         if (fruTypeValue == VRMVdd)
818         {
819             sensorPath.append("vrm_vdd" + std::to_string(occInstance) +
820                               "_temp");
821         }
822         else if (fruTypeValue == processorIoRing)
823         {
824             sensorPath.append("proc" + std::to_string(occInstance) +
825                               "_ioring_temp");
826             dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
827                            std::to_string(occInstance) + "_ioring_dvfs_temp";
828         }
829         else
830         {
831             uint16_t type = (labelValue & 0xFF000000) >> 24;
832             uint16_t instanceID = labelValue & 0x0000FFFF;
833 
834             if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
835             {
836                 if (fruTypeValue == fruTypeNotAvailable)
837                 {
838                     // Not all DIMM related temps are available to read
839                     // (no _input file in this case)
840                     continue;
841                 }
842                 auto iter = dimmTempSensorName.find(fruTypeValue);
843                 if (iter == dimmTempSensorName.end())
844                 {
845                     log<level::ERR>(
846                         std::format(
847                             "readTempSensors: Fru type error! fruTypeValue = {}) ",
848                             fruTypeValue)
849                             .c_str());
850                     continue;
851                 }
852 
853                 sensorPath.append("dimm" + std::to_string(instanceID) +
854                                   iter->second);
855 
856                 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
857                                dimmDVFSSensorName.at(fruTypeValue);
858             }
859             else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
860             {
861                 if (fruTypeValue == processorCore)
862                 {
863                     // The OCC reports small core temps, of which there are
864                     // two per big core.  All current P10 systems are in big
865                     // core mode, so use a big core name.
866                     uint16_t coreNum = instanceID / 2;
867                     uint16_t tempNum = instanceID % 2;
868                     sensorPath.append("proc" + std::to_string(occInstance) +
869                                       "_core" + std::to_string(coreNum) + "_" +
870                                       std::to_string(tempNum) + "_temp");
871 
872                     dvfsTempPath =
873                         std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
874                         std::to_string(occInstance) + "_core_dvfs_temp";
875                 }
876                 else
877                 {
878                     continue;
879                 }
880             }
881             else
882             {
883                 continue;
884             }
885         }
886 
887         // The dvfs temp file only needs to be read once per chip per type.
888         if (!dvfsTempPath.empty() &&
889             !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
890         {
891             try
892             {
893                 auto dvfsValue = readFile<double>(filePathString + maxSuffix);
894 
895                 dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
896                     dvfsTempPath, dvfsValue * std::pow(10, -3));
897             }
898             catch (const std::system_error& e)
899             {
900                 log<level::DEBUG>(
901                     std::format(
902                         "readTempSensors: Failed reading {}, errno = {}",
903                         filePathString + maxSuffix, e.code().value())
904                         .c_str());
905             }
906         }
907 
908         uint32_t faultValue{0};
909         try
910         {
911             faultValue = readFile<uint32_t>(filePathString + faultSuffix);
912         }
913         catch (const std::system_error& e)
914         {
915             log<level::DEBUG>(
916                 std::format("readTempSensors: Failed reading {}, errno = {}",
917                             filePathString + faultSuffix, e.code().value())
918                     .c_str());
919             continue;
920         }
921 
922         double tempValue{0};
923         // NOTE: if OCC sends back 0xFF, kernal sets this fault value to 1.
924         if (faultValue != 0)
925         {
926             tempValue = std::numeric_limits<double>::quiet_NaN();
927         }
928         else
929         {
930             // Read the temperature
931             try
932             {
933                 tempValue = readFile<double>(filePathString + inputSuffix);
934             }
935             catch (const std::system_error& e)
936             {
937                 log<level::DEBUG>(
938                     std::format(
939                         "readTempSensors: Failed reading {}, errno = {}",
940                         filePathString + inputSuffix, e.code().value())
941                         .c_str());
942 
943                 // if errno == EAGAIN(Resource temporarily unavailable) then set
944                 // temp to 0, to avoid using old temp, and affecting FAN
945                 // Control.
946                 if (e.code().value() == EAGAIN)
947                 {
948                     tempValue = 0;
949                 }
950                 // else the errno would be something like
951                 //     EBADF(Bad file descriptor)
952                 // or ENOENT(No such file or directory)
953                 else
954                 {
955                     continue;
956                 }
957             }
958         }
959 
960         // If this object path already has a value, only overwite
961         // it if the previous one was an NaN or a smaller value.
962         auto existing = sensorData.find(sensorPath);
963         if (existing != sensorData.end())
964         {
965             // Multiple sensors found for this FRU type
966             if ((std::isnan(existing->second) && (tempValue == 0)) ||
967                 ((existing->second == 0) && std::isnan(tempValue)))
968             {
969                 // One of the redundant sensors has failed (0xFF/nan), and the
970                 // other sensor has no reading (0), so set the FRU to NaN to
971                 // force fan increase
972                 tempValue = std::numeric_limits<double>::quiet_NaN();
973                 existing->second = tempValue;
974             }
975             if (std::isnan(existing->second) || (tempValue > existing->second))
976             {
977                 existing->second = tempValue;
978             }
979         }
980         else
981         {
982             // First sensor for this FRU type
983             sensorData[sensorPath] = tempValue;
984         }
985     }
986 
987     // Now publish the values on D-Bus.
988     for (const auto& [objectPath, value] : sensorData)
989     {
990         dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
991                                                     value * std::pow(10, -3));
992 
993         dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
994             objectPath, !std::isnan(value));
995 
996         if (existingSensors.find(objectPath) == existingSensors.end())
997         {
998             dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
999                 objectPath);
1000         }
1001 
1002         existingSensors[objectPath] = occInstance;
1003     }
1004 }
1005 
1006 std::optional<std::string>
1007     Manager::getPowerLabelFunctionID(const std::string& value)
1008 {
1009     // If the value is "system", then the FunctionID is "system".
1010     if (value == "system")
1011     {
1012         return value;
1013     }
1014 
1015     // If the value is not "system", then the label value have 3 numbers, of
1016     // which we only care about the middle one:
1017     // <sensor id>_<function id>_<apss channel>
1018     // eg: The value is "0_10_5" , then the FunctionID is "10".
1019     if (value.find("_") == std::string::npos)
1020     {
1021         return std::nullopt;
1022     }
1023 
1024     auto powerLabelValue = value.substr((value.find("_") + 1));
1025 
1026     if (powerLabelValue.find("_") == std::string::npos)
1027     {
1028         return std::nullopt;
1029     }
1030 
1031     return powerLabelValue.substr(0, powerLabelValue.find("_"));
1032 }
1033 
/** Publish the power sensors found in an OCC hwmon directory on D-Bus.
 *
 *  Every "power<N>_label" file in the directory is read; labels whose
 *  FunctionID is present in powerSensorName have the matching
 *  "power<N>_input" value published under OCC_SENSORS_ROOT/power/.
 *  Files that cannot be read or mapped are skipped.
 *
 *  @param[in] path - directory to scan for power label files
 *  @param[in] id   - OCC instance recorded as the owner of each sensor
 */
void Manager::readPowerSensors(const fs::path& path, uint32_t id)
{
    const std::string labelSuffix{"label"};
    std::regex labelFilePattern{"power\\d+_label$"}; // Example: power5_label

    for (const auto& entry : fs::directory_iterator(path))
    {
        const std::string labelFile = entry.path().string();
        if (!std::regex_search(labelFile, labelFilePattern))
        {
            continue;
        }

        std::string labelText;
        try
        {
            labelText = readFile<std::string>(labelFile);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            labelFile, e.code().value())
                    .c_str());
            continue;
        }

        const auto functionID = getPowerLabelFunctionID(labelText);
        if (!functionID)
        {
            continue;
        }

        // Only FunctionIDs with a known sensor name are published
        const auto nameEntry = powerSensorName.find(*functionID);
        if (nameEntry == powerSensorName.end())
        {
            continue;
        }

        std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
        sensorPath.append(nameEntry->second);

        // Strip the trailing "label" so the other suffixes can be appended,
        // e.g. ".../power5_label" -> ".../power5_"
        const std::string filePrefix =
            labelFile.substr(0, labelFile.length() - labelSuffix.length());
        const std::string inputFile = filePrefix + inputSuffix;

        double rawPower{0};
        try
        {
            rawPower = readFile<double>(inputFile);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            inputFile, e.code().value())
                    .c_str());
            continue;
        }

        dbus::OccDBusSensors::getOccDBus().setUnit(
            sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");

        // The raw reading is scaled by 10^-3 twice before being published
        dbus::OccDBusSensors::getOccDBus().setValue(
            sensorPath, rawPower * std::pow(10, -3) * std::pow(10, -3));

        dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
                                                                true);

        // Associate with the chassis only the first time a sensor is seen
        if (existingSensors.find(sensorPath) == existingSensors.end())
        {
            dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
                sensorPath);
        }

        existingSensors[sensorPath] = id;
    }
}
1111 
1112 void Manager::setSensorValueToNaN(uint32_t id) const
1113 {
1114     for (const auto& [sensorPath, occId] : existingSensors)
1115     {
1116         if (occId == id)
1117         {
1118             dbus::OccDBusSensors::getOccDBus().setValue(
1119                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1120 
1121             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
1122                                                                     true);
1123         }
1124     }
1125     return;
1126 }
1127 
1128 void Manager::setSensorValueToNonFunctional(uint32_t id) const
1129 {
1130     for (const auto& [sensorPath, occId] : existingSensors)
1131     {
1132         if (occId == id)
1133         {
1134             dbus::OccDBusSensors::getOccDBus().setValue(
1135                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1136 
1137             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
1138                                                                     false);
1139         }
1140     }
1141     return;
1142 }
1143 
/** Read the sensors of one OCC from its hwmon directory.
 *
 *  Temperature sensors are read for every OCC; power sensors are read only
 *  when the OCC reports itself as the master. If the hwmon path does not
 *  exist, an error is traced once and not again until the path has been
 *  seen.
 *
 *  @param[in] occ - status object of the OCC to read
 */
void Manager::getSensorValues(std::unique_ptr<Status>& occ)
{
    // "Already traced" flags, one per OCC instance. Instance ids that do not
    // fit in the table share a single fallback flag so the table is never
    // indexed out of bounds.
    constexpr uint32_t maxTracedOccs = 8;
    static bool tracedError[maxTracedOccs] = {false};
    static bool tracedErrorOutOfRange = false;

    const fs::path sensorPath = occ->getHwmonPath();
    const uint32_t id = occ->getOccInstanceID();

    bool& alreadyTraced = (id < maxTracedOccs) ? tracedError[id]
                                               : tracedErrorOutOfRange;

    if (fs::exists(sensorPath))
    {
        // Read temperature sensors
        readTempSensors(sensorPath, id);

        if (occ->isMasterOcc())
        {
            // Read power sensors
            readPowerSensors(sensorPath, id);
        }

        // Path is present: re-arm the trace for the next time it goes missing
        alreadyTraced = false;
    }
    else
    {
        // Trace only the first miss to avoid repeating the same error
        if (!alreadyTraced)
        {
            log<level::ERR>(
                std::format(
                    "Manager::getSensorValues: OCC{} sensor path missing: {}",
                    id, sensorPath.c_str())
                    .c_str());
            alreadyTraced = true;
        }
    }

    return;
}
1177 #endif
1178 
// Read the altitude from DBus and cache it, rounded to the nearest meter.
// Negative readings are stored as 0, readings that are not below 0xFFFF are
// ignored, and a D-Bus failure stores 0xFFFF (not available).
void Manager::readAltitude()
{
    // Cleared once a failure has been traced, set again by a good reading
    static bool traceAltitudeErr = true;

    utils::PropertyValue altitudeProperty{};
    try
    {
        altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE,
                                              ALTITUDE_PROP);
        const auto sensorVal = std::get<double>(altitudeProperty);

        // Reject anything that is not strictly below 0xFFFF (NaN included)
        if (!(sensorVal < 0xFFFF))
        {
            if (traceAltitudeErr)
            {
                traceAltitudeErr = false;
                log<level::DEBUG>(
                    std::format("Invalid altitude value: {}", sensorVal)
                        .c_str());
            }
            return;
        }

        // Negative readings become 0, everything else is rounded to the
        // nearest meter
        uint16_t roundedMeters = 0;
        if (sensorVal >= 0)
        {
            roundedMeters = uint16_t(sensorVal + 0.5);
        }
        altitude = roundedMeters;

        log<level::DEBUG>(
            std::format("readAltitude: sensor={} ({}m)", sensorVal, altitude)
                .c_str());
        traceAltitudeErr = true;
    }
    catch (const sdbusplus::exception_t& e)
    {
        if (traceAltitudeErr)
        {
            traceAltitudeErr = false;
            log<level::INFO>(
                std::format("Unable to read Altitude: {}", e.what()).c_str());
        }
        altitude = 0xFFFF; // not available
    }
}
1228 
// Callback function when ambient temperature changes.
// The new reading is rounded to a whole degree C (NaN becomes 0xFF, negative
// values become 0) and, if it differs from the cached value, is cached and
// sent to the active OCCs together with the altitude.
void Manager::ambientCallback(sdbusplus::message_t& msg)
{
    std::string msgSensor;
    std::map<std::string, std::variant<double>> msgData;
    msg.read(msgSensor, msgData);

    const auto ambientEntry = msgData.find(AMBIENT_PROP);
    if (ambientEntry == msgData.end())
    {
        log<level::DEBUG>("ambientCallback: Unknown ambient property changed");
        return;
    }

    const double currentTemp = std::get<double>(ambientEntry->second);

    // 0xFF is kept when the reading is NaN
    uint8_t truncatedTemp = 0xFF;
    if (!std::isnan(currentTemp))
    {
        // Round to nearest degree C, flooring negative readings at 0
        truncatedTemp = (currentTemp < 0) ? uint8_t(0)
                                          : uint8_t(currentTemp + 0.5);
    }

    // Nothing to do unless the rounded ambient actually changed
    if (truncatedTemp == ambient)
    {
        return;
    }

    log<level::DEBUG>(
        std::format("ambientCallback: Ambient change from {} to {}C", ambient,
                    currentTemp)
            .c_str());

    ambient = truncatedTemp;
    if (altitude == 0xFFFF)
    {
        // No altitude yet, try reading again
        readAltitude();
    }

    log<level::DEBUG>(
        std::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient,
                    altitude)
            .c_str());
#ifdef POWER10
    // Send ambient and altitude to all OCCs
    for (auto& occStatus : statusObjects)
    {
        if (occStatus->occActive())
        {
            occStatus->sendAmbient(ambient, altitude);
        }
    }
#endif // POWER10
}
1293 
1294 // return the current ambient and altitude readings
1295 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1296                              uint16_t& altitudeValue) const
1297 {
1298     ambientValid = true;
1299     ambientTemp = ambient;
1300     altitudeValue = altitude;
1301 
1302     if (ambient == 0xFF)
1303     {
1304         ambientValid = false;
1305     }
1306 }
1307 
1308 #ifdef POWER10
// Called when waitForAllOccsTimer expires
// After the first OCC goes active, this timer will be started (60 seconds)
// Traces a warning if the active count differs from the number of status
// objects, then validates the master OCC.
void Manager::occsNotAllRunning()
{
    const auto expectedCount = statusObjects.size();
    if (activeCount != expectedCount)
    {
        // Not all OCCs went active.
        // Procs may be garded, so may be expected
        log<level::WARNING>(
            std::format(
                "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})",
                activeCount, expectedCount)
                .c_str());
    }

    validateOccMaster();
}
1326 #endif // POWER10
1327 
// Verify single master OCC and start presence monitor
//
// Walks every status object, adds the presence watch on each OCC that
// reports itself as master, and requests a reset (via deviceError) when
// more than one master or no master at all is found.
//
// On POWER10, OCCs that are not yet active are first given a chance to
// become active: either from a queued active state, or by asking PLDM to
// check the active sensor. If the host is not running, validation is
// abandoned.
void Manager::validateOccMaster()
{
    int masterInstance = -1;
    for (auto& obj : statusObjects)
    {
        auto instance = obj->getOccInstanceID();
#ifdef POWER10
        if (!obj->occActive())
        {
            if (utils::isHostRunning())
            {
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    log<level::INFO>(
                        std::format(
                            "validateOccMaster: OCC{} is ACTIVE (queued)",
                            instance)
                            .c_str());
                    obj->occActive(true);
                }
                else
                {
                    // OCC does not appear to be active yet, check active sensor
                    pldmHandle->checkActiveSensor(instance);
                    if (obj->occActive())
                    {
                        log<level::INFO>(
                            std::format(
                                "validateOccMaster: OCC{} is ACTIVE after reading sensor",
                                instance)
                                .c_str());
                    }
                }
            }
            else
            {
                log<level::WARNING>(
                    std::format(
                        "validateOccMaster: HOST is not running (OCC{})",
                        instance)
                        .c_str());
                // Nothing can be validated without a running host
                return;
            }
        }
#endif // POWER10

        if (obj->isMasterOcc())
        {
            obj->addPresenceWatchMaster();

            if (masterInstance == -1)
            {
                // First master seen
                masterInstance = instance;
            }
            else
            {
                log<level::ERR>(
                    std::format(
                        "validateOccMaster: Multiple OCC masters! ({} and {})",
                        masterInstance, instance)
                        .c_str());
                // request reset
                obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
            }
        }
    }

    if (masterInstance < 0)
    {
        log<level::ERR>(
            std::format("validateOccMaster: Master OCC not found! (of {} OCCs)",
                        statusObjects.size())
                .c_str());
        // request reset; with no status objects there is nothing to report
        // the error against, and front() must not be called on an empty
        // container
        if (!statusObjects.empty())
        {
            statusObjects.front()->deviceError(
                Error::Descriptor(PRESENCE_ERROR_PATH));
        }
    }
    else
    {
        log<level::INFO>(
            std::format("validateOccMaster: OCC{} is master of {} OCCs",
                        masterInstance, activeCount)
                .c_str());
#ifdef POWER10
        pmode->updateDbusSafeMode(false);
#endif
    }
}
1420 
1421 void Manager::updatePcapBounds() const
1422 {
1423     if (pcap)
1424     {
1425         pcap->updatePcapBounds();
1426     }
1427 }
1428 
1429 } // namespace occ
1430 } // namespace open_power
1431