1 #include "config.h"
2 
3 #include "occ_manager.hpp"
4 
5 #include "i2c_occ.hpp"
6 #include "occ_dbus.hpp"
7 #include "occ_errors.hpp"
8 #include "utils.hpp"
9 
10 #include <phosphor-logging/elog-errors.hpp>
11 #include <phosphor-logging/log.hpp>
12 #include <xyz/openbmc_project/Common/error.hpp>
13 
14 #include <chrono>
15 #include <cmath>
16 #include <filesystem>
17 #include <fstream>
18 #include <regex>
19 
20 namespace open_power
21 {
22 namespace occ
23 {
24 
// fru_type value for which readTempSensors() skips a DIMM temperature
// (not all DIMM related temps are available to read).
constexpr uint32_t fruTypeNotAvailable = 0xFF;
// Suffixes appended to a temp sensor's file prefix (the path of its
// "..._label" file with the trailing "label" removed) to select which
// attribute file of that sensor is read.
constexpr auto fruTypeSuffix = "fru_type";
constexpr auto faultSuffix = "fault";
constexpr auto inputSuffix = "input";
constexpr auto maxSuffix = "max";

// In POWER10 builds, findAndCreateObjects() postpones OCC discovery (and
// retries later) for as long as this file exists.
const auto HOST_ON_FILE = "/run/openbmc/host@0-on";

using namespace phosphor::logging;
using namespace std::literals::chrono_literals;
35 
// Read a single value of type T from the file at the given path using
// stream extraction (operator>>).
//
// The stream is configured to throw on failbit, badbit and eofbit, so a
// missing file, an unparsable value or hitting end-of-file while extracting
// are all reported as failures.
//
// @param path  file to read
// @return the extracted value
// @throws std::system_error carrying the current errno (generic category)
//         when opening, extracting or closing fails
template <typename T>
T readFile(const std::string& path)
{
    T value;

    std::ifstream input;
    input.exceptions(std::ifstream::eofbit | std::ifstream::badbit |
                     std::ifstream::failbit);

    try
    {
        input.open(path);
        input >> value;
        input.close();
    }
    catch (const std::exception&)
    {
        // Translate any stream failure into an errno based error
        const auto savedErrno = errno;
        throw std::system_error(savedErrno, std::generic_category());
    }

    return value;
}
58 
// Discover the OCCs and create the objects that represent them.
//
// Without POWER10: createObjects() is called once per CPU index
// (0 .. MAX_CPUS-1).
//
// With POWER10: the power mode object is created on first use, then the
// function acts as a re-armable discovery step driven by discoverTimer:
//   - while HOST_ON_FILE exists, nothing is created and the timer is
//     restarted (10s);
//   - otherwise /dev is scanned for OCC devices until two consecutive scans
//     report the same non-zero number of OCCs, at which point the status
//     objects are created (only once, tracked by a function-static flag);
//   - once created, and while still waiting for the OCC active sensors,
//     either the sensors are checked (host running) or the timer is
//     restarted (30s) to wait for the host.
void Manager::findAndCreateObjects()
{
#ifndef POWER10
    for (auto id = 0; id < MAX_CPUS; ++id)
    {
        // Create one occ per cpu
        auto occ = std::string(OCC_NAME) + std::to_string(id);
        createObjects(occ);
    }
#else
    if (!pmode)
    {
        // Create the power mode object
        pmode = std::make_unique<powermode::PowerMode>(
            *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
    }

    if (!fs::exists(HOST_ON_FILE))
    {
        // Function-static: stays true for the lifetime of the process once
        // the status objects have been created.
        static bool statusObjCreated = false;
        if (!statusObjCreated)
        {
            // Create the OCCs based on the /dev/occX devices
            auto occs = findOCCsInDev();

            if (occs.empty() || (prevOCCSearch.size() != occs.size()))
            {
                // Something changed or no OCCs yet, try again in 10s.
                // Note on the first pass prevOCCSearch will be empty,
                // so there will be at least one delay to give things
                // a chance to settle.
                prevOCCSearch = occs;

                log<level::INFO>(
                    std::format(
                        "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})",
                        occs.size())
                        .c_str());

                discoverTimer->restartOnce(10s);
            }
            else
            {
                // All OCCs appear to be available, create status objects

                // createObjects requires OCC0 first.
                std::sort(occs.begin(), occs.end());

                log<level::INFO>(
                    std::format(
                        "Manager::findAndCreateObjects(): Creating {} OCC Status Objects",
                        occs.size())
                        .c_str());
                for (auto id : occs)
                {
                    createObjects(std::string(OCC_NAME) + std::to_string(id));
                }
                statusObjCreated = true;
                waitingForAllOccActiveSensors = true;

                // Find/update the processor path associated with each OCC
                for (auto& obj : statusObjects)
                {
                    obj->updateProcAssociation();
                }
            }
        }

        if (statusObjCreated && waitingForAllOccActiveSensors)
        {
            // Function-static: makes sure each "waiting"/"running" state
            // change is traced only once.
            static bool tracedHostWait = false;
            if (utils::isHostRunning())
            {
                if (tracedHostWait)
                {
                    log<level::INFO>(
                        "Manager::findAndCreateObjects(): Host is running");
                    tracedHostWait = false;
                }
                checkAllActiveSensors();
            }
            else
            {
                if (!tracedHostWait)
                {
                    log<level::INFO>(
                        "Manager::findAndCreateObjects(): Waiting for host to start");
                    tracedHostWait = true;
                }
                // Host not running yet, look again in 30s
                discoverTimer->restartOnce(30s);
            }
        }
    }
    else
    {
        // HOST_ON_FILE still exists: do not scan for OCCs yet, retry in 10s
        log<level::INFO>(
            std::format(
                "Manager::findAndCreateObjects(): Waiting for {} to complete...",
                HOST_ON_FILE)
                .c_str());
        discoverTimer->restartOnce(10s);
    }
#endif
}
163 
164 #ifdef POWER10
// Check if all occActive sensors are available
//
// When the host is running, every status object that is neither active nor
// has received its PLDM sensor is examined:
//   - if an active state was queued for that instance (queuedActiveState),
//     the queue entry is consumed and the OCC is marked active;
//   - otherwise the scan stops at that OCC (with PLDM its active sensor is
//     requested) and the function keeps waiting.
// If all sensors are available the discovery timer (and, with PLDM, the
// trace throttle timer) is disabled; otherwise the discovery timer is
// restarted (10s) so this check runs again.
//
// The three flags below are function-static, so their values persist
// across calls.
void Manager::checkAllActiveSensors()
{
    static bool allActiveSensorAvailable = false;
    static bool tracedSensorWait = false;
    static bool waitingForHost = false;

    if (open_power::occ::utils::isHostRunning())
    {
        if (waitingForHost)
        {
            waitingForHost = false;
            log<level::INFO>("checkAllActiveSensors(): Host is now running");
        }

        // Start with the assumption that all are available
        allActiveSensorAvailable = true;
        for (auto& obj : statusObjects)
        {
            if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
            {
                auto instance = obj->getOccInstanceID();
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    log<level::INFO>(
                        std::format(
                            "checkAllActiveSensors(): OCC{} is ACTIVE (queued)",
                            instance)
                            .c_str());
                    obj->occActive(true);
                }
                else
                {
                    allActiveSensorAvailable = false;
                    if (!tracedSensorWait)
                    {
                        log<level::INFO>(
                            std::format(
                                "checkAllActiveSensors(): Waiting on OCC{} Active sensor",
                                instance)
                                .c_str());
                        tracedSensorWait = true;
                        // Make sure traces are not throttled
#ifdef PLDM
                        pldmHandle->setTraceThrottle(false);
                        // Start timer to throttle pldm traces when timer
                        // expires
                        throttleTraceTimer->restartOnce(5min);
#endif
                    }
#ifdef PLDM
                    pldmHandle->checkActiveSensor(obj->getOccInstanceID());
#endif
                    // Stop at the first OCC whose sensor is missing; any
                    // remaining OCCs are examined on a later call.
                    break;
                }
            }
        }
    }
    else
    {
        // NOTE(review): allActiveSensorAvailable is not updated on this path,
        // so the value from the previous call is used below.
        if (!waitingForHost)
        {
            waitingForHost = true;
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for host to start");
        }
    }

    if (allActiveSensorAvailable)
    {
        // All sensors were found, disable the discovery timer
        if (discoverTimer->isEnabled())
        {
            discoverTimer->setEnabled(false);
        }
#ifdef PLDM
        if (throttleTraceTimer->isEnabled())
        {
            // Disable throttle timer and make sure traces are not throttled
            throttleTraceTimer->setEnabled(false);
            pldmHandle->setTraceThrottle(false);
        }
#endif

        if (waitingForAllOccActiveSensors)
        {
            log<level::INFO>(
                "checkAllActiveSensors(): OCC Active sensors are available");
            waitingForAllOccActiveSensors = false;
        }
        // Nothing left to wait for: drop any queued states and re-arm the
        // one-shot "waiting" trace for a future wait.
        queuedActiveState.clear();
        tracedSensorWait = false;
    }
    else
    {
        // Not all sensors were available, so keep waiting
        if (!tracedSensorWait)
        {
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
            tracedSensorWait = true;
        }
        discoverTimer->restartOnce(10s);
    }
}
273 #endif
274 
275 std::vector<int> Manager::findOCCsInDev()
276 {
277     std::vector<int> occs;
278     std::regex expr{R"(occ(\d+)$)"};
279 
280     for (auto& file : fs::directory_iterator("/dev"))
281     {
282         std::smatch match;
283         std::string path{file.path().string()};
284         if (std::regex_search(path, match, expr))
285         {
286             auto num = std::stoi(match[1].str());
287 
288             // /dev numbering starts at 1, ours starts at 0.
289             occs.push_back(num - 1);
290         }
291     }
292 
293     return occs;
294 }
295 
// Handle a D-Bus message carrying the object path of a CPU.
//
// The last component of the path is turned into an OCC name by replacing
// the CPU_NAME part with OCC_NAME, and the OCC objects for that name are
// created.
//
// @param msg  message whose first argument is the CPU object path
// @return always 0
int Manager::cpuCreated(sdbusplus::message_t& msg)
{
    namespace fs = std::filesystem;

    sdbusplus::message::object_path o;
    msg.read(o);
    const fs::path cpuPath{std::string(std::move(o))};

    auto name = cpuPath.filename().string();
    const auto index = name.find(CPU_NAME);
    if (index == std::string::npos)
    {
        // replace() would throw std::out_of_range for npos, and that
        // exception would escape this message handler. There is no OCC to
        // derive from this path, so report it and ignore the message.
        log<level::ERR>(
            std::format(
                "Manager::cpuCreated(): {} not found in {}, no OCC objects created",
                CPU_NAME, name)
                .c_str());
        return 0;
    }
    name.replace(index, std::strlen(CPU_NAME), OCC_NAME);

    createObjects(name);

    return 0;
}
312 
// Create the objects for one OCC.
//
// A Status object and a PassThrough object are created at
// OCC_CONTROL_ROOT/<occ> and appended to statusObjects and
// passThroughObjects. The power cap monitor is created from the first
// status object if it does not exist yet. If the new status object reports
// being the master OCC, the poll timer is disabled and (POWER10) the master
// OCC path is handed to the power mode object.
//
// @param occ  name of the OCC, used as the last path component
void Manager::createObjects(const std::string& occ)
{
    auto path = fs::path(OCC_CONTROL_ROOT) / occ;

    // The Status object calls back into statusCallBack() and, with PLDM,
    // into pldm::Interface::resetOCC() through the bound callbacks.
    statusObjects.emplace_back(std::make_unique<Status>(
        event, path.c_str(), *this,
#ifdef POWER10
        pmode,
#endif
        std::bind(std::mem_fn(&Manager::statusCallBack), this,
                  std::placeholders::_1, std::placeholders::_2)
#ifdef PLDM
            ,
        std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(),
                  std::placeholders::_1)
#endif
            ));

    // Create the power cap monitor object
    if (!pcap)
    {
        pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
            *statusObjects.back());
    }

    if (statusObjects.back()->isMasterOcc())
    {
        log<level::INFO>(
            std::format("Manager::createObjects(): OCC{} is the master",
                        statusObjects.back()->getOccInstanceID())
                .c_str());
        _pollTimer->setEnabled(false);

#ifdef POWER10
        // Set the master OCC on the PowerMode object
        pmode->setMasterOcc(path);
#endif
    }

    // Pass-through object for the same OCC path
    passThroughObjects.emplace_back(std::make_unique<PassThrough>(path.c_str()
#ifdef POWER10
                                                                      ,
                                                                  pmode
#endif
                                                                  ));
}
359 
// Callback invoked when the active state of an OCC changes.
//
// Keeps activeCount up to date, starts/stops the poll timer (and, on
// POWER10, the "wait for all OCCs" timer), validates the master OCC once
// every OCC is active and, while the active sensors are still being waited
// for on POWER10, re-checks them when the host is running.
//
// @param instance  OCC instance whose state changed
// @param status    true when the OCC went active, false when it went away
void Manager::statusCallBack(instanceID instance, bool status)
{
    if (!status)
    {
        // OCC went away
        if (activeCount > 0)
        {
            --activeCount;
        }
        else
        {
            log<level::ERR>(
                std::format("OCC{} disabled, but currently no active OCCs",
                            instance)
                    .c_str());
        }

        if (activeCount == 0)
        {
            // Nothing left to poll: stop the OCC poll timer
            if (_pollTimer->isEnabled())
            {
                log<level::INFO>(
                    "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
                _pollTimer->setEnabled(false);
            }

#ifdef POWER10
            // No point in waiting for the remaining OCCs either
            if (waitForAllOccsTimer->isEnabled())
            {
                waitForAllOccsTimer->setEnabled(false);
            }
#endif
        }
#ifdef READ_OCC_SENSORS
        // Invalidate the sensors of the OCC that went away
        setSensorValueToNaN(instance);
#endif
    }
    else
    {
        // OCC went active
        ++activeCount;

#ifdef POWER10
        if (activeCount == 1)
        {
            // First active OCC: give the others some time to follow
            waitForAllOccsTimer->restartOnce(60s);
        }
#endif

        if (activeCount == statusObjects.size())
        {
#ifdef POWER10
            // Every OCC is running now, the wait timer is no longer needed
            if (waitForAllOccsTimer->isEnabled())
            {
                waitForAllOccsTimer->setEnabled(false);
            }
#endif

            // Verify master OCC and start presence monitor
            validateOccMaster();
        }

        if (!_pollTimer->isEnabled())
        {
            // Polling has not been started yet
            log<level::INFO>(
                std::format("Manager: OCCs will be polled every {} seconds",
                            pollInterval)
                    .c_str());

            // Send poll and start OCC poll timer
            pollerTimerExpired();
        }
    }

#ifdef POWER10
    if (waitingForAllOccActiveSensors && utils::isHostRunning())
    {
        checkAllActiveSensors();
    }
#endif
}
453 
454 #ifdef I2C_OCC
455 void Manager::initStatusObjects()
456 {
457     // Make sure we have a valid path string
458     static_assert(sizeof(DEV_PATH) != 0);
459 
460     auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
461     for (auto& name : deviceNames)
462     {
463         i2c_occ::i2cToDbus(name);
464         name = std::string(OCC_NAME) + '_' + name;
465         auto path = fs::path(OCC_CONTROL_ROOT) / name;
466         statusObjects.emplace_back(
467             std::make_unique<Status>(event, path.c_str(), *this));
468     }
469     // The first device is master occ
470     pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
471         *statusObjects.front());
472 #ifdef POWER10
473     pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
474                                                    powermode::PIPS_PATH);
475     // Set the master OCC on the PowerMode object
476     pmode->setMasterOcc(path);
477 #endif
478 }
479 #endif
480 
481 #ifdef PLDM
// Handle an SBE timeout for the given OCC instance: if a status object for
// that instance exists and its OCC is active, mark the SBE as not usable
// and request an HRESET through PLDM. Otherwise do nothing.
void Manager::sbeTimeout(unsigned int instance)
{
    const auto isRequestedInstance = [instance](const auto& status) {
        return instance == status->getOccInstanceID();
    };
    auto found = std::find_if(statusObjects.begin(), statusObjects.end(),
                              isRequestedInstance);

    if ((found == statusObjects.end()) || !(*found)->occActive())
    {
        // Unknown or inactive OCC: nothing to reset
        return;
    }

    log<level::INFO>(
        std::format("SBE timeout, requesting HRESET (OCC{})", instance)
            .c_str());

    setSBEState(instance, SBE_STATE_NOT_USABLE);

    pldmHandle->sendHRESET(instance);
}
500 
// Process an OCC active sensor update for one OCC instance.
//
// - No status object for the instance yet: an active state is remembered in
//   queuedActiveState (an inactive state removes a remembered one) and
//   false is returned.
// - Status object exists, host not running and the OCC is reported active:
//   the "sensor received" flag is cleared, the manager goes (back) to
//   waiting for all active sensors, and false is returned.
// - Otherwise the "sensor received" flag is set and the result of updating
//   the status object's active state is returned.
//
// @param instance  OCC instance the update is for
// @param status    reported active state
bool Manager::updateOCCActive(instanceID instance, bool status)
{
    const auto isRequestedInstance = [instance](const auto& statusObj) {
        return instance == statusObj->getOccInstanceID();
    };
    auto found = std::find_if(statusObjects.begin(), statusObjects.end(),
                              isRequestedInstance);

    const bool hostRunning = open_power::occ::utils::isHostRunning();

    if (found == statusObjects.end())
    {
        // Status objects have not been created for this instance
        if (hostRunning)
        {
            log<level::WARNING>(
                std::format(
                    "updateOCCActive: No status object to update for OCC{} (active={})",
                    instance, status)
                    .c_str());
        }
        else if (status)
        {
            log<level::WARNING>(
                std::format(
                    "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})",
                    instance, status)
                    .c_str());
        }

        if (status)
        {
            // OCC went active, remember it for when the object exists
            queuedActiveState.insert(instance);
        }
        else if (auto queued = queuedActiveState.find(instance);
                 queued != queuedActiveState.end())
        {
            // OCC was disabled, forget the remembered active state
            queuedActiveState.erase(queued);
        }
        return false;
    }

    if (hostRunning || !status)
    {
        log<level::INFO>(std::format("updateOCCActive: OCC{} active={}",
                                     instance, status)
                             .c_str());
        (*found)->setPldmSensorReceived(true);
        return (*found)->occActive(status);
    }

    // Active was reported although the host is not running
    log<level::WARNING>(
        std::format(
            "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received",
            instance, status)
            .c_str());
    (*found)->setPldmSensorReceived(false);
    if (!waitingForAllOccActiveSensors)
    {
        log<level::INFO>(
            "updateOCCActive: Waiting for Host and all OCC Active Sensors");
        waitingForAllOccActiveSensors = true;
    }
#ifdef POWER10
    discoverTimer->restartOnce(30s);
#endif
    return false;
}
577 
578 // Called upon pldm event To set powermode Safe Mode State for system.
579 void Manager::updateOccSafeMode(bool safeMode)
580 {
581 #ifdef POWER10
582     pmode->updateDbusSafeMode(safeMode);
583 #endif
584     // Update the processor throttle status on dbus
585     for (auto& obj : statusObjects)
586     {
587         obj->updateThrottle(safeMode, THROTTLED_SAFE);
588     }
589 }
590 
// Handle the result of an HRESET request for the SBE of an OCC instance.
//
// On success the SBE state is set to booted. On failure the SBE state is
// set to failed and, if sbeCanDump() allows it, a PEL is created and an
// SBE dump referencing that PEL is requested over D-Bus. A failing dump
// request is only logged.
//
// @param instance  OCC instance the HRESET was requested for
// @param success   true when the HRESET succeeded
void Manager::sbeHRESETResult(instanceID instance, bool success)
{
    if (success)
    {
        log<level::INFO>(
            std::format("HRESET succeeded (OCC{})", instance).c_str());

        setSBEState(instance, SBE_STATE_BOOTED);

        return;
    }

    setSBEState(instance, SBE_STATE_FAILED);

    if (sbeCanDump(instance))
    {
        log<level::INFO>(
            std::format("HRESET failed (OCC{}), triggering SBE dump", instance)
                .c_str());

        auto& bus = utils::getBus();
        // The instance is placed in the upper 16 bits of the value passed
        // to createPEL
        uint32_t src6 = instance << 16;
        uint32_t logId =
            FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
                            src6, "SBE command timeout");

        try
        {
            constexpr auto path = "/org/openpower/dump";
            constexpr auto interface = "xyz.openbmc_project.Dump.Create";
            constexpr auto function = "CreateDump";

            std::string service = utils::getService(path, interface);
            auto method = bus.new_method_call(service.c_str(), path, interface,
                                              function);

            std::map<std::string, std::variant<std::string, uint64_t>>
                createParams{
                    {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
                     uint64_t(logId)},
                    {"com.ibm.Dump.Create.CreateParameters.DumpType",
                     "com.ibm.Dump.Create.DumpType.SBE"},
                    {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
                     uint64_t(instance)},
                };

            method.append(createParams);

            auto response = bus.call(method);
        }
        catch (const sdbusplus::exception_t& e)
        {
            constexpr auto ERROR_DUMP_DISABLED =
                "xyz.openbmc_project.Dump.Create.Error.Disabled";
            // name() yields a C string, so the error name has to be
            // compared by content; comparing with == on the raw pointers
            // would only compare addresses and never match.
            const char* errorName = e.name();
            if ((errorName != nullptr) &&
                (std::string(errorName) == ERROR_DUMP_DISABLED))
            {
                log<level::INFO>("Dump is disabled, skipping");
            }
            else
            {
                log<level::ERR>("Dump failed");
            }
        }
    }
}
656 
657 bool Manager::sbeCanDump(unsigned int instance)
658 {
659     struct pdbg_target* proc = getPdbgTarget(instance);
660 
661     if (!proc)
662     {
663         // allow the dump in the error case
664         return true;
665     }
666 
667     try
668     {
669         if (!openpower::phal::sbe::isDumpAllowed(proc))
670         {
671             return false;
672         }
673 
674         if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
675         {
676             return false;
677         }
678     }
679     catch (openpower::phal::exception::SbeError& e)
680     {
681         log<level::INFO>("Failed to query SBE state");
682     }
683 
684     // allow the dump in the error case
685     return true;
686 }
687 
688 void Manager::setSBEState(unsigned int instance, enum sbe_state state)
689 {
690     struct pdbg_target* proc = getPdbgTarget(instance);
691 
692     if (!proc)
693     {
694         return;
695     }
696 
697     try
698     {
699         openpower::phal::sbe::setState(proc, state);
700     }
701     catch (const openpower::phal::exception::SbeError& e)
702     {
703         log<level::ERR>("Failed to set SBE state");
704     }
705 }
706 
707 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
708 {
709     if (!pdbgInitialized)
710     {
711         try
712         {
713             openpower::phal::pdbg::init();
714             pdbgInitialized = true;
715         }
716         catch (const openpower::phal::exception::PdbgError& e)
717         {
718             log<level::ERR>("pdbg initialization failed");
719             return nullptr;
720         }
721     }
722 
723     struct pdbg_target* proc = nullptr;
724     pdbg_for_each_class_target("proc", proc)
725     {
726         if (pdbg_target_index(proc) == instance)
727         {
728             return proc;
729         }
730     }
731 
732     log<level::ERR>("Failed to get pdbg target");
733     return nullptr;
734 }
735 #endif
736 
// Poll every OCC once and re-arm the poll timer.
//
// For each active OCC the OCC state is read (and, with READ_OCC_SENSORS,
// its sensor values); with READ_OCC_SENSORS the sensors of inactive OCCs
// are set to NaN. The poll timer is restarted with pollInterval seconds
// only while at least one OCC is active.
void Manager::pollerTimerExpired()
{
    if (!_pollTimer)
    {
        log<level::ERR>(
            "Manager::pollerTimerExpired() ERROR: Timer not defined");
        return;
    }

    for (auto& occStatus : statusObjects)
    {
        if (occStatus->occActive())
        {
            // Read sysfs to force kernel to poll OCC
            occStatus->readOccState();

#ifdef READ_OCC_SENSORS
            // Read occ sensor values
            getSensorValues(occStatus);
#endif
        }
#ifdef READ_OCC_SENSORS
        else
        {
            // OCC is not running yet
            auto occId = occStatus->getOccInstanceID();
            setSensorValueToNaN(occId);
        }
#endif
    }

    if (activeCount > 0)
    {
        // Restart OCC poll timer
        _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
    }
    else
    {
        // No OCCs running, so poll timer will not be restarted
        log<level::INFO>(
            "Manager::pollerTimerExpired: poll timer will not be restarted");
    }
}
781 
782 #ifdef READ_OCC_SENSORS
783 void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
784 {
785     // There may be more than one sensor with the same FRU type
786     // and label so make two passes: the first to read the temps
787     // from sysfs, and the second to put them on D-Bus after
788     // resolving any conflicts.
789     std::map<std::string, double> sensorData;
790 
791     std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
792     for (auto& file : fs::directory_iterator(path))
793     {
794         if (!std::regex_search(file.path().string(), expr))
795         {
796             continue;
797         }
798 
799         uint32_t labelValue{0};
800 
801         try
802         {
803             labelValue = readFile<uint32_t>(file.path());
804         }
805         catch (const std::system_error& e)
806         {
807             log<level::DEBUG>(
808                 std::format("readTempSensors: Failed reading {}, errno = {}",
809                             file.path().string(), e.code().value())
810                     .c_str());
811             continue;
812         }
813 
814         const std::string& tempLabel = "label";
815         const std::string filePathString = file.path().string().substr(
816             0, file.path().string().length() - tempLabel.length());
817 
818         uint32_t fruTypeValue{0};
819         try
820         {
821             fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
822         }
823         catch (const std::system_error& e)
824         {
825             log<level::DEBUG>(
826                 std::format("readTempSensors: Failed reading {}, errno = {}",
827                             filePathString + fruTypeSuffix, e.code().value())
828                     .c_str());
829             continue;
830         }
831 
832         std::string sensorPath = OCC_SENSORS_ROOT +
833                                  std::string("/temperature/");
834 
835         std::string dvfsTempPath;
836 
837         if (fruTypeValue == VRMVdd)
838         {
839             sensorPath.append("vrm_vdd" + std::to_string(occInstance) +
840                               "_temp");
841         }
842         else if (fruTypeValue == processorIoRing)
843         {
844             sensorPath.append("proc" + std::to_string(occInstance) +
845                               "_ioring_temp");
846             dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
847                            std::to_string(occInstance) + "_ioring_dvfs_temp";
848         }
849         else
850         {
851             uint16_t type = (labelValue & 0xFF000000) >> 24;
852             uint16_t instanceID = labelValue & 0x0000FFFF;
853 
854             if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
855             {
856                 if (fruTypeValue == fruTypeNotAvailable)
857                 {
858                     // Not all DIMM related temps are available to read
859                     // (no _input file in this case)
860                     continue;
861                 }
862                 auto iter = dimmTempSensorName.find(fruTypeValue);
863                 if (iter == dimmTempSensorName.end())
864                 {
865                     log<level::ERR>(
866                         std::format(
867                             "readTempSensors: Fru type error! fruTypeValue = {}) ",
868                             fruTypeValue)
869                             .c_str());
870                     continue;
871                 }
872 
873                 sensorPath.append("dimm" + std::to_string(instanceID) +
874                                   iter->second);
875 
876                 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
877                                dimmDVFSSensorName.at(fruTypeValue);
878             }
879             else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
880             {
881                 if (fruTypeValue == processorCore)
882                 {
883                     // The OCC reports small core temps, of which there are
884                     // two per big core.  All current P10 systems are in big
885                     // core mode, so use a big core name.
886                     uint16_t coreNum = instanceID / 2;
887                     uint16_t tempNum = instanceID % 2;
888                     sensorPath.append("proc" + std::to_string(occInstance) +
889                                       "_core" + std::to_string(coreNum) + "_" +
890                                       std::to_string(tempNum) + "_temp");
891 
892                     dvfsTempPath =
893                         std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
894                         std::to_string(occInstance) + "_core_dvfs_temp";
895                 }
896                 else
897                 {
898                     continue;
899                 }
900             }
901             else
902             {
903                 continue;
904             }
905         }
906 
907         // The dvfs temp file only needs to be read once per chip per type.
908         if (!dvfsTempPath.empty() &&
909             !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
910         {
911             try
912             {
913                 auto dvfsValue = readFile<double>(filePathString + maxSuffix);
914 
915                 dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
916                     dvfsTempPath, dvfsValue * std::pow(10, -3));
917             }
918             catch (const std::system_error& e)
919             {
920                 log<level::DEBUG>(
921                     std::format(
922                         "readTempSensors: Failed reading {}, errno = {}",
923                         filePathString + maxSuffix, e.code().value())
924                         .c_str());
925             }
926         }
927 
928         uint32_t faultValue{0};
929         try
930         {
931             faultValue = readFile<uint32_t>(filePathString + faultSuffix);
932         }
933         catch (const std::system_error& e)
934         {
935             log<level::DEBUG>(
936                 std::format("readTempSensors: Failed reading {}, errno = {}",
937                             filePathString + faultSuffix, e.code().value())
938                     .c_str());
939             continue;
940         }
941 
942         double tempValue{0};
943         // NOTE: if OCC sends back 0xFF, kernal sets this fault value to 1.
944         if (faultValue != 0)
945         {
946             tempValue = std::numeric_limits<double>::quiet_NaN();
947         }
948         else
949         {
950             // Read the temperature
951             try
952             {
953                 tempValue = readFile<double>(filePathString + inputSuffix);
954             }
955             catch (const std::system_error& e)
956             {
957                 log<level::DEBUG>(
958                     std::format(
959                         "readTempSensors: Failed reading {}, errno = {}",
960                         filePathString + inputSuffix, e.code().value())
961                         .c_str());
962 
963                 // if errno == EAGAIN(Resource temporarily unavailable) then set
964                 // temp to 0, to avoid using old temp, and affecting FAN
965                 // Control.
966                 if (e.code().value() == EAGAIN)
967                 {
968                     tempValue = 0;
969                 }
970                 // else the errno would be something like
971                 //     EBADF(Bad file descriptor)
972                 // or ENOENT(No such file or directory)
973                 else
974                 {
975                     continue;
976                 }
977             }
978         }
979 
        // If this object path already has a value, only overwrite
        // it if the previous one was a NaN or a smaller value.
982         auto existing = sensorData.find(sensorPath);
983         if (existing != sensorData.end())
984         {
985             // Multiple sensors found for this FRU type
986             if ((std::isnan(existing->second) && (tempValue == 0)) ||
987                 ((existing->second == 0) && std::isnan(tempValue)))
988             {
989                 // One of the redundant sensors has failed (0xFF/nan), and the
990                 // other sensor has no reading (0), so set the FRU to NaN to
991                 // force fan increase
992                 tempValue = std::numeric_limits<double>::quiet_NaN();
993                 existing->second = tempValue;
994             }
995             if (std::isnan(existing->second) || (tempValue > existing->second))
996             {
997                 existing->second = tempValue;
998             }
999         }
1000         else
1001         {
1002             // First sensor for this FRU type
1003             sensorData[sensorPath] = tempValue;
1004         }
1005     }
1006 
1007     // Now publish the values on D-Bus.
1008     for (const auto& [objectPath, value] : sensorData)
1009     {
1010         dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
1011                                                     value * std::pow(10, -3));
1012 
1013         dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1014             objectPath, !std::isnan(value));
1015 
1016         if (existingSensors.find(objectPath) == existingSensors.end())
1017         {
1018             dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1019                 objectPath);
1020         }
1021 
1022         existingSensors[objectPath] = occInstance;
1023     }
1024 }
1025 
1026 std::optional<std::string>
1027     Manager::getPowerLabelFunctionID(const std::string& value)
1028 {
1029     // If the value is "system", then the FunctionID is "system".
1030     if (value == "system")
1031     {
1032         return value;
1033     }
1034 
1035     // If the value is not "system", then the label value have 3 numbers, of
1036     // which we only care about the middle one:
1037     // <sensor id>_<function id>_<apss channel>
1038     // eg: The value is "0_10_5" , then the FunctionID is "10".
1039     if (value.find("_") == std::string::npos)
1040     {
1041         return std::nullopt;
1042     }
1043 
1044     auto powerLabelValue = value.substr((value.find("_") + 1));
1045 
1046     if (powerLabelValue.find("_") == std::string::npos)
1047     {
1048         return std::nullopt;
1049     }
1050 
1051     return powerLabelValue.substr(0, powerLabelValue.find("_"));
1052 }
1053 
// Publish on D-Bus the power sensors found in a sensor directory.
//
// path - directory scanned for power<N>_label files
// id   - OCC instance recorded in existingSensors for every published sensor
//
// A sensor is skipped when its label or input file cannot be read, when the
// label has no function ID, or when the function ID has no entry in
// powerSensorName.
void Manager::readPowerSensors(const fs::path& path, uint32_t id)
{
    const std::regex labelPattern{"power\\d+_label$"}; // Example: power5_label
    const std::string labelTail{"label"};

    for (auto& entry : fs::directory_iterator(path))
    {
        const std::string labelFile = entry.path().string();
        if (!std::regex_search(labelFile, labelPattern))
        {
            continue;
        }

        std::string labelValue;
        try
        {
            labelValue = readFile<std::string>(labelFile);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            labelFile, e.code().value())
                    .c_str());
            continue;
        }

        const auto functionID = getPowerLabelFunctionID(labelValue);
        if (!functionID)
        {
            continue;
        }

        // Only function IDs with a known sensor name are published
        const auto nameEntry = powerSensorName.find(*functionID);
        if (nameEntry == powerSensorName.end())
        {
            continue;
        }

        std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
        sensorPath.append(nameEntry->second);

        // Drop the trailing "label" to get the prefix shared with the
        // matching input file (e.g. ".../power5_")
        const std::string inputFile =
            labelFile.substr(0, labelFile.length() - labelTail.length()) +
            inputSuffix;

        double rawValue{0};
        try
        {
            rawValue = readFile<double>(inputFile);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            inputFile, e.code().value())
                    .c_str());
            continue;
        }

        dbus::OccDBusSensors::getOccDBus().setUnit(
            sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");

        // The raw reading is scaled down by 10^-3 twice before publishing
        dbus::OccDBusSensors::getOccDBus().setValue(
            sensorPath, rawValue * std::pow(10, -3) * std::pow(10, -3));

        dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
                                                                true);

        // Add the chassis association only the first time a sensor is seen
        if (existingSensors.find(sensorPath) == existingSensors.end())
        {
            dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
                sensorPath);
        }

        existingSensors[sensorPath] = id;
    }
}
1131 
1132 void Manager::setSensorValueToNaN(uint32_t id) const
1133 {
1134     for (const auto& [sensorPath, occId] : existingSensors)
1135     {
1136         if (occId == id)
1137         {
1138             dbus::OccDBusSensors::getOccDBus().setValue(
1139                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1140 
1141             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
1142                                                                     true);
1143         }
1144     }
1145     return;
1146 }
1147 
1148 void Manager::setSensorValueToNonFunctional(uint32_t id) const
1149 {
1150     for (const auto& [sensorPath, occId] : existingSensors)
1151     {
1152         if (occId == id)
1153         {
1154             dbus::OccDBusSensors::getOccDBus().setValue(
1155                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1156 
1157             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
1158                                                                     false);
1159         }
1160     }
1161     return;
1162 }
1163 
// Read the sensors of one OCC and publish them on D-Bus.
//
// occ - status object of the OCC to read; supplies the hwmon path, the
//       instance ID and whether the OCC is the master
//
// Temperature sensors are read for every OCC, power sensors only for the
// master OCC. When the hwmon path does not exist an error is traced once per
// OCC instance; the trace is re-armed as soon as the path exists again.
void Manager::getSensorValues(std::unique_ptr<Status>& occ)
{
    // One "missing path already traced" flag per OCC instance, kept across
    // calls
    constexpr uint32_t maxTracedOccs = 8;
    static bool tracedError[maxTracedOccs] = {false};

    const fs::path sensorPath = occ->getHwmonPath();
    const uint32_t id = occ->getOccInstanceID();

    // Instance IDs outside the flag table must never be used as an index;
    // for those the missing-path trace is simply not throttled.
    const bool canThrottle = (id < maxTracedOccs);

    if (fs::exists(sensorPath))
    {
        // Read temperature sensors
        readTempSensors(sensorPath, id);

        if (occ->isMasterOcc())
        {
            // Read power sensors
            readPowerSensors(sensorPath, id);
        }

        if (canThrottle)
        {
            tracedError[id] = false;
        }
        return;
    }

    if (!canThrottle || !tracedError[id])
    {
        log<level::ERR>(
            std::format(
                "Manager::getSensorValues: OCC{} sensor path missing: {}", id,
                sensorPath.c_str())
                .c_str());
        if (canThrottle)
        {
            tracedError[id] = true;
        }
    }
}
1197 #endif
1198 
// Read the altitude from DBus
//
// A reading below 0xFFFF is stored in 'altitude', rounded to the nearest
// meter, with negative readings stored as 0. Any other reading leaves
// 'altitude' unchanged. When the property cannot be read, 'altitude' is set
// to 0xFFFF (not available).
// Invalid readings and read failures are traced only once until a valid
// reading is seen again.
void Manager::readAltitude()
{
    static bool traceAltitudeErr = true;

    utils::PropertyValue altitudeProperty{};
    try
    {
        altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE,
                                              ALTITUDE_PROP);
        const auto sensorVal = std::get<double>(altitudeProperty);

        // Written as a negated '<' so that a NaN reading is rejected as well
        if (!(sensorVal < 0xFFFF))
        {
            if (traceAltitudeErr)
            {
                traceAltitudeErr = false;
                log<level::DEBUG>(
                    std::format("Invalid altitude value: {}", sensorVal)
                        .c_str());
            }
            return;
        }

        // Negative readings become 0, everything else is rounded to the
        // nearest meter
        altitude =
            static_cast<uint16_t>((sensorVal < 0) ? 0.0 : sensorVal + 0.5);

        log<level::DEBUG>(
            std::format("readAltitude: sensor={} ({}m)", sensorVal, altitude)
                .c_str());

        // Valid reading: allow the next failure to be traced again
        traceAltitudeErr = true;
    }
    catch (const sdbusplus::exception_t& e)
    {
        if (traceAltitudeErr)
        {
            traceAltitudeErr = false;
            log<level::INFO>(
                std::format("Unable to read Altitude: {}", e.what()).c_str());
        }
        altitude = 0xFFFF; // not available
    }
}
1248 
// Callback function when ambient temperature changes
//
// msg - message carrying the changed properties
//
// The new temperature is stored in 'ambient' as a whole number of degrees C:
// 0xFF for a NaN reading, 0 for a negative reading, otherwise the reading
// rounded to the nearest degree. Nothing further happens unless that stored
// value actually changes.
void Manager::ambientCallback(sdbusplus::message_t& msg)
{
    std::string msgSensor;
    std::map<std::string, std::variant<double>> msgData;
    msg.read(msgSensor, msgData);

    const auto ambientEntry = msgData.find(AMBIENT_PROP);
    if (ambientEntry == msgData.end())
    {
        log<level::DEBUG>("ambientCallback: Unknown ambient property changed");
        return;
    }

    const double currentTemp = std::get<double>(ambientEntry->second);

    uint8_t truncatedTemp = 0xFF;
    if (!std::isnan(currentTemp))
    {
        // Negative readings become 0, everything else is rounded to the
        // nearest degree C
        truncatedTemp =
            static_cast<uint8_t>((currentTemp < 0) ? 0.0 : currentTemp + 0.5);
    }

    // Only notify the OCCs if the ambient changed
    if (truncatedTemp == ambient)
    {
        return;
    }

    log<level::DEBUG>(
        std::format("ambientCallback: Ambient change from {} to {}C", ambient,
                    currentTemp)
            .c_str());

    ambient = truncatedTemp;
    if (altitude == 0xFFFF)
    {
        // No altitude yet, try reading again
        readAltitude();
    }

    log<level::DEBUG>(
        std::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient,
                    altitude)
            .c_str());
#ifdef POWER10
    // Send ambient and altitude to all OCCs
    for (auto& obj : statusObjects)
    {
        if (obj->occActive())
        {
            obj->sendAmbient(ambient, altitude);
        }
    }
#endif // POWER10
}
1313 
1314 // return the current ambient and altitude readings
1315 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1316                              uint16_t& altitudeValue) const
1317 {
1318     ambientValid = true;
1319     ambientTemp = ambient;
1320     altitudeValue = altitude;
1321 
1322     if (ambient == 0xFF)
1323     {
1324         ambientValid = false;
1325     }
1326 }
1327 
1328 #ifdef POWER10
// Called when waitForAllOccsTimer expires
// After the first OCC goes active, this timer will be started (60 seconds)
//
// Traces a warning when the number of active OCCs differs from the number of
// status objects, then validates the master OCC in either case.
void Manager::occsNotAllRunning()
{
    const auto expectedCount = statusObjects.size();
    if (activeCount != expectedCount)
    {
        // Not all OCCs went active.
        // Procs may be garded, so may be expected
        log<level::WARNING>(
            std::format(
                "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})",
                activeCount, expectedCount)
                .c_str());
    }

    validateOccMaster();
}
1346 
1347 #ifdef PLDM
1348 // Called when throttleTraceTimer expires.
1349 // If this timer expires, that indicates there is still no confirmed OCC status
1350 //   which will trigger pldm traces to be throttled.
1351 void Manager::throttleTraceExpired()
1352 {
1353     // Throttle traces
1354     pldmHandle->setTraceThrottle(true);
1355     // Create PEL
1356     createPldmSensorPEL();
1357 }
1358 
// Create a PEL for MISSING_OCC_SENSORS_PATH through the PEL logging
// interface's CreatePELWithFFDCFiles method.
//
// The PEL is created with level Notice, carries this process' PID as
// additional data and has occ-control journal traces attached as FFDC.
// A failing D-Bus call is traced and otherwise ignored.
void Manager::createPldmSensorPEL()
{
    namespace loggingServer = sdbusplus::xyz::openbmc_project::Logging::server;

    Error::Descriptor descriptor(MISSING_OCC_SENSORS_PATH);
    std::map<std::string, std::string> additionalData{
        {"_PID", std::to_string(getpid())}};

    log<level::INFO>(
        "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs");

    auto& bus = utils::getBus();

    try
    {
        FFDCFiles ffdc;
        // Add occ-control journal traces to PEL FFDC.
        // The returned object is kept in scope until the PEL was requested.
        auto occJournalFile =
            FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40);

        static constexpr auto loggingObjectPath =
            "/xyz/openbmc_project/logging";
        static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL";

        const std::string service = utils::getService(loggingObjectPath,
                                                      opLoggingInterface);
        auto request = bus.new_method_call(service.c_str(), loggingObjectPath,
                                           opLoggingInterface,
                                           "CreatePELWithFFDCFiles");

        // Set level to Notice (Informational).
        auto severity = loggingServer::convertForMessage(
            loggingServer::Entry::Level::Notice);

        request.append(descriptor.path, severity, additionalData, ffdc);
        bus.call(request);
    }
    catch (const sdbusplus::exception_t& e)
    {
        log<level::ERR>(
            std::format("Failed to create MISSING_OCC_SENSORS PEL: {}",
                        e.what())
                .c_str());
    }
}
1406 #endif // PLDM
1407 #endif // POWER10
1408 
// Verify single master OCC and start presence monitor
//
// Walks all status objects and adds the presence watch for each object that
// reports itself as master. More than one master, or no master at all,
// results in a device error (reset request) on the affected status object.
//
// With POWER10, an OCC that is not active yet is first given the chance to
// become active, either from a queued active state or by checking its active
// sensor. If the host is not running, the function returns early without
// validating the master.
void Manager::validateOccMaster()
{
    int masterInstance = -1;
    for (auto& obj : statusObjects)
    {
        auto instance = obj->getOccInstanceID();
#ifdef POWER10
        if (!obj->occActive())
        {
            if (utils::isHostRunning())
            {
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    log<level::INFO>(
                        std::format(
                            "validateOccMaster: OCC{} is ACTIVE (queued)",
                            instance)
                            .c_str());
                    obj->occActive(true);
                }
                else
                {
                    // OCC does not appear to be active yet, check active sensor
#ifdef PLDM
                    pldmHandle->checkActiveSensor(instance);
#endif
                    if (obj->occActive())
                    {
                        log<level::INFO>(
                            std::format(
                                "validateOccMaster: OCC{} is ACTIVE after reading sensor",
                                instance)
                                .c_str());
                    }
                }
            }
            else
            {
                log<level::WARNING>(
                    std::format(
                        "validateOccMaster: HOST is not running (OCC{})",
                        instance)
                        .c_str());
                return;
            }
        }
#endif // POWER10

        if (obj->isMasterOcc())
        {
            obj->addPresenceWatchMaster();

            if (masterInstance == -1)
            {
                // First master found
                masterInstance = instance;
            }
            else
            {
                log<level::ERR>(
                    std::format(
                        "validateOccMaster: Multiple OCC masters! ({} and {})",
                        masterInstance, instance)
                        .c_str());
                // request reset
                obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
            }
        }
    }

    if (masterInstance < 0)
    {
        log<level::ERR>(
            std::format("validateOccMaster: Master OCC not found! (of {} OCCs)",
                        statusObjects.size())
                .c_str());
        // request reset; without any status object there is nothing to
        // report the error on, and front() must not be called on an empty
        // container
        if (!statusObjects.empty())
        {
            statusObjects.front()->deviceError(
                Error::Descriptor(PRESENCE_ERROR_PATH));
        }
    }
    else
    {
        log<level::INFO>(
            std::format("validateOccMaster: OCC{} is master of {} OCCs",
                        masterInstance, activeCount)
                .c_str());
#ifdef POWER10
        pmode->updateDbusSafeMode(false);
#endif
    }
}
1503 
1504 void Manager::updatePcapBounds() const
1505 {
1506     if (pcap)
1507     {
1508         pcap->updatePcapBounds();
1509     }
1510 }
1511 
1512 } // namespace occ
1513 } // namespace open_power
1514