1 #include "config.h"
2 
3 #include "occ_manager.hpp"
4 
5 #include "i2c_occ.hpp"
6 #include "occ_dbus.hpp"
7 #include "utils.hpp"
8 
9 #include <phosphor-logging/elog-errors.hpp>
10 #include <phosphor-logging/log.hpp>
11 #include <xyz/openbmc_project/Common/error.hpp>
12 
13 #include <chrono>
14 #include <cmath>
15 #include <filesystem>
16 #include <fstream>
17 #include <regex>
18 
19 namespace open_power
20 {
21 namespace occ
22 {
23 
// fru_type value for which readTempSensors() skips a DIMM temperature sensor
// (there is no matching *_input file to read in that case).
constexpr uint32_t fruTypeNotAvailable = 0xFF;

// File-name suffixes appended to a sensor's path prefix (the "*_label" path
// with the trailing "label" removed) to locate its companion files.
constexpr auto fruTypeSuffix = "fru_type";
constexpr auto faultSuffix = "fault";
constexpr auto inputSuffix = "input";
constexpr auto maxSuffix = "max";

// While this file exists, the POWER10 build of findAndCreateObjects() does not
// search for OCCs and re-arms the discover timer instead.
const auto HOST_ON_FILE = "/run/openbmc/host@0-on";
31 
32 using namespace phosphor::logging;
33 using namespace std::literals::chrono_literals;
34 
/**
 * @brief Read a single value of type T from the start of a file.
 *
 * The value is extracted with operator>>, so leading whitespace is skipped
 * and extraction stops at the first character that does not belong to the
 * value.
 *
 * @tparam T   - type to extract (must support stream extraction)
 * @param path - file to read
 *
 * @return the extracted value
 *
 * @throws std::system_error carrying the current errno (generic category)
 *         when opening, extracting or closing fails, or when end-of-file is
 *         reached while extracting.
 */
template <typename T>
T readFile(const std::string& path)
{
    T value;
    std::ifstream input;

    // Arm the stream before it is opened so that every failure, including
    // reaching end-of-file, is raised as an exception.
    input.exceptions(std::ifstream::eofbit | std::ifstream::badbit |
                     std::ifstream::failbit);

    try
    {
        input.open(path);
        input >> value;
        input.close();
    }
    catch (const std::exception&)
    {
        // Callers only inspect the error code, so translate any stream
        // failure into the errno value that is current at this point.
        throw std::system_error(errno, std::generic_category());
    }

    return value;
}
57 
58 void Manager::findAndCreateObjects()
59 {
60 #ifndef POWER10
61     for (auto id = 0; id < MAX_CPUS; ++id)
62     {
63         // Create one occ per cpu
64         auto occ = std::string(OCC_NAME) + std::to_string(id);
65         createObjects(occ);
66     }
67 #else
68     if (!pmode)
69     {
70         // Create the power mode object
71         pmode = std::make_unique<powermode::PowerMode>(
72             *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
73     }
74 
75     if (!fs::exists(HOST_ON_FILE))
76     {
77         static bool statusObjCreated = false;
78         if (!statusObjCreated)
79         {
80             // Create the OCCs based on on the /dev/occX devices
81             auto occs = findOCCsInDev();
82 
83             if (occs.empty() || (prevOCCSearch.size() != occs.size()))
84             {
85                 // Something changed or no OCCs yet, try again in 10s.
86                 // Note on the first pass prevOCCSearch will be empty,
87                 // so there will be at least one delay to give things
88                 // a chance to settle.
89                 prevOCCSearch = occs;
90 
91                 log<level::INFO>(
92                     fmt::format(
93                         "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})",
94                         occs.size())
95                         .c_str());
96 
97                 discoverTimer->restartOnce(10s);
98             }
99             else
100             {
101                 // All OCCs appear to be available, create status objects
102 
103                 // createObjects requires OCC0 first.
104                 std::sort(occs.begin(), occs.end());
105 
106                 log<level::INFO>(
107                     fmt::format(
108                         "Manager::findAndCreateObjects(): Creating {} OCC Status Objects",
109                         occs.size())
110                         .c_str());
111                 for (auto id : occs)
112                 {
113                     createObjects(std::string(OCC_NAME) + std::to_string(id));
114                 }
115                 statusObjCreated = true;
116                 waitingForAllOccActiveSensors = true;
117             }
118         }
119 
120         if (statusObjCreated && waitingForAllOccActiveSensors)
121         {
122             static bool tracedHostWait = false;
123             if (utils::isHostRunning())
124             {
125                 if (tracedHostWait)
126                 {
127                     log<level::INFO>(
128                         "Manager::findAndCreateObjects(): Host is running");
129                     tracedHostWait = false;
130                 }
131                 checkAllActiveSensors();
132             }
133             else
134             {
135                 if (!tracedHostWait)
136                 {
137                     log<level::INFO>(
138                         "Manager::findAndCreateObjects(): Waiting for host to start");
139                     tracedHostWait = true;
140                 }
141                 discoverTimer->restartOnce(30s);
142             }
143         }
144     }
145     else
146     {
147         log<level::INFO>(
148             fmt::format(
149                 "Manager::findAndCreateObjects(): Waiting for {} to complete...",
150                 HOST_ON_FILE)
151                 .c_str());
152         discoverTimer->restartOnce(10s);
153     }
154 #endif
155 }
156 
157 #ifdef POWER10
158 // Check if all occActive sensors are available
159 void Manager::checkAllActiveSensors()
160 {
161     static bool allActiveSensorAvailable = false;
162     static bool tracedSensorWait = false;
163     static bool waitingForHost = false;
164 
165     if (open_power::occ::utils::isHostRunning())
166     {
167         if (waitingForHost)
168         {
169             waitingForHost = false;
170             log<level::INFO>("checkAllActiveSensors(): Host is now running");
171         }
172 
173         // Start with the assumption that all are available
174         allActiveSensorAvailable = true;
175         for (auto& obj : statusObjects)
176         {
177             if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
178             {
179                 auto instance = obj->getOccInstanceID();
180                 // Check if sensor was queued while waiting for discovery
181                 auto match = queuedActiveState.find(instance);
182                 if (match != queuedActiveState.end())
183                 {
184                     queuedActiveState.erase(match);
185                     log<level::INFO>(
186                         fmt::format(
187                             "checkAllActiveSensors(): OCC{} is ACTIVE (queued)",
188                             instance)
189                             .c_str());
190                     obj->occActive(true);
191                 }
192                 else
193                 {
194                     allActiveSensorAvailable = false;
195                     if (!tracedSensorWait)
196                     {
197                         log<level::INFO>(
198                             fmt::format(
199                                 "checkAllActiveSensors(): Waiting on OCC{} Active sensor",
200                                 instance)
201                                 .c_str());
202                         tracedSensorWait = true;
203                     }
204                     pldmHandle->checkActiveSensor(obj->getOccInstanceID());
205                     break;
206                 }
207             }
208         }
209     }
210     else
211     {
212         if (!waitingForHost)
213         {
214             waitingForHost = true;
215             log<level::INFO>(
216                 "checkAllActiveSensors(): Waiting for host to start");
217         }
218     }
219 
220     if (allActiveSensorAvailable)
221     {
222         // All sensors were found, disable the discovery timer
223         if (discoverTimer->isEnabled())
224         {
225             discoverTimer->setEnabled(false);
226         }
227 
228         if (waitingForAllOccActiveSensors)
229         {
230             log<level::INFO>(
231                 "checkAllActiveSensors(): OCC Active sensors are available");
232             waitingForAllOccActiveSensors = false;
233         }
234         queuedActiveState.clear();
235         tracedSensorWait = false;
236     }
237     else
238     {
239         // Not all sensors were available, so keep waiting
240         if (!tracedSensorWait)
241         {
242             log<level::INFO>(
243                 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
244             tracedSensorWait = true;
245         }
246         discoverTimer->restartOnce(10s);
247     }
248 }
249 #endif
250 
251 std::vector<int> Manager::findOCCsInDev()
252 {
253     std::vector<int> occs;
254     std::regex expr{R"(occ(\d+)$)"};
255 
256     for (auto& file : fs::directory_iterator("/dev"))
257     {
258         std::smatch match;
259         std::string path{file.path().string()};
260         if (std::regex_search(path, match, expr))
261         {
262             auto num = std::stoi(match[1].str());
263 
264             // /dev numbering starts at 1, ours starts at 0.
265             occs.push_back(num - 1);
266         }
267     }
268 
269     return occs;
270 }
271 
272 int Manager::cpuCreated(sdbusplus::message_t& msg)
273 {
274     namespace fs = std::filesystem;
275 
276     sdbusplus::message::object_path o;
277     msg.read(o);
278     fs::path cpuPath(std::string(std::move(o)));
279 
280     auto name = cpuPath.filename().string();
281     auto index = name.find(CPU_NAME);
282     name.replace(index, std::strlen(CPU_NAME), OCC_NAME);
283 
284     createObjects(name);
285 
286     return 0;
287 }
288 
289 void Manager::createObjects(const std::string& occ)
290 {
291     auto path = fs::path(OCC_CONTROL_ROOT) / occ;
292 
293     statusObjects.emplace_back(std::make_unique<Status>(
294         event, path.c_str(), *this,
295 #ifdef POWER10
296         pmode,
297 #endif
298         std::bind(std::mem_fn(&Manager::statusCallBack), this,
299                   std::placeholders::_1, std::placeholders::_2)
300 #ifdef PLDM
301             ,
302         std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(),
303                   std::placeholders::_1)
304 #endif
305             ));
306 
307     // Create the power cap monitor object
308     if (!pcap)
309     {
310         pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
311             *statusObjects.back());
312     }
313 
314     if (statusObjects.back()->isMasterOcc())
315     {
316         log<level::INFO>(
317             fmt::format("Manager::createObjects(): OCC{} is the master",
318                         statusObjects.back()->getOccInstanceID())
319                 .c_str());
320         _pollTimer->setEnabled(false);
321 
322 #ifdef POWER10
323         // Set the master OCC on the PowerMode object
324         pmode->setMasterOcc(path);
325 #endif
326     }
327 
328     passThroughObjects.emplace_back(std::make_unique<PassThrough>(path.c_str()
329 #ifdef POWER10
330                                                                       ,
331                                                                   pmode
332 #endif
333                                                                   ));
334 }
335 
336 void Manager::statusCallBack(instanceID instance, bool status)
337 {
338     if (status == true)
339     {
340         // OCC went active
341         ++activeCount;
342 
343 #ifdef POWER10
344         if (activeCount == 1)
345         {
346             // First OCC went active (allow some time for all OCCs to go active)
347             waitForAllOccsTimer->restartOnce(60s);
348         }
349 #endif
350 
351         if (activeCount == statusObjects.size())
352         {
353 #ifdef POWER10
354             // All OCCs are now running
355             if (waitForAllOccsTimer->isEnabled())
356             {
357                 // stop occ wait timer
358                 waitForAllOccsTimer->setEnabled(false);
359             }
360 #endif
361 
362             // Verify master OCC and start presence monitor
363             validateOccMaster();
364         }
365 
366         // Start poll timer if not already started
367         if (!_pollTimer->isEnabled())
368         {
369             log<level::INFO>(
370                 fmt::format("Manager: OCCs will be polled every {} seconds",
371                             pollInterval)
372                     .c_str());
373 
374             // Send poll and start OCC poll timer
375             pollerTimerExpired();
376         }
377     }
378     else
379     {
380         // OCC went away
381         if (activeCount > 0)
382         {
383             --activeCount;
384         }
385         else
386         {
387             log<level::ERR>(
388                 fmt::format("OCC{} disabled, but currently no active OCCs",
389                             instance)
390                     .c_str());
391         }
392 
393         if (activeCount == 0)
394         {
395             // No OCCs are running
396 
397             // Stop OCC poll timer
398             if (_pollTimer->isEnabled())
399             {
400                 log<level::INFO>(
401                     "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
402                 _pollTimer->setEnabled(false);
403             }
404 
405 #ifdef POWER10
406             // stop wait timer
407             if (waitForAllOccsTimer->isEnabled())
408             {
409                 waitForAllOccsTimer->setEnabled(false);
410             }
411 #endif
412         }
413 #ifdef READ_OCC_SENSORS
414         // Clear OCC sensors
415         setSensorValueToNaN(instance);
416 #endif
417     }
418 
419 #ifdef POWER10
420     if (waitingForAllOccActiveSensors)
421     {
422         if (utils::isHostRunning())
423         {
424             checkAllActiveSensors();
425         }
426     }
427 #endif
428 }
429 
430 #ifdef I2C_OCC
431 void Manager::initStatusObjects()
432 {
433     // Make sure we have a valid path string
434     static_assert(sizeof(DEV_PATH) != 0);
435 
436     auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
437     for (auto& name : deviceNames)
438     {
439         i2c_occ::i2cToDbus(name);
440         name = std::string(OCC_NAME) + '_' + name;
441         auto path = fs::path(OCC_CONTROL_ROOT) / name;
442         statusObjects.emplace_back(
443             std::make_unique<Status>(event, path.c_str(), *this));
444     }
445     // The first device is master occ
446     pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
447         *statusObjects.front());
448 #ifdef POWER10
449     pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
450                                                    powermode::PIPS_PATH);
451     // Set the master OCC on the PowerMode object
452     pmode->setMasterOcc(path);
453 #endif
454 }
455 #endif
456 
457 #ifdef PLDM
458 void Manager::sbeTimeout(unsigned int instance)
459 {
460     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
461                             [instance](const auto& obj) {
462         return instance == obj->getOccInstanceID();
463     });
464 
465     if (obj != statusObjects.end() && (*obj)->occActive())
466     {
467         log<level::INFO>(
468             fmt::format("SBE timeout, requesting HRESET (OCC{})", instance)
469                 .c_str());
470 
471         setSBEState(instance, SBE_STATE_NOT_USABLE);
472 
473         pldmHandle->sendHRESET(instance);
474     }
475 }
476 
477 bool Manager::updateOCCActive(instanceID instance, bool status)
478 {
479     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
480                             [instance](const auto& obj) {
481         return instance == obj->getOccInstanceID();
482     });
483 
484     const bool hostRunning = open_power::occ::utils::isHostRunning();
485     if (obj != statusObjects.end())
486     {
487         if (!hostRunning && (status == true))
488         {
489             log<level::WARNING>(
490                 fmt::format(
491                     "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received",
492                     instance, status)
493                     .c_str());
494             (*obj)->setPldmSensorReceived(false);
495             if (!waitingForAllOccActiveSensors)
496             {
497                 log<level::INFO>(
498                     "updateOCCActive: Waiting for Host and all OCC Active Sensors");
499                 waitingForAllOccActiveSensors = true;
500             }
501             discoverTimer->restartOnce(30s);
502             return false;
503         }
504         else
505         {
506             log<level::INFO>(fmt::format("updateOCCActive: OCC{} active={}",
507                                          instance, status)
508                                  .c_str());
509             (*obj)->setPldmSensorReceived(true);
510             return (*obj)->occActive(status);
511         }
512     }
513     else
514     {
515         if (hostRunning)
516         {
517             log<level::WARNING>(
518                 fmt::format(
519                     "updateOCCActive: No status object to update for OCC{} (active={})",
520                     instance, status)
521                     .c_str());
522         }
523         else
524         {
525             if (status == true)
526             {
527                 log<level::WARNING>(
528                     fmt::format(
529                         "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})",
530                         instance, status)
531                         .c_str());
532             }
533         }
534         if (status == true)
535         {
536             // OCC went active
537             queuedActiveState.insert(instance);
538         }
539         else
540         {
541             auto match = queuedActiveState.find(instance);
542             if (match != queuedActiveState.end())
543             {
544                 // OCC was disabled
545                 queuedActiveState.erase(match);
546             }
547         }
548         return false;
549     }
550 }
551 
552 // Called upon pldm event To set powermode Safe Mode State for system.
553 void Manager::updateOccSafeMode(bool safeMode)
554 {
555 #ifdef POWER10
556     pmode->updateDbusSafeMode(safeMode);
557 #endif
558 }
559 
560 void Manager::sbeHRESETResult(instanceID instance, bool success)
561 {
562     if (success)
563     {
564         log<level::INFO>(
565             fmt::format("HRESET succeeded (OCC{})", instance).c_str());
566 
567         setSBEState(instance, SBE_STATE_BOOTED);
568 
569         return;
570     }
571 
572     setSBEState(instance, SBE_STATE_FAILED);
573 
574     if (sbeCanDump(instance))
575     {
576         log<level::INFO>(
577             fmt::format("HRESET failed (OCC{}), triggering SBE dump", instance)
578                 .c_str());
579 
580         auto& bus = utils::getBus();
581         uint32_t src6 = instance << 16;
582         uint32_t logId =
583             FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
584                             src6, "SBE command timeout");
585 
586         try
587         {
588             constexpr auto path = "/org/openpower/dump";
589             constexpr auto interface = "xyz.openbmc_project.Dump.Create";
590             constexpr auto function = "CreateDump";
591 
592             std::string service = utils::getService(path, interface);
593             auto method = bus.new_method_call(service.c_str(), path, interface,
594                                               function);
595 
596             std::map<std::string, std::variant<std::string, uint64_t>>
597                 createParams{
598                     {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
599                      uint64_t(logId)},
600                     {"com.ibm.Dump.Create.CreateParameters.DumpType",
601                      "com.ibm.Dump.Create.DumpType.SBE"},
602                     {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
603                      uint64_t(instance)},
604                 };
605 
606             method.append(createParams);
607 
608             auto response = bus.call(method);
609         }
610         catch (const sdbusplus::exception_t& e)
611         {
612             constexpr auto ERROR_DUMP_DISABLED =
613                 "xyz.openbmc_project.Dump.Create.Error.Disabled";
614             if (e.name() == ERROR_DUMP_DISABLED)
615             {
616                 log<level::INFO>("Dump is disabled, skipping");
617             }
618             else
619             {
620                 log<level::ERR>("Dump failed");
621             }
622         }
623     }
624 }
625 
626 bool Manager::sbeCanDump(unsigned int instance)
627 {
628     struct pdbg_target* proc = getPdbgTarget(instance);
629 
630     if (!proc)
631     {
632         // allow the dump in the error case
633         return true;
634     }
635 
636     try
637     {
638         if (!openpower::phal::sbe::isDumpAllowed(proc))
639         {
640             return false;
641         }
642 
643         if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
644         {
645             return false;
646         }
647     }
648     catch (openpower::phal::exception::SbeError& e)
649     {
650         log<level::INFO>("Failed to query SBE state");
651     }
652 
653     // allow the dump in the error case
654     return true;
655 }
656 
657 void Manager::setSBEState(unsigned int instance, enum sbe_state state)
658 {
659     struct pdbg_target* proc = getPdbgTarget(instance);
660 
661     if (!proc)
662     {
663         return;
664     }
665 
666     try
667     {
668         openpower::phal::sbe::setState(proc, state);
669     }
670     catch (const openpower::phal::exception::SbeError& e)
671     {
672         log<level::ERR>("Failed to set SBE state");
673     }
674 }
675 
676 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
677 {
678     if (!pdbgInitialized)
679     {
680         try
681         {
682             openpower::phal::pdbg::init();
683             pdbgInitialized = true;
684         }
685         catch (const openpower::phal::exception::PdbgError& e)
686         {
687             log<level::ERR>("pdbg initialization failed");
688             return nullptr;
689         }
690     }
691 
692     struct pdbg_target* proc = nullptr;
693     pdbg_for_each_class_target("proc", proc)
694     {
695         if (pdbg_target_index(proc) == instance)
696         {
697             return proc;
698         }
699     }
700 
701     log<level::ERR>("Failed to get pdbg target");
702     return nullptr;
703 }
704 #endif
705 
706 void Manager::pollerTimerExpired()
707 {
708     if (!_pollTimer)
709     {
710         log<level::ERR>(
711             "Manager::pollerTimerExpired() ERROR: Timer not defined");
712         return;
713     }
714 
715     for (auto& obj : statusObjects)
716     {
717         if (!obj->occActive())
718         {
719             // OCC is not running yet
720 #ifdef READ_OCC_SENSORS
721             auto id = obj->getOccInstanceID();
722             setSensorValueToNaN(id);
723 #endif
724             continue;
725         }
726 
727         // Read sysfs to force kernel to poll OCC
728         obj->readOccState();
729 
730 #ifdef READ_OCC_SENSORS
731         // Read occ sensor values
732         getSensorValues(obj);
733 #endif
734     }
735 
736     if (activeCount > 0)
737     {
738         // Restart OCC poll timer
739         _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
740     }
741     else
742     {
743         // No OCCs running, so poll timer will not be restarted
744         log<level::INFO>(
745             fmt::format(
746                 "Manager::pollerTimerExpired: poll timer will not be restarted")
747                 .c_str());
748     }
749 }
750 
751 #ifdef READ_OCC_SENSORS
/**
 * @brief Read the temperature sensors found in a directory and publish them.
 *
 * Every file in the directory whose name ends in "temp<N>_label" identifies
 * one sensor. Its companion files (temp<N>_fru_type, temp<N>_fault,
 * temp<N>_input and temp<N>_max) are read to work out the sensor's D-Bus
 * path, its value and its operational status. A sensor is skipped when a
 * required file cannot be read or when its type is not handled here.
 *
 * @param path - directory to scan for temp<N>_label files
 * @param id   - OCC instance number, used in the sensor names and recorded in
 *               existingSensors
 */
void Manager::readTempSensors(const fs::path& path, uint32_t id)
{
    std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
    for (auto& file : fs::directory_iterator(path))
    {
        // Only the label files are used to enumerate the sensors
        if (!std::regex_search(file.path().string(), expr))
        {
            continue;
        }

        uint32_t labelValue{0};

        try
        {
            labelValue = readFile<uint32_t>(file.path());
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                fmt::format("readTempSensors: Failed reading {}, errno = {}",
                            file.path().string(), e.code().value())
                    .c_str());
            continue;
        }

        // Strip the trailing "label" so that the suffix of each companion
        // file can be appended (e.g. ".../temp5_" + "input")
        const std::string& tempLabel = "label";
        const std::string filePathString = file.path().string().substr(
            0, file.path().string().length() - tempLabel.length());

        uint32_t fruTypeValue{0};
        try
        {
            fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                fmt::format("readTempSensors: Failed reading {}, errno = {}",
                            filePathString + fruTypeSuffix, e.code().value())
                    .c_str());
            continue;
        }

        std::string sensorPath = OCC_SENSORS_ROOT +
                                 std::string("/temperature/");

        // Left empty for the sensor types that have no DVFS temperature
        std::string dvfsTempPath;

        // The FRU type alone names the VRM and IO ring sensors; the other
        // sensors also need the type and instance encoded in the label.
        if (fruTypeValue == VRMVdd)
        {
            sensorPath.append("vrm_vdd" + std::to_string(id) + "_temp");
        }
        else if (fruTypeValue == processorIoRing)
        {
            sensorPath.append("proc" + std::to_string(id) + "_ioring_temp");
            dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
                           std::to_string(id) + "_ioring_dvfs_temp";
        }
        else
        {
            // Label layout: bits 31-24 hold the sensor type, bits 15-0 the
            // instance number
            uint16_t type = (labelValue & 0xFF000000) >> 24;
            uint16_t instanceID = labelValue & 0x0000FFFF;

            if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
            {
                if (fruTypeValue == fruTypeNotAvailable)
                {
                    // Not all DIMM related temps are available to read
                    // (no _input file in this case)
                    continue;
                }
                // The FRU type selects the name suffix of the DIMM sensor
                auto iter = dimmTempSensorName.find(fruTypeValue);
                if (iter == dimmTempSensorName.end())
                {
                    log<level::ERR>(
                        fmt::format(
                            "readTempSensors: Fru type error! fruTypeValue = {}) ",
                            fruTypeValue)
                            .c_str());
                    continue;
                }

                sensorPath.append("dimm" + std::to_string(instanceID) +
                                  iter->second);
            }
            else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
            {
                if (fruTypeValue == processorCore)
                {
                    // The OCC reports small core temps, of which there are
                    // two per big core.  All current P10 systems are in big
                    // core mode, so use a big core name.
                    uint16_t coreNum = instanceID / 2;
                    uint16_t tempNum = instanceID % 2;
                    sensorPath.append("proc" + std::to_string(id) + "_core" +
                                      std::to_string(coreNum) + "_" +
                                      std::to_string(tempNum) + "_temp");

                    dvfsTempPath = std::string{OCC_SENSORS_ROOT} +
                                   "/temperature/proc" + std::to_string(id) +
                                   "_core_dvfs_temp";
                }
                else
                {
                    // Other CPU FRU types are not published
                    continue;
                }
            }
            else
            {
                // Sensor type not handled here
                continue;
            }
        }

        // The dvfs temp file only needs to be read once per chip per type.
        if (!dvfsTempPath.empty() &&
            !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
        {
            try
            {
                // The DVFS temperature comes from this sensor's "max" file
                auto dvfsValue = readFile<double>(filePathString + maxSuffix);

                // Scaled by 10^-3 (presumably millidegrees to degrees -
                // TODO confirm the unit of the file)
                dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
                    dvfsTempPath, dvfsValue * std::pow(10, -3));
            }
            catch (const std::system_error& e)
            {
                // A missing DVFS value does not stop the sensor itself from
                // being processed
                log<level::DEBUG>(
                    fmt::format(
                        "readTempSensors: Failed reading {}, errno = {}",
                        filePathString + maxSuffix, e.code().value())
                        .c_str());
            }
        }

        uint32_t faultValue{0};
        try
        {
            faultValue = readFile<uint32_t>(filePathString + faultSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                fmt::format("readTempSensors: Failed reading {}, errno = {}",
                            filePathString + faultSuffix, e.code().value())
                    .c_str());
            continue;
        }

        // NOTE: if OCC sends back 0xFF kernel sets this fault value to 1.
        if (faultValue != 0)
        {
            // Faulted sensor: publish NaN and mark it as not functional
            dbus::OccDBusSensors::getOccDBus().setValue(
                sensorPath, std::numeric_limits<double>::quiet_NaN());

            dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
                                                                    false);

            continue;
        }

        double tempValue{0};

        try
        {
            tempValue = readFile<double>(filePathString + inputSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                fmt::format("readTempSensors: Failed reading {}, errno = {}",
                            filePathString + inputSuffix, e.code().value())
                    .c_str());

            // if errno == EAGAIN(Resource temporarily unavailable) then set
            // temp to 0, to avoid using old temp, and affecting FAN Control.
            if (e.code().value() == EAGAIN)
            {
                tempValue = 0;
            }
            // else the errno would be something like
            //     EBADF(Bad file descriptor)
            // or ENOENT(No such file or directory)
            else
            {
                continue;
            }
        }

        // Published value is the raw reading scaled by 10^-3
        dbus::OccDBusSensors::getOccDBus().setValue(
            sensorPath, tempValue * std::pow(10, -3));

        dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
                                                                true);

        // At this point, the sensor will be created for sure.
        // The chassis association is only set the first time the sensor is
        // seen.
        if (existingSensors.find(sensorPath) == existingSensors.end())
        {
            dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
                sensorPath);
        }

        // Record which OCC instance the sensor belongs to
        existingSensors[sensorPath] = id;
    }
    return;
}
957 
958 std::optional<std::string>
959     Manager::getPowerLabelFunctionID(const std::string& value)
960 {
961     // If the value is "system", then the FunctionID is "system".
962     if (value == "system")
963     {
964         return value;
965     }
966 
967     // If the value is not "system", then the label value have 3 numbers, of
968     // which we only care about the middle one:
969     // <sensor id>_<function id>_<apss channel>
970     // eg: The value is "0_10_5" , then the FunctionID is "10".
971     if (value.find("_") == std::string::npos)
972     {
973         return std::nullopt;
974     }
975 
976     auto powerLabelValue = value.substr((value.find("_") + 1));
977 
978     if (powerLabelValue.find("_") == std::string::npos)
979     {
980         return std::nullopt;
981     }
982 
983     return powerLabelValue.substr(0, powerLabelValue.find("_"));
984 }
985 
986 void Manager::readPowerSensors(const fs::path& path, uint32_t id)
987 {
988     std::regex expr{"power\\d+_label$"}; // Example: power5_label
989     for (auto& file : fs::directory_iterator(path))
990     {
991         if (!std::regex_search(file.path().string(), expr))
992         {
993             continue;
994         }
995 
996         std::string labelValue;
997         try
998         {
999             labelValue = readFile<std::string>(file.path());
1000         }
1001         catch (const std::system_error& e)
1002         {
1003             log<level::DEBUG>(
1004                 fmt::format("readPowerSensors: Failed reading {}, errno = {}",
1005                             file.path().string(), e.code().value())
1006                     .c_str());
1007             continue;
1008         }
1009 
1010         auto functionID = getPowerLabelFunctionID(labelValue);
1011         if (functionID == std::nullopt)
1012         {
1013             continue;
1014         }
1015 
1016         const std::string& tempLabel = "label";
1017         const std::string filePathString = file.path().string().substr(
1018             0, file.path().string().length() - tempLabel.length());
1019 
1020         std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
1021 
1022         auto iter = powerSensorName.find(*functionID);
1023         if (iter == powerSensorName.end())
1024         {
1025             continue;
1026         }
1027         sensorPath.append(iter->second);
1028 
1029         double tempValue{0};
1030 
1031         try
1032         {
1033             tempValue = readFile<double>(filePathString + inputSuffix);
1034         }
1035         catch (const std::system_error& e)
1036         {
1037             log<level::DEBUG>(
1038                 fmt::format("readPowerSensors: Failed reading {}, errno = {}",
1039                             filePathString + inputSuffix, e.code().value())
1040                     .c_str());
1041             continue;
1042         }
1043 
1044         dbus::OccDBusSensors::getOccDBus().setUnit(
1045             sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");
1046 
1047         dbus::OccDBusSensors::getOccDBus().setValue(
1048             sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3));
1049 
1050         dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
1051                                                                 true);
1052 
1053         if (existingSensors.find(sensorPath) == existingSensors.end())
1054         {
1055             dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1056                 sensorPath);
1057         }
1058 
1059         existingSensors[sensorPath] = id;
1060     }
1061     return;
1062 }
1063 
1064 void Manager::setSensorValueToNaN(uint32_t id) const
1065 {
1066     for (const auto& [sensorPath, occId] : existingSensors)
1067     {
1068         if (occId == id)
1069         {
1070             dbus::OccDBusSensors::getOccDBus().setValue(
1071                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1072 
1073             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
1074                                                                     true);
1075         }
1076     }
1077     return;
1078 }
1079 
1080 void Manager::setSensorValueToNonFunctional(uint32_t id) const
1081 {
1082     for (const auto& [sensorPath, occId] : existingSensors)
1083     {
1084         if (occId == id)
1085         {
1086             dbus::OccDBusSensors::getOccDBus().setValue(
1087                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1088 
1089             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
1090                                                                     false);
1091         }
1092     }
1093     return;
1094 }
1095 
1096 void Manager::getSensorValues(std::unique_ptr<Status>& occ)
1097 {
1098     static bool tracedError[8] = {0};
1099     const fs::path sensorPath = occ->getHwmonPath();
1100     const uint32_t id = occ->getOccInstanceID();
1101 
1102     if (fs::exists(sensorPath))
1103     {
1104         // Read temperature sensors
1105         readTempSensors(sensorPath, id);
1106 
1107         if (occ->isMasterOcc())
1108         {
1109             // Read power sensors
1110             readPowerSensors(sensorPath, id);
1111         }
1112         tracedError[id] = false;
1113     }
1114     else
1115     {
1116         if (!tracedError[id])
1117         {
1118             log<level::ERR>(
1119                 fmt::format(
1120                     "Manager::getSensorValues: OCC{} sensor path missing: {}",
1121                     id, sensorPath.c_str())
1122                     .c_str());
1123             tracedError[id] = true;
1124         }
1125     }
1126 
1127     return;
1128 }
1129 #endif
1130 
1131 // Read the altitude from DBus
1132 void Manager::readAltitude()
1133 {
1134     static bool traceAltitudeErr = true;
1135 
1136     utils::PropertyValue altitudeProperty{};
1137     try
1138     {
1139         altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE,
1140                                               ALTITUDE_PROP);
1141         auto sensorVal = std::get<double>(altitudeProperty);
1142         if (sensorVal < 0xFFFF)
1143         {
1144             if (sensorVal < 0)
1145             {
1146                 altitude = 0;
1147             }
1148             else
1149             {
1150                 // Round to nearest meter
1151                 altitude = uint16_t(sensorVal + 0.5);
1152             }
1153             log<level::DEBUG>(fmt::format("readAltitude: sensor={} ({}m)",
1154                                           sensorVal, altitude)
1155                                   .c_str());
1156             traceAltitudeErr = true;
1157         }
1158         else
1159         {
1160             if (traceAltitudeErr)
1161             {
1162                 traceAltitudeErr = false;
1163                 log<level::DEBUG>(
1164                     fmt::format("Invalid altitude value: {}", sensorVal)
1165                         .c_str());
1166             }
1167         }
1168     }
1169     catch (const sdbusplus::exception_t& e)
1170     {
1171         if (traceAltitudeErr)
1172         {
1173             traceAltitudeErr = false;
1174             log<level::INFO>(
1175                 fmt::format("Unable to read Altitude: {}", e.what()).c_str());
1176         }
1177         altitude = 0xFFFF; // not available
1178     }
1179 }
1180 
1181 // Callback function when ambient temperature changes
1182 void Manager::ambientCallback(sdbusplus::message_t& msg)
1183 {
1184     double currentTemp = 0;
1185     uint8_t truncatedTemp = 0xFF;
1186     std::string msgSensor;
1187     std::map<std::string, std::variant<double>> msgData;
1188     msg.read(msgSensor, msgData);
1189 
1190     auto valPropMap = msgData.find(AMBIENT_PROP);
1191     if (valPropMap == msgData.end())
1192     {
1193         log<level::DEBUG>("ambientCallback: Unknown ambient property changed");
1194         return;
1195     }
1196     currentTemp = std::get<double>(valPropMap->second);
1197     if (std::isnan(currentTemp))
1198     {
1199         truncatedTemp = 0xFF;
1200     }
1201     else
1202     {
1203         if (currentTemp < 0)
1204         {
1205             truncatedTemp = 0;
1206         }
1207         else
1208         {
1209             // Round to nearest degree C
1210             truncatedTemp = uint8_t(currentTemp + 0.5);
1211         }
1212     }
1213 
1214     // If ambient changes, notify OCCs
1215     if (truncatedTemp != ambient)
1216     {
1217         log<level::DEBUG>(
1218             fmt::format("ambientCallback: Ambient change from {} to {}C",
1219                         ambient, currentTemp)
1220                 .c_str());
1221 
1222         ambient = truncatedTemp;
1223         if (altitude == 0xFFFF)
1224         {
1225             // No altitude yet, try reading again
1226             readAltitude();
1227         }
1228 
1229         log<level::DEBUG>(
1230             fmt::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient,
1231                         altitude)
1232                 .c_str());
1233 #ifdef POWER10
1234         // Send ambient and altitude to all OCCs
1235         for (auto& obj : statusObjects)
1236         {
1237             if (obj->occActive())
1238             {
1239                 obj->sendAmbient(ambient, altitude);
1240             }
1241         }
1242 #endif // POWER10
1243     }
1244 }
1245 
1246 // return the current ambient and altitude readings
1247 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1248                              uint16_t& altitudeValue) const
1249 {
1250     ambientValid = true;
1251     ambientTemp = ambient;
1252     altitudeValue = altitude;
1253 
1254     if (ambient == 0xFF)
1255     {
1256         ambientValid = false;
1257     }
1258 }
1259 
1260 #ifdef POWER10
1261 // Called when waitForAllOccsTimer expires
1262 // After the first OCC goes active, this timer will be started (60 seconds)
1263 void Manager::occsNotAllRunning()
1264 {
1265     if (activeCount != statusObjects.size())
1266     {
1267         // Not all OCCs went active
1268         log<level::WARNING>(
1269             fmt::format(
1270                 "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})",
1271                 activeCount, statusObjects.size())
1272                 .c_str());
1273         // Procs may be garded, so may be expected
1274     }
1275 
1276     validateOccMaster();
1277 }
1278 #endif // POWER10
1279 
1280 // Verify single master OCC and start presence monitor
1281 void Manager::validateOccMaster()
1282 {
1283     int masterInstance = -1;
1284     for (auto& obj : statusObjects)
1285     {
1286         auto instance = obj->getOccInstanceID();
1287 #ifdef POWER10
1288         if (!obj->occActive())
1289         {
1290             if (utils::isHostRunning())
1291             {
1292                 // Check if sensor was queued while waiting for discovery
1293                 auto match = queuedActiveState.find(instance);
1294                 if (match != queuedActiveState.end())
1295                 {
1296                     queuedActiveState.erase(match);
1297                     log<level::INFO>(
1298                         fmt::format(
1299                             "validateOccMaster: OCC{} is ACTIVE (queued)",
1300                             instance)
1301                             .c_str());
1302                     obj->occActive(true);
1303                 }
1304                 else
1305                 {
1306                     // OCC does not appear to be active yet, check active sensor
1307                     pldmHandle->checkActiveSensor(instance);
1308                     if (obj->occActive())
1309                     {
1310                         log<level::INFO>(
1311                             fmt::format(
1312                                 "validateOccMaster: OCC{} is ACTIVE after reading sensor",
1313                                 instance)
1314                                 .c_str());
1315                     }
1316                 }
1317             }
1318             else
1319             {
1320                 log<level::WARNING>(
1321                     fmt::format(
1322                         "validateOccMaster: HOST is not running (OCC{})",
1323                         instance)
1324                         .c_str());
1325                 return;
1326             }
1327         }
1328 #endif // POWER10
1329 
1330         if (obj->isMasterOcc())
1331         {
1332             obj->addPresenceWatchMaster();
1333 
1334             if (masterInstance == -1)
1335             {
1336                 masterInstance = instance;
1337             }
1338             else
1339             {
1340                 log<level::ERR>(
1341                     fmt::format(
1342                         "validateOccMaster: Multiple OCC masters! ({} and {})",
1343                         masterInstance, instance)
1344                         .c_str());
1345                 // request reset
1346                 obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
1347             }
1348         }
1349     }
1350 
1351     if (masterInstance < 0)
1352     {
1353         log<level::ERR>(
1354             fmt::format("validateOccMaster: Master OCC not found! (of {} OCCs)",
1355                         statusObjects.size())
1356                 .c_str());
1357         // request reset
1358         statusObjects.front()->deviceError(
1359             Error::Descriptor(PRESENCE_ERROR_PATH));
1360     }
1361     else
1362     {
1363         log<level::INFO>(
1364             fmt::format("validateOccMaster: OCC{} is master of {} OCCs",
1365                         masterInstance, activeCount)
1366                 .c_str());
1367 #ifdef POWER10
1368         pmode->updateDbusSafeMode(false);
1369 #endif
1370     }
1371 }
1372 
1373 void Manager::updatePcapBounds() const
1374 {
1375     if (pcap)
1376     {
1377         pcap->updatePcapBounds();
1378     }
1379 }
1380 
1381 } // namespace occ
1382 } // namespace open_power
1383