1 #include "config.h"
2 
3 #include "occ_manager.hpp"
4 
5 #include "i2c_occ.hpp"
6 #include "occ_dbus.hpp"
7 #include "utils.hpp"
8 
9 #include <phosphor-logging/elog-errors.hpp>
10 #include <phosphor-logging/log.hpp>
11 #include <xyz/openbmc_project/Common/error.hpp>
12 
13 #include <chrono>
14 #include <cmath>
15 #include <filesystem>
16 #include <fstream>
17 #include <regex>
18 
19 namespace open_power
20 {
21 namespace occ
22 {
23 
// fru_type value that readTempSensors() treats as "no reading available"
// for DIMM related temperature sensors.
constexpr uint32_t fruTypeNotAvailable = 0xFF;

// File name suffixes appended to a "tempN_" style prefix (the label file path
// with the trailing "label" removed) to locate the other files belonging to
// the same temperature sensor.
constexpr auto fruTypeSuffix = "fru_type";
constexpr auto faultSuffix = "fault";
constexpr auto inputSuffix = "input";
constexpr auto maxSuffix = "max";

// While this file exists, findAndCreateObjects() postpones OCC discovery and
// re-arms the discovery timer instead.
const auto HOST_ON_FILE = "/run/openbmc/host@0-on";
31 
32 using namespace phosphor::logging;
33 using namespace std::literals::chrono_literals;
34 
// Read a single value of type T from the start of a file using formatted
// stream extraction (operator>>).
//
// @param[in] path - file to read
// @return the extracted value
// @throws std::system_error (generic category) if the file cannot be opened,
//         read or parsed. The error code is the errno left behind by the
//         failing operation; when the failure did not set errno (for example
//         an empty file or unparsable contents) EIO is reported so that the
//         error code is never 0.
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    // eofbit is intentionally not part of the exception mask: reaching the
    // end of the file right after a successfully extracted value (a file
    // without a trailing newline) is not an error. An empty file still fails
    // because the extraction itself sets failbit.
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
    T data{};

    // Clear any stale errno so that only an error raised by the operations
    // below is reported to the caller.
    errno = 0;

    try
    {
        ifs.open(path);
        ifs >> data;
        ifs.close();
    }
    catch (const std::exception&)
    {
        const auto err = errno;
        if (err == 0)
        {
            // The stream failed without the OS reporting an error
            throw std::system_error(std::make_error_code(std::errc::io_error));
        }
        throw std::system_error(err, std::generic_category());
    }

    return data;
}
57 
// Discover the OCCs and create the objects used to manage them.
//
// Without POWER10: one OCC object set is created per possible CPU
// (0 .. MAX_CPUS-1).
//
// With POWER10: the PowerMode object is created on first use, then the OCC
// devices under /dev are searched. Status objects are only created once two
// consecutive searches report the same, non-zero, number of OCCs. Until
// discovery has completed the function re-arms discoverTimer (presumably the
// timer calls back into this function - confirm where the timer is set up).
void Manager::findAndCreateObjects()
{
#ifndef POWER10
    for (auto id = 0; id < MAX_CPUS; ++id)
    {
        // Create one occ per cpu
        auto occ = std::string(OCC_NAME) + std::to_string(id);
        createObjects(occ);
    }
#else
    if (!pmode)
    {
        // Create the power mode object
        pmode = std::make_unique<powermode::PowerMode>(
            *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
    }

    if (!fs::exists(HOST_ON_FILE))
    {
        // Function-local static: remembers across calls that the status
        // objects have already been created.
        static bool statusObjCreated = false;
        if (!statusObjCreated)
        {
            // Create the OCCs based on the /dev/occX devices
            auto occs = findOCCsInDev();

            if (occs.empty() || (prevOCCSearch.size() != occs.size()))
            {
                // Something changed or no OCCs yet, try again in 10s.
                // Note on the first pass prevOCCSearch will be empty,
                // so there will be at least one delay to give things
                // a chance to settle.
                prevOCCSearch = occs;

                log<level::INFO>(
                    std::format(
                        "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})",
                        occs.size())
                        .c_str());

                discoverTimer->restartOnce(10s);
            }
            else
            {
                // All OCCs appear to be available, create status objects

                // createObjects requires OCC0 first.
                std::sort(occs.begin(), occs.end());

                log<level::INFO>(
                    std::format(
                        "Manager::findAndCreateObjects(): Creating {} OCC Status Objects",
                        occs.size())
                        .c_str());
                for (auto id : occs)
                {
                    createObjects(std::string(OCC_NAME) + std::to_string(id));
                }
                statusObjCreated = true;
                waitingForAllOccActiveSensors = true;

                // Find/update the processor path associated with each OCC
                for (auto& obj : statusObjects)
                {
                    obj->updateProcAssociation();
                }
            }
        }

        if (statusObjCreated && waitingForAllOccActiveSensors)
        {
            // Ensures each of the two host state traces below is only
            // logged once per state change.
            static bool tracedHostWait = false;
            if (utils::isHostRunning())
            {
                if (tracedHostWait)
                {
                    log<level::INFO>(
                        "Manager::findAndCreateObjects(): Host is running");
                    tracedHostWait = false;
                }
                checkAllActiveSensors();
            }
            else
            {
                if (!tracedHostWait)
                {
                    log<level::INFO>(
                        "Manager::findAndCreateObjects(): Waiting for host to start");
                    tracedHostWait = true;
                }
                // Host is not running yet, check again in 30s
                discoverTimer->restartOnce(30s);
            }
        }
    }
    else
    {
        // HOST_ON_FILE is still present, retry in 10s
        log<level::INFO>(
            std::format(
                "Manager::findAndCreateObjects(): Waiting for {} to complete...",
                HOST_ON_FILE)
                .c_str());
        discoverTimer->restartOnce(10s);
    }
#endif
}
162 
163 #ifdef POWER10
// Check if all occActive sensors are available
//
// While the host is running, every status object that is neither active nor
// has received its PLDM sensor is examined: a queued "active" state for that
// instance is applied, otherwise the scan stops at the first OCC that is
// still missing. When nothing is missing the discovery timer is disabled and
// waitingForAllOccActiveSensors is cleared; otherwise the discovery timer is
// re-armed for 10s.
//
// Note: the three flags below are function-local statics, so their values
// persist between calls. In particular, when the host is not running
// allActiveSensorAvailable keeps the value from the previous call.
void Manager::checkAllActiveSensors()
{
    static bool allActiveSensorAvailable = false;
    static bool tracedSensorWait = false;
    static bool waitingForHost = false;

    if (open_power::occ::utils::isHostRunning())
    {
        if (waitingForHost)
        {
            waitingForHost = false;
            log<level::INFO>("checkAllActiveSensors(): Host is now running");
        }

        // Start with the assumption that all are available
        allActiveSensorAvailable = true;
        for (auto& obj : statusObjects)
        {
            if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
            {
                auto instance = obj->getOccInstanceID();
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    log<level::INFO>(
                        std::format(
                            "checkAllActiveSensors(): OCC{} is ACTIVE (queued)",
                            instance)
                            .c_str());
                    obj->occActive(true);
                }
                else
                {
                    allActiveSensorAvailable = false;
                    // Only trace the wait once until all sensors are found
                    if (!tracedSensorWait)
                    {
                        log<level::INFO>(
                            std::format(
                                "checkAllActiveSensors(): Waiting on OCC{} Active sensor",
                                instance)
                                .c_str());
                        tracedSensorWait = true;
                    }
#ifdef PLDM
                    pldmHandle->checkActiveSensor(obj->getOccInstanceID());
#endif
                    // Stop at the first OCC that is still missing; the
                    // remaining OCCs are checked on a later call.
                    break;
                }
            }
        }
    }
    else
    {
        if (!waitingForHost)
        {
            waitingForHost = true;
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for host to start");
        }
    }

    if (allActiveSensorAvailable)
    {
        // All sensors were found, disable the discovery timer
        if (discoverTimer->isEnabled())
        {
            discoverTimer->setEnabled(false);
        }

        if (waitingForAllOccActiveSensors)
        {
            log<level::INFO>(
                "checkAllActiveSensors(): OCC Active sensors are available");
            waitingForAllOccActiveSensors = false;
        }
        // Any states still queued are no longer needed
        queuedActiveState.clear();
        tracedSensorWait = false;
    }
    else
    {
        // Not all sensors were available, so keep waiting
        if (!tracedSensorWait)
        {
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
            tracedSensorWait = true;
        }
        discoverTimer->restartOnce(10s);
    }
}
257 #endif
258 
259 std::vector<int> Manager::findOCCsInDev()
260 {
261     std::vector<int> occs;
262     std::regex expr{R"(occ(\d+)$)"};
263 
264     for (auto& file : fs::directory_iterator("/dev"))
265     {
266         std::smatch match;
267         std::string path{file.path().string()};
268         if (std::regex_search(path, match, expr))
269         {
270             auto num = std::stoi(match[1].str());
271 
272             // /dev numbering starts at 1, ours starts at 0.
273             occs.push_back(num - 1);
274         }
275     }
276 
277     return occs;
278 }
279 
280 int Manager::cpuCreated(sdbusplus::message_t& msg)
281 {
282     namespace fs = std::filesystem;
283 
284     sdbusplus::message::object_path o;
285     msg.read(o);
286     fs::path cpuPath(std::string(std::move(o)));
287 
288     auto name = cpuPath.filename().string();
289     auto index = name.find(CPU_NAME);
290     name.replace(index, std::strlen(CPU_NAME), OCC_NAME);
291 
292     createObjects(name);
293 
294     return 0;
295 }
296 
// Create the Status and PassThrough objects for one OCC.
//
// @param[in] occ - OCC name; it is appended to OCC_CONTROL_ROOT to form the
//                  object path
//
// The power cap monitor is created once, bound to the first status object
// created. When the new status object reports being the master OCC the poll
// timer is disabled and (POWER10 only) the master OCC path is set on the
// PowerMode object.
void Manager::createObjects(const std::string& occ)
{
    auto path = fs::path(OCC_CONTROL_ROOT) / occ;

    // The constructor arguments depend on the build configuration:
    // POWER10 adds the PowerMode object, PLDM adds a callback bound to
    // pldm::Interface::resetOCC.
    statusObjects.emplace_back(std::make_unique<Status>(
        event, path.c_str(), *this,
#ifdef POWER10
        pmode,
#endif
        std::bind(std::mem_fn(&Manager::statusCallBack), this,
                  std::placeholders::_1, std::placeholders::_2)
#ifdef PLDM
            ,
        std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(),
                  std::placeholders::_1)
#endif
            ));

    // Create the power cap monitor object
    if (!pcap)
    {
        pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
            *statusObjects.back());
    }

    if (statusObjects.back()->isMasterOcc())
    {
        log<level::INFO>(
            std::format("Manager::createObjects(): OCC{} is the master",
                        statusObjects.back()->getOccInstanceID())
                .c_str());
        _pollTimer->setEnabled(false);

#ifdef POWER10
        // Set the master OCC on the PowerMode object
        pmode->setMasterOcc(path);
#endif
    }

    // Create the pass-through object for the same path
    passThroughObjects.emplace_back(std::make_unique<PassThrough>(path.c_str()
#ifdef POWER10
                                                                      ,
                                                                  pmode
#endif
                                                                  ));
}
343 
// Callback bound into each Status object (see createObjects()); tracks how
// many OCCs are active and starts/stops the related timers.
//
// @param[in] instance - instance ID of the OCC whose state changed
// @param[in] status   - true if the OCC went active, false if it went away
void Manager::statusCallBack(instanceID instance, bool status)
{
    if (status == true)
    {
        // OCC went active
        ++activeCount;

#ifdef POWER10
        if (activeCount == 1)
        {
            // First OCC went active (allow some time for all OCCs to go active)
            waitForAllOccsTimer->restartOnce(60s);
        }
#endif

        if (activeCount == statusObjects.size())
        {
#ifdef POWER10
            // All OCCs are now running
            if (waitForAllOccsTimer->isEnabled())
            {
                // stop occ wait timer
                waitForAllOccsTimer->setEnabled(false);
            }
#endif

            // Verify master OCC and start presence monitor
            validateOccMaster();
        }

        // Start poll timer if not already started
        if (!_pollTimer->isEnabled())
        {
            log<level::INFO>(
                std::format("Manager: OCCs will be polled every {} seconds",
                            pollInterval)
                    .c_str());

            // Send poll and start OCC poll timer
            pollerTimerExpired();
        }
    }
    else
    {
        // OCC went away
        if (activeCount > 0)
        {
            --activeCount;
        }
        else
        {
            // Never decrement below zero; report the unexpected state instead
            log<level::ERR>(
                std::format("OCC{} disabled, but currently no active OCCs",
                            instance)
                    .c_str());
        }

        if (activeCount == 0)
        {
            // No OCCs are running

            // Stop OCC poll timer
            if (_pollTimer->isEnabled())
            {
                log<level::INFO>(
                    "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
                _pollTimer->setEnabled(false);
            }

#ifdef POWER10
            // stop wait timer
            if (waitForAllOccsTimer->isEnabled())
            {
                waitForAllOccsTimer->setEnabled(false);
            }
#endif
        }
#ifdef READ_OCC_SENSORS
        // Clear OCC sensors
        setSensorValueToNaN(instance);
#endif
    }

#ifdef POWER10
    // Still in discovery: re-check the active sensors now that a state changed
    if (waitingForAllOccActiveSensors)
    {
        if (utils::isHostRunning())
        {
            checkAllActiveSensors();
        }
    }
#endif
}
437 
438 #ifdef I2C_OCC
439 void Manager::initStatusObjects()
440 {
441     // Make sure we have a valid path string
442     static_assert(sizeof(DEV_PATH) != 0);
443 
444     auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
445     for (auto& name : deviceNames)
446     {
447         i2c_occ::i2cToDbus(name);
448         name = std::string(OCC_NAME) + '_' + name;
449         auto path = fs::path(OCC_CONTROL_ROOT) / name;
450         statusObjects.emplace_back(
451             std::make_unique<Status>(event, path.c_str(), *this));
452     }
453     // The first device is master occ
454     pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
455         *statusObjects.front());
456 #ifdef POWER10
457     pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
458                                                    powermode::PIPS_PATH);
459     // Set the master OCC on the PowerMode object
460     pmode->setMasterOcc(path);
461 #endif
462 }
463 #endif
464 
465 #ifdef PLDM
466 void Manager::sbeTimeout(unsigned int instance)
467 {
468     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
469                             [instance](const auto& obj) {
470         return instance == obj->getOccInstanceID();
471     });
472 
473     if (obj != statusObjects.end() && (*obj)->occActive())
474     {
475         log<level::INFO>(
476             std::format("SBE timeout, requesting HRESET (OCC{})", instance)
477                 .c_str());
478 
479         setSBEState(instance, SBE_STATE_NOT_USABLE);
480 
481         pldmHandle->sendHRESET(instance);
482     }
483 }
484 
485 bool Manager::updateOCCActive(instanceID instance, bool status)
486 {
487     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
488                             [instance](const auto& obj) {
489         return instance == obj->getOccInstanceID();
490     });
491 
492     const bool hostRunning = open_power::occ::utils::isHostRunning();
493     if (obj != statusObjects.end())
494     {
495         if (!hostRunning && (status == true))
496         {
497             log<level::WARNING>(
498                 std::format(
499                     "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received",
500                     instance, status)
501                     .c_str());
502             (*obj)->setPldmSensorReceived(false);
503             if (!waitingForAllOccActiveSensors)
504             {
505                 log<level::INFO>(
506                     "updateOCCActive: Waiting for Host and all OCC Active Sensors");
507                 waitingForAllOccActiveSensors = true;
508             }
509             discoverTimer->restartOnce(30s);
510             return false;
511         }
512         else
513         {
514             log<level::INFO>(std::format("updateOCCActive: OCC{} active={}",
515                                          instance, status)
516                                  .c_str());
517             (*obj)->setPldmSensorReceived(true);
518             return (*obj)->occActive(status);
519         }
520     }
521     else
522     {
523         if (hostRunning)
524         {
525             log<level::WARNING>(
526                 std::format(
527                     "updateOCCActive: No status object to update for OCC{} (active={})",
528                     instance, status)
529                     .c_str());
530         }
531         else
532         {
533             if (status == true)
534             {
535                 log<level::WARNING>(
536                     std::format(
537                         "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})",
538                         instance, status)
539                         .c_str());
540             }
541         }
542         if (status == true)
543         {
544             // OCC went active
545             queuedActiveState.insert(instance);
546         }
547         else
548         {
549             auto match = queuedActiveState.find(instance);
550             if (match != queuedActiveState.end())
551             {
552                 // OCC was disabled
553                 queuedActiveState.erase(match);
554             }
555         }
556         return false;
557     }
558 }
559 
560 // Called upon pldm event To set powermode Safe Mode State for system.
561 void Manager::updateOccSafeMode(bool safeMode)
562 {
563 #ifdef POWER10
564     pmode->updateDbusSafeMode(safeMode);
565 #endif
566     // Update the processor throttle status on dbus
567     for (auto& obj : statusObjects)
568     {
569         obj->updateThrottle(safeMode, THROTTLED_SAFE);
570     }
571 }
572 
573 void Manager::sbeHRESETResult(instanceID instance, bool success)
574 {
575     if (success)
576     {
577         log<level::INFO>(
578             std::format("HRESET succeeded (OCC{})", instance).c_str());
579 
580         setSBEState(instance, SBE_STATE_BOOTED);
581 
582         return;
583     }
584 
585     setSBEState(instance, SBE_STATE_FAILED);
586 
587     if (sbeCanDump(instance))
588     {
589         log<level::INFO>(
590             std::format("HRESET failed (OCC{}), triggering SBE dump", instance)
591                 .c_str());
592 
593         auto& bus = utils::getBus();
594         uint32_t src6 = instance << 16;
595         uint32_t logId =
596             FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
597                             src6, "SBE command timeout");
598 
599         try
600         {
601             constexpr auto path = "/org/openpower/dump";
602             constexpr auto interface = "xyz.openbmc_project.Dump.Create";
603             constexpr auto function = "CreateDump";
604 
605             std::string service = utils::getService(path, interface);
606             auto method = bus.new_method_call(service.c_str(), path, interface,
607                                               function);
608 
609             std::map<std::string, std::variant<std::string, uint64_t>>
610                 createParams{
611                     {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
612                      uint64_t(logId)},
613                     {"com.ibm.Dump.Create.CreateParameters.DumpType",
614                      "com.ibm.Dump.Create.DumpType.SBE"},
615                     {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
616                      uint64_t(instance)},
617                 };
618 
619             method.append(createParams);
620 
621             auto response = bus.call(method);
622         }
623         catch (const sdbusplus::exception_t& e)
624         {
625             constexpr auto ERROR_DUMP_DISABLED =
626                 "xyz.openbmc_project.Dump.Create.Error.Disabled";
627             if (e.name() == ERROR_DUMP_DISABLED)
628             {
629                 log<level::INFO>("Dump is disabled, skipping");
630             }
631             else
632             {
633                 log<level::ERR>("Dump failed");
634             }
635         }
636     }
637 }
638 
639 bool Manager::sbeCanDump(unsigned int instance)
640 {
641     struct pdbg_target* proc = getPdbgTarget(instance);
642 
643     if (!proc)
644     {
645         // allow the dump in the error case
646         return true;
647     }
648 
649     try
650     {
651         if (!openpower::phal::sbe::isDumpAllowed(proc))
652         {
653             return false;
654         }
655 
656         if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
657         {
658             return false;
659         }
660     }
661     catch (openpower::phal::exception::SbeError& e)
662     {
663         log<level::INFO>("Failed to query SBE state");
664     }
665 
666     // allow the dump in the error case
667     return true;
668 }
669 
670 void Manager::setSBEState(unsigned int instance, enum sbe_state state)
671 {
672     struct pdbg_target* proc = getPdbgTarget(instance);
673 
674     if (!proc)
675     {
676         return;
677     }
678 
679     try
680     {
681         openpower::phal::sbe::setState(proc, state);
682     }
683     catch (const openpower::phal::exception::SbeError& e)
684     {
685         log<level::ERR>("Failed to set SBE state");
686     }
687 }
688 
689 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
690 {
691     if (!pdbgInitialized)
692     {
693         try
694         {
695             openpower::phal::pdbg::init();
696             pdbgInitialized = true;
697         }
698         catch (const openpower::phal::exception::PdbgError& e)
699         {
700             log<level::ERR>("pdbg initialization failed");
701             return nullptr;
702         }
703     }
704 
705     struct pdbg_target* proc = nullptr;
706     pdbg_for_each_class_target("proc", proc)
707     {
708         if (pdbg_target_index(proc) == instance)
709         {
710             return proc;
711         }
712     }
713 
714     log<level::ERR>("Failed to get pdbg target");
715     return nullptr;
716 }
717 #endif
718 
719 void Manager::pollerTimerExpired()
720 {
721     if (!_pollTimer)
722     {
723         log<level::ERR>(
724             "Manager::pollerTimerExpired() ERROR: Timer not defined");
725         return;
726     }
727 
728     for (auto& obj : statusObjects)
729     {
730         if (!obj->occActive())
731         {
732             // OCC is not running yet
733 #ifdef READ_OCC_SENSORS
734             auto id = obj->getOccInstanceID();
735             setSensorValueToNaN(id);
736 #endif
737             continue;
738         }
739 
740         // Read sysfs to force kernel to poll OCC
741         obj->readOccState();
742 
743 #ifdef READ_OCC_SENSORS
744         // Read occ sensor values
745         getSensorValues(obj);
746 #endif
747     }
748 
749     if (activeCount > 0)
750     {
751         // Restart OCC poll timer
752         _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
753     }
754     else
755     {
756         // No OCCs running, so poll timer will not be restarted
757         log<level::INFO>(
758             std::format(
759                 "Manager::pollerTimerExpired: poll timer will not be restarted")
760                 .c_str());
761     }
762 }
763 
764 #ifdef READ_OCC_SENSORS
// Read the temperature sensors found in a directory and publish them on
// D-Bus.
//
// @param[in] path        - directory containing the tempN_* sensor files
// @param[in] occInstance - OCC instance the sensors belong to; used to build
//                          the sensor names
//
// For every tempN_label file the sibling tempN_fru_type, tempN_fault,
// tempN_input and (once per DVFS path) tempN_max files are read. Sensors
// whose files cannot be read are skipped, except that a read of the input
// file failing with EAGAIN publishes a value of 0. Values are multiplied by
// 10^-3 when published (presumably converting from millidegrees - confirm
// against the kernel driver).
void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
{
    // There may be more than one sensor with the same FRU type
    // and label so make two passes: the first to read the temps
    // from sysfs, and the second to put them on D-Bus after
    // resolving any conflicts.
    std::map<std::string, double> sensorData;

    std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
    for (auto& file : fs::directory_iterator(path))
    {
        if (!std::regex_search(file.path().string(), expr))
        {
            continue;
        }

        uint32_t labelValue{0};

        try
        {
            labelValue = readFile<uint32_t>(file.path());
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readTempSensors: Failed reading {}, errno = {}",
                            file.path().string(), e.code().value())
                    .c_str());
            continue;
        }

        // Strip the trailing "label" to get the common "<dir>/tempN_" prefix
        // shared by all files of this sensor.
        const std::string& tempLabel = "label";
        const std::string filePathString = file.path().string().substr(
            0, file.path().string().length() - tempLabel.length());

        uint32_t fruTypeValue{0};
        try
        {
            fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readTempSensors: Failed reading {}, errno = {}",
                            filePathString + fruTypeSuffix, e.code().value())
                    .c_str());
            continue;
        }

        std::string sensorPath = OCC_SENSORS_ROOT +
                                 std::string("/temperature/");

        // Left empty for sensor types that have no DVFS temperature
        std::string dvfsTempPath;

        if (fruTypeValue == VRMVdd)
        {
            sensorPath.append("vrm_vdd" + std::to_string(occInstance) +
                              "_temp");
        }
        else if (fruTypeValue == processorIoRing)
        {
            sensorPath.append("proc" + std::to_string(occInstance) +
                              "_ioring_temp");
            dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
                           std::to_string(occInstance) + "_ioring_dvfs_temp";
        }
        else
        {
            // The label encodes the sensor type in its most significant byte
            // and the sensor instance in its low 16 bits.
            uint16_t type = (labelValue & 0xFF000000) >> 24;
            uint16_t instanceID = labelValue & 0x0000FFFF;

            if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
            {
                if (fruTypeValue == fruTypeNotAvailable)
                {
                    // Not all DIMM related temps are available to read
                    // (no _input file in this case)
                    continue;
                }
                auto iter = dimmTempSensorName.find(fruTypeValue);
                if (iter == dimmTempSensorName.end())
                {
                    log<level::ERR>(
                        std::format(
                            "readTempSensors: Fru type error! fruTypeValue = {}) ",
                            fruTypeValue)
                            .c_str());
                    continue;
                }

                sensorPath.append("dimm" + std::to_string(instanceID) +
                                  iter->second);

                // NOTE(review): at() throws std::out_of_range if
                // dimmDVFSSensorName has no entry for this FRU type; this
                // relies on it having the same keys as dimmTempSensorName -
                // confirm.
                dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
                               dimmDVFSSensorName.at(fruTypeValue);
            }
            else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
            {
                if (fruTypeValue == processorCore)
                {
                    // The OCC reports small core temps, of which there are
                    // two per big core.  All current P10 systems are in big
                    // core mode, so use a big core name.
                    uint16_t coreNum = instanceID / 2;
                    uint16_t tempNum = instanceID % 2;
                    sensorPath.append("proc" + std::to_string(occInstance) +
                                      "_core" + std::to_string(coreNum) + "_" +
                                      std::to_string(tempNum) + "_temp");

                    dvfsTempPath =
                        std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
                        std::to_string(occInstance) + "_core_dvfs_temp";
                }
                else
                {
                    continue;
                }
            }
            else
            {
                continue;
            }
        }

        // The dvfs temp file only needs to be read once per chip per type.
        if (!dvfsTempPath.empty() &&
            !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
        {
            try
            {
                auto dvfsValue = readFile<double>(filePathString + maxSuffix);

                dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
                    dvfsTempPath, dvfsValue * std::pow(10, -3));
            }
            catch (const std::system_error& e)
            {
                // A missing DVFS value does not prevent the sensor itself
                // from being processed.
                log<level::DEBUG>(
                    std::format(
                        "readTempSensors: Failed reading {}, errno = {}",
                        filePathString + maxSuffix, e.code().value())
                        .c_str());
            }
        }

        uint32_t faultValue{0};
        try
        {
            faultValue = readFile<uint32_t>(filePathString + faultSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readTempSensors: Failed reading {}, errno = {}",
                            filePathString + faultSuffix, e.code().value())
                    .c_str());
            continue;
        }

        double tempValue{0};
        // NOTE: if OCC sends back 0xFF, kernel sets this fault value to 1.
        if (faultValue != 0)
        {
            tempValue = std::numeric_limits<double>::quiet_NaN();
        }
        else
        {
            // Read the temperature
            try
            {
                tempValue = readFile<double>(filePathString + inputSuffix);
            }
            catch (const std::system_error& e)
            {
                log<level::DEBUG>(
                    std::format(
                        "readTempSensors: Failed reading {}, errno = {}",
                        filePathString + inputSuffix, e.code().value())
                        .c_str());

                // if errno == EAGAIN(Resource temporarily unavailable) then set
                // temp to 0, to avoid using old temp, and affecting FAN
                // Control.
                if (e.code().value() == EAGAIN)
                {
                    tempValue = 0;
                }
                // else the errno would be something like
                //     EBADF(Bad file descriptor)
                // or ENOENT(No such file or directory)
                else
                {
                    continue;
                }
            }
        }

        // If this object path already has a value, only overwrite
        // it if the previous one was an NaN or a smaller value.
        auto existing = sensorData.find(sensorPath);
        if (existing != sensorData.end())
        {
            // Multiple sensors found for this FRU type
            if ((std::isnan(existing->second) && (tempValue == 0)) ||
                ((existing->second == 0) && std::isnan(tempValue)))
            {
                // One of the redundant sensors has failed (0xFF/nan), and the
                // other sensor has no reading (0), so set the FRU to NaN to
                // force fan increase
                tempValue = std::numeric_limits<double>::quiet_NaN();
                existing->second = tempValue;
            }
            if (std::isnan(existing->second) || (tempValue > existing->second))
            {
                existing->second = tempValue;
            }
        }
        else
        {
            // First sensor for this FRU type
            sensorData[sensorPath] = tempValue;
        }
    }

    // Now publish the values on D-Bus.
    for (const auto& [objectPath, value] : sensorData)
    {
        dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
                                                    value * std::pow(10, -3));

        // A NaN value marks the sensor as not operational
        dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
            objectPath, !std::isnan(value));

        // The chassis association is only set the first time a sensor is seen
        if (existingSensors.find(objectPath) == existingSensors.end())
        {
            dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
                objectPath);
        }

        existingSensors[objectPath] = occInstance;
    }
}
1007 
1008 std::optional<std::string>
1009     Manager::getPowerLabelFunctionID(const std::string& value)
1010 {
1011     // If the value is "system", then the FunctionID is "system".
1012     if (value == "system")
1013     {
1014         return value;
1015     }
1016 
1017     // If the value is not "system", then the label value have 3 numbers, of
1018     // which we only care about the middle one:
1019     // <sensor id>_<function id>_<apss channel>
1020     // eg: The value is "0_10_5" , then the FunctionID is "10".
1021     if (value.find("_") == std::string::npos)
1022     {
1023         return std::nullopt;
1024     }
1025 
1026     auto powerLabelValue = value.substr((value.find("_") + 1));
1027 
1028     if (powerLabelValue.find("_") == std::string::npos)
1029     {
1030         return std::nullopt;
1031     }
1032 
1033     return powerLabelValue.substr(0, powerLabelValue.find("_"));
1034 }
1035 
// Scan the given directory for powerN_label files and publish the matching
// powerN_input readings on D-Bus.
//
// path - directory that is iterated for powerN_label / powerN_input files
// id   - OCC instance recorded in existingSensors for each published sensor
//
// A label is skipped when it cannot be read, when no function ID can be
// extracted from it, when the function ID has no entry in powerSensorName,
// or when the corresponding input file cannot be read.
void Manager::readPowerSensors(const fs::path& path, uint32_t id)
{
    const std::regex labelPattern{"power\\d+_label$"}; // Example: power5_label
    const std::string labelSuffix{"label"};

    for (const auto& entry : fs::directory_iterator(path))
    {
        const std::string labelFile = entry.path().string();
        if (!std::regex_search(labelFile, labelPattern))
        {
            continue;
        }

        std::string label;
        try
        {
            label = readFile<std::string>(labelFile);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            labelFile, e.code().value())
                    .c_str());
            continue;
        }

        const auto functionID = getPowerLabelFunctionID(label);
        if (!functionID.has_value())
        {
            continue;
        }

        // Only function IDs with a known sensor name are published
        const auto nameIt = powerSensorName.find(*functionID);
        if (nameIt == powerSensorName.end())
        {
            continue;
        }

        std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
        sensorPath.append(nameIt->second);

        // Drop the trailing "label" to get the common "powerN_" prefix, then
        // add the input suffix to name the file holding the reading
        const std::string inputFile =
            labelFile.substr(0, labelFile.length() - labelSuffix.length()) +
            inputSuffix;

        double rawPower{0};
        try
        {
            rawPower = readFile<double>(inputFile);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            inputFile, e.code().value())
                    .c_str());
            continue;
        }

        dbus::OccDBusSensors::getOccDBus().setUnit(
            sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");

        // The raw reading is scaled by 10^-3 twice before being published
        dbus::OccDBusSensors::getOccDBus().setValue(
            sensorPath, rawPower * std::pow(10, -3) * std::pow(10, -3));

        dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
                                                                true);

        // The chassis association is only created the first time a sensor
        // path is seen
        const bool firstTimeSeen =
            (existingSensors.find(sensorPath) == existingSensors.end());
        if (firstTimeSeen)
        {
            dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
                sensorPath);
        }

        existingSensors[sensorPath] = id;
    }
}
1113 
1114 void Manager::setSensorValueToNaN(uint32_t id) const
1115 {
1116     for (const auto& [sensorPath, occId] : existingSensors)
1117     {
1118         if (occId == id)
1119         {
1120             dbus::OccDBusSensors::getOccDBus().setValue(
1121                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1122 
1123             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
1124                                                                     true);
1125         }
1126     }
1127     return;
1128 }
1129 
1130 void Manager::setSensorValueToNonFunctional(uint32_t id) const
1131 {
1132     for (const auto& [sensorPath, occId] : existingSensors)
1133     {
1134         if (occId == id)
1135         {
1136             dbus::OccDBusSensors::getOccDBus().setValue(
1137                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1138 
1139             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
1140                                                                     false);
1141         }
1142     }
1143     return;
1144 }
1145 
// Read the sensors exposed by the given OCC and publish them.
//
// occ - status object of the OCC whose hwmon path should be read
//
// Temperature sensors are read for every OCC; power sensors are only read
// when the OCC reports itself as the master. If the hwmon path does not
// exist an error is logged, but only once per OCC instance until the path
// shows up again.
void Manager::getSensorValues(std::unique_ptr<Status>& occ)
{
    // Per-instance flag remembering that the "path missing" error was
    // already traced, so the log is not flooded on every poll
    constexpr uint32_t maxTracedOccs = 8;
    static bool tracedError[maxTracedOccs] = {false};

    const fs::path sensorPath = occ->getHwmonPath();
    const uint32_t id = occ->getOccInstanceID();

    // Never index tracedError with an instance ID outside of the table;
    // such an OCC is simply not rate limited
    const bool idTracked = (id < maxTracedOccs);

    if (fs::exists(sensorPath))
    {
        // Read temperature sensors
        readTempSensors(sensorPath, id);

        if (occ->isMasterOcc())
        {
            // Read power sensors
            readPowerSensors(sensorPath, id);
        }

        if (idTracked)
        {
            // Path is present again, re-arm the error trace
            tracedError[id] = false;
        }
    }
    else if (!idTracked || !tracedError[id])
    {
        log<level::ERR>(
            std::format(
                "Manager::getSensorValues: OCC{} sensor path missing: {}",
                id, sensorPath.c_str())
                .c_str());

        if (idTracked)
        {
            tracedError[id] = true;
        }
    }
}
1179 #endif
1180 
// Read the altitude property from D-Bus and cache it in 'altitude'.
//
// Readings below 0xFFFF are rounded to the nearest meter, with negative
// readings stored as 0. Any other reading is reported as invalid and leaves
// the cached altitude untouched. If the D-Bus read throws, the altitude is
// set to 0xFFFF (not available). Failures are traced only once until a valid
// reading is seen again.
void Manager::readAltitude()
{
    static bool traceAltitudeErr = true;

    utils::PropertyValue altitudeProperty{};
    try
    {
        altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE,
                                              ALTITUDE_PROP);
        const auto sensorVal = std::get<double>(altitudeProperty);

        if (!(sensorVal < 0xFFFF))
        {
            // Reading can not be represented, trace it a single time
            if (traceAltitudeErr)
            {
                traceAltitudeErr = false;
                log<level::DEBUG>(
                    std::format("Invalid altitude value: {}", sensorVal)
                        .c_str());
            }
            return;
        }

        // Negative readings become 0, everything else is rounded to the
        // nearest meter
        altitude =
            static_cast<uint16_t>((sensorVal < 0) ? 0.0 : (sensorVal + 0.5));

        log<level::DEBUG>(
            std::format("readAltitude: sensor={} ({}m)", sensorVal, altitude)
                .c_str());

        // Good reading, allow the next failure to be traced again
        traceAltitudeErr = true;
    }
    catch (const sdbusplus::exception_t& e)
    {
        if (traceAltitudeErr)
        {
            traceAltitudeErr = false;
            log<level::INFO>(
                std::format("Unable to read Altitude: {}", e.what()).c_str());
        }
        altitude = 0xFFFF; // not available
    }
}
1230 
// Callback function when ambient temperature changes.
//
// msg - D-Bus message that is read as a string followed by a map of
//       property name to double value
//
// The ambient property is rounded to the nearest degree C and stored as a
// single byte, where 0xFF means "not available" (see getAmbientData()).
// A NaN reading, or a reading that would round to 0xFF or more, is stored as
// 0xFF; a negative reading is stored as 0. When the stored value changes,
// the altitude is re-read if it is still unavailable and, on POWER10, the
// new ambient/altitude pair is sent to every active OCC.
void Manager::ambientCallback(sdbusplus::message_t& msg)
{
    std::string msgSensor;
    std::map<std::string, std::variant<double>> msgData;
    msg.read(msgSensor, msgData);

    auto valPropMap = msgData.find(AMBIENT_PROP);
    if (valPropMap == msgData.end())
    {
        log<level::DEBUG>("ambientCallback: Unknown ambient property changed");
        return;
    }

    const double currentTemp = std::get<double>(valPropMap->second);

    // Default to "not available"; only overwritten for representable readings
    uint8_t truncatedTemp = 0xFF;
    if (!std::isnan(currentTemp))
    {
        if (currentTemp < 0)
        {
            truncatedTemp = 0;
        }
        else if ((currentTemp + 0.5) < 0xFF)
        {
            // Round to nearest degree C
            truncatedTemp = static_cast<uint8_t>(currentTemp + 0.5);
        }
        // else: the rounded reading (including +infinity) does not fit below
        // the 0xFF sentinel. Converting it to uint8_t would be undefined
        // behavior, so it is reported as not available instead.
    }

    // If ambient changes, notify OCCs
    if (truncatedTemp != ambient)
    {
        log<level::DEBUG>(
            std::format("ambientCallback: Ambient change from {} to {}C",
                        ambient, currentTemp)
                .c_str());

        ambient = truncatedTemp;
        if (altitude == 0xFFFF)
        {
            // No altitude yet, try reading again
            readAltitude();
        }

        log<level::DEBUG>(
            std::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient,
                        altitude)
                .c_str());
#ifdef POWER10
        // Send ambient and altitude to all OCCs
        for (auto& obj : statusObjects)
        {
            if (obj->occActive())
            {
                obj->sendAmbient(ambient, altitude);
            }
        }
#endif // POWER10
    }
}
1295 
1296 // return the current ambient and altitude readings
1297 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1298                              uint16_t& altitudeValue) const
1299 {
1300     ambientValid = true;
1301     ambientTemp = ambient;
1302     altitudeValue = altitude;
1303 
1304     if (ambient == 0xFF)
1305     {
1306         ambientValid = false;
1307     }
1308 }
1309 
1310 #ifdef POWER10
// Called when waitForAllOccsTimer expires
// After the first OCC goes active, this timer will be started (60 seconds)
//
// Traces a warning if the number of active OCCs differs from the number of
// status objects, then validates the OCC master.
void Manager::occsNotAllRunning()
{
    const auto expectedCount = statusObjects.size();
    const bool allActive = (activeCount == expectedCount);

    if (!allActive)
    {
        // Not all OCCs went active
        // Procs may be garded, so may be expected
        log<level::WARNING>(
            std::format(
                "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})",
                activeCount, expectedCount)
                .c_str());
    }

    validateOccMaster();
}
1328 #endif // POWER10
1329 
// Verify single master OCC and start presence monitor.
//
// Walks all status objects. On POWER10 an OCC that is not yet active is
// first given a chance to become active: either from a queued active-state
// notification or (with PLDM) by checking its active sensor. If the host is
// not running the validation is abandoned.
//
// Every OCC that reports itself as master gets a presence watch. A second
// master, or no master at all, is logged as an error and a reset is
// requested through deviceError().
void Manager::validateOccMaster()
{
    int masterInstance = -1;
    for (auto& obj : statusObjects)
    {
        auto instance = obj->getOccInstanceID();
#ifdef POWER10
        if (!obj->occActive())
        {
            if (utils::isHostRunning())
            {
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    log<level::INFO>(
                        std::format(
                            "validateOccMaster: OCC{} is ACTIVE (queued)",
                            instance)
                            .c_str());
                    obj->occActive(true);
                }
                else
                {
                    // OCC does not appear to be active yet, check active sensor
#ifdef PLDM
                    pldmHandle->checkActiveSensor(instance);
#endif
                    if (obj->occActive())
                    {
                        log<level::INFO>(
                            std::format(
                                "validateOccMaster: OCC{} is ACTIVE after reading sensor",
                                instance)
                                .c_str());
                    }
                }
            }
            else
            {
                log<level::WARNING>(
                    std::format(
                        "validateOccMaster: HOST is not running (OCC{})",
                        instance)
                        .c_str());
                return;
            }
        }
#endif // POWER10

        if (obj->isMasterOcc())
        {
            obj->addPresenceWatchMaster();

            if (masterInstance == -1)
            {
                // First master seen, remember it
                masterInstance = instance;
            }
            else
            {
                log<level::ERR>(
                    std::format(
                        "validateOccMaster: Multiple OCC masters! ({} and {})",
                        masterInstance, instance)
                        .c_str());
                // request reset
                obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
            }
        }
    }

    if (masterInstance < 0)
    {
        log<level::ERR>(
            std::format("validateOccMaster: Master OCC not found! (of {} OCCs)",
                        statusObjects.size())
                .c_str());

        // request reset, but only if there is an OCC to request it from:
        // front() must not be called on an empty container
        if (!statusObjects.empty())
        {
            statusObjects.front()->deviceError(
                Error::Descriptor(PRESENCE_ERROR_PATH));
        }
    }
    else
    {
        log<level::INFO>(
            std::format("validateOccMaster: OCC{} is master of {} OCCs",
                        masterInstance, activeCount)
                .c_str());
#ifdef POWER10
        pmode->updateDbusSafeMode(false);
#endif
    }
}
1424 
1425 void Manager::updatePcapBounds() const
1426 {
1427     if (pcap)
1428     {
1429         pcap->updatePcapBounds();
1430     }
1431 }
1432 
1433 } // namespace occ
1434 } // namespace open_power
1435