1 #include "config.h"
2 
3 #include "occ_manager.hpp"
4 
5 #include "i2c_occ.hpp"
6 #include "occ_dbus.hpp"
7 #include "occ_errors.hpp"
8 #include "utils.hpp"
9 
10 #include <phosphor-logging/elog-errors.hpp>
11 #include <phosphor-logging/log.hpp>
12 #include <xyz/openbmc_project/Common/error.hpp>
13 
14 #include <chrono>
15 #include <cmath>
16 #include <filesystem>
17 #include <fstream>
18 #include <regex>
19 
20 namespace open_power
21 {
22 namespace occ
23 {
24 
// fru_type value for which readTempSensors() skips a DIMM temperature entry
// (there is no readable *_input file in that case).
constexpr uint32_t fruTypeNotAvailable = 0xFF;

// File-name suffixes appended to a sensor's "tempN_" sysfs prefix by
// readTempSensors() to locate the companion files of a "tempN_label" entry.
constexpr auto fruTypeSuffix = "fru_type";
constexpr auto faultSuffix = "fault";
constexpr auto inputSuffix = "input";
constexpr auto maxSuffix = "max";

// While this file exists, findAndCreateObjects() postpones OCC discovery and
// re-arms the discover timer instead.
const auto HOST_ON_FILE = "/run/openbmc/host@0-on";
32 
33 using namespace phosphor::logging;
34 using namespace std::literals::chrono_literals;
35 
/**
 * @brief Read a single formatted value of type T from a file.
 *
 * The value is extracted with operator>>, so leading whitespace is skipped
 * and extraction stops at the first character that does not belong to T.
 *
 * @tparam T     Default-constructible type that supports stream extraction.
 * @param[in] path  File to read.
 *
 * @return The extracted value.
 *
 * @throws std::system_error (generic_category) when the file cannot be
 *         opened, read, or parsed. The code is the errno raised by the
 *         failing operation, or EIO when the failure set no errno (for
 *         example, content that does not parse as T).
 */
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    // eofbit is intentionally not part of the mask: reaching end-of-file
    // right after a complete value (file without a trailing newline) is a
    // successful read. An empty or truncated file still raises failbit.
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
    T data{};

    // Clear any stale errno so the code reported below belongs to this read.
    errno = 0;

    try
    {
        ifs.open(path);
        ifs >> data;
        ifs.close();
    }
    catch (const std::exception&)
    {
        // Parse failures do not set errno; never report "success" (0).
        const auto err = (errno != 0) ? errno : EIO;
        throw std::system_error(err, std::generic_category());
    }

    return data;
}
58 
// Discover the OCCs and create their D-Bus objects.
//
// Without POWER10 one object set is created per possible CPU. With POWER10
// the /dev/occX devices are polled (via the discover timer) until the set of
// devices stops changing, then status objects are created and the OCC Active
// sensors are checked once the host is running.
void Manager::findAndCreateObjects()
{
#ifndef POWER10
    // One OCC per CPU
    for (auto cpu = 0; cpu < MAX_CPUS; ++cpu)
    {
        createObjects(std::string(OCC_NAME) + std::to_string(cpu));
    }
#else
    // The power mode object is created only once
    if (!pmode)
    {
        pmode = std::make_unique<powermode::PowerMode>(
            *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
    }

    // Hold off on discovery while the host-on file is still present
    if (fs::exists(HOST_ON_FILE))
    {
        log<level::INFO>(
            std::format(
                "Manager::findAndCreateObjects(): Waiting for {} to complete...",
                HOST_ON_FILE)
                .c_str());
        discoverTimer->restartOnce(10s);
        return;
    }

    static bool statusObjCreated = false;
    if (!statusObjCreated)
    {
        // The OCC list comes from the /dev/occX devices
        auto occs = findOCCsInDev();

        // The search has settled once something was found and the count
        // matches the previous pass. prevOCCSearch starts out empty, so the
        // first pass always takes the waiting branch.
        const bool settled = !occs.empty() &&
                             (prevOCCSearch.size() == occs.size());
        if (!settled)
        {
            // Remember this result and look again in 10s
            prevOCCSearch = occs;

            log<level::INFO>(
                std::format(
                    "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})",
                    occs.size())
                    .c_str());

            discoverTimer->restartOnce(10s);
        }
        else
        {
            // createObjects requires OCC0 first.
            std::sort(occs.begin(), occs.end());

            log<level::INFO>(
                std::format(
                    "Manager::findAndCreateObjects(): Creating {} OCC Status Objects",
                    occs.size())
                    .c_str());
            for (auto occId : occs)
            {
                createObjects(std::string(OCC_NAME) + std::to_string(occId));
            }
            statusObjCreated = true;
            waitingForAllOccActiveSensors = true;

            // Find/update the processor path associated with each OCC
            for (auto& status : statusObjects)
            {
                status->updateProcAssociation();
            }
        }
    }

    if (!(statusObjCreated && waitingForAllOccActiveSensors))
    {
        return;
    }

    // Remembers whether the "waiting for host" trace was already emitted
    static bool tracedHostWait = false;
    if (!utils::isHostRunning())
    {
        if (!tracedHostWait)
        {
            log<level::INFO>(
                "Manager::findAndCreateObjects(): Waiting for host to start");
            tracedHostWait = true;
        }
        discoverTimer->restartOnce(30s);
#ifdef PLDM
        if (throttleTraceTimer->isEnabled())
        {
            // Host is no longer running, disable throttle timer and
            // make sure traces are not throttled
            log<level::INFO>("findAndCreateObjects(): disabling sensor timer");
            throttleTraceTimer->setEnabled(false);
            pldmHandle->setTraceThrottle(false);
        }
#endif
        return;
    }

    if (tracedHostWait)
    {
        log<level::INFO>("Manager::findAndCreateObjects(): Host is running");
        tracedHostWait = false;
    }
    checkAllActiveSensors();
#endif
}
174 
175 #ifdef POWER10
// Check whether every status object has its OCC Active sensor available.
// While any sensor is still missing the discover timer is re-armed so this
// check runs again; once all are available the timers are stopped.
void Manager::checkAllActiveSensors()
{
    // State carried across invocations
    static bool allActiveSensorAvailable = false;
    static bool tracedSensorWait = false;
    static bool waitingForHost = false;

    if (!open_power::occ::utils::isHostRunning())
    {
        if (!waitingForHost)
        {
            waitingForHost = true;
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for host to start");
#ifdef PLDM
            if (throttleTraceTimer->isEnabled())
            {
                // Host is no longer running, disable throttle timer and
                // make sure traces are not throttled
                log<level::INFO>(
                    "checkAllActiveSensors(): disabling sensor timer");
                throttleTraceTimer->setEnabled(false);
                pldmHandle->setTraceThrottle(false);
            }
#endif
        }
    }
    else
    {
        if (waitingForHost)
        {
            waitingForHost = false;
            log<level::INFO>("checkAllActiveSensors(): Host is now running");
        }

        // Assume success until a missing sensor proves otherwise
        allActiveSensorAvailable = true;
        for (auto& status : statusObjects)
        {
            if (status->occActive() || status->getPldmSensorReceived())
            {
                // Nothing outstanding for this OCC
                continue;
            }

            auto occId = status->getOccInstanceID();
            // Was the sensor queued while waiting for discovery?
            auto queued = queuedActiveState.find(occId);
            if (queued == queuedActiveState.end())
            {
                allActiveSensorAvailable = false;
                if (!tracedSensorWait)
                {
                    log<level::INFO>(
                        std::format(
                            "checkAllActiveSensors(): Waiting on OCC{} Active sensor",
                            occId)
                            .c_str());
                    tracedSensorWait = true;
#ifdef PLDM
                    // Make sure traces are not throttled
                    pldmHandle->setTraceThrottle(false);
                    // Start timer to throttle pldm traces when timer
                    // expires
                    throttleTraceTimer->restartOnce(40min);
#endif
                }
#ifdef PLDM
                pldmHandle->checkActiveSensor(status->getOccInstanceID());
#endif
                // Stop at the first OCC that is still outstanding
                break;
            }

            queuedActiveState.erase(queued);
            log<level::INFO>(
                std::format("checkAllActiveSensors(): OCC{} is ACTIVE (queued)",
                            occId)
                    .c_str());
            status->occActive(true);
        }
    }

    if (!allActiveSensorAvailable)
    {
        // Not all sensors were available, so keep waiting
        if (!tracedSensorWait)
        {
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
            tracedSensorWait = true;
        }
        discoverTimer->restartOnce(10s);
        return;
    }

    // All sensors were found, disable the discovery timer
    if (discoverTimer->isEnabled())
    {
        discoverTimer->setEnabled(false);
    }
#ifdef PLDM
    if (throttleTraceTimer->isEnabled())
    {
        // Disable throttle timer and make sure traces are not throttled
        throttleTraceTimer->setEnabled(false);
        pldmHandle->setTraceThrottle(false);
    }
#endif

    if (waitingForAllOccActiveSensors)
    {
        log<level::INFO>(
            "checkAllActiveSensors(): OCC Active sensors are available");
        waitingForAllOccActiveSensors = false;
    }
    queuedActiveState.clear();
    tracedSensorWait = false;
}
295 #endif
296 
297 std::vector<int> Manager::findOCCsInDev()
298 {
299     std::vector<int> occs;
300     std::regex expr{R"(occ(\d+)$)"};
301 
302     for (auto& file : fs::directory_iterator("/dev"))
303     {
304         std::smatch match;
305         std::string path{file.path().string()};
306         if (std::regex_search(path, match, expr))
307         {
308             auto num = std::stoi(match[1].str());
309 
310             // /dev numbering starts at 1, ours starts at 0.
311             occs.push_back(num - 1);
312         }
313     }
314 
315     return occs;
316 }
317 
// Signal handler invoked with a message carrying the object path of a CPU.
// The last path component has CPU_NAME replaced by OCC_NAME and the
// resulting name is handed to createObjects().
//
// @param[in] msg  Message whose first element is the CPU object path.
// @return Always 0.
int Manager::cpuCreated(sdbusplus::message_t& msg)
{
    namespace fs = std::filesystem;

    sdbusplus::message::object_path o;
    msg.read(o);
    fs::path cpuPath(std::string(std::move(o)));

    auto name = cpuPath.filename().string();
    const auto index = name.find(CPU_NAME);
    if (index == std::string::npos)
    {
        // replace() would throw std::out_of_range for npos; an object name
        // without CPU_NAME cannot be mapped to an OCC, so ignore it.
        log<level::ERR>(
            std::format(
                "Manager::cpuCreated(): {} not found in object name {}",
                CPU_NAME, name)
                .c_str());
        return 0;
    }
    name.replace(index, std::strlen(CPU_NAME), OCC_NAME);

    createObjects(name);

    return 0;
}
334 
// Create the Status and PassThrough objects for one OCC, plus the power cap
// monitor if it does not exist yet.
//
// @param[in] occ  OCC object name, appended to OCC_CONTROL_ROOT.
void Manager::createObjects(const std::string& occ)
{
    auto objectPath = fs::path(OCC_CONTROL_ROOT) / occ;

    // Callbacks handed to the Status object
    auto onStatusChange = std::bind(std::mem_fn(&Manager::statusCallBack),
                                    this, std::placeholders::_1,
                                    std::placeholders::_2);
#ifdef PLDM
    auto onResetRequest = std::bind(std::mem_fn(&pldm::Interface::resetOCC),
                                    pldmHandle.get(), std::placeholders::_1);
#endif

    statusObjects.emplace_back(std::make_unique<Status>(
        event, objectPath.c_str(), *this,
#ifdef POWER10
        pmode,
#endif
        std::move(onStatusChange)
#ifdef PLDM
            ,
        std::move(onResetRequest)
#endif
            ));

    auto& newStatus = *statusObjects.back();

    // The power cap monitor is created once, bound to the first status
    // object that reaches this point.
    if (!pcap)
    {
        pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
            newStatus);
    }

    if (newStatus.isMasterOcc())
    {
        log<level::INFO>(
            std::format("Manager::createObjects(): OCC{} is the master",
                        newStatus.getOccInstanceID())
                .c_str());
        _pollTimer->setEnabled(false);

#ifdef POWER10
        // Set the master OCC on the PowerMode object
        pmode->setMasterOcc(objectPath);
#endif
    }

    passThroughObjects.emplace_back(
        std::make_unique<PassThrough>(objectPath.c_str()
#ifdef POWER10
                                          ,
                                      pmode
#endif
                                      ));
}
381 
// Callback run when an OCC changes its active state. Maintains activeCount
// and starts/stops the poll and wait timers accordingly.
//
// @param[in] instance  OCC instance whose state changed.
// @param[in] status    true when the OCC went active, false when it went away.
void Manager::statusCallBack(instanceID instance, bool status)
{
    if (!status)
    {
        // OCC went away
        if (activeCount > 0)
        {
            --activeCount;
        }
        else
        {
            log<level::ERR>(
                std::format("OCC{} disabled, but currently no active OCCs",
                            instance)
                    .c_str());
        }

        if (activeCount == 0)
        {
            // Nothing left running: stop the poll timer
            if (_pollTimer->isEnabled())
            {
                log<level::INFO>(
                    "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
                _pollTimer->setEnabled(false);
            }

#ifdef POWER10
            // ...and the wait-for-all timer
            if (waitForAllOccsTimer->isEnabled())
            {
                waitForAllOccsTimer->setEnabled(false);
            }
#endif
        }
#ifdef READ_OCC_SENSORS
        // Clear OCC sensors
        setSensorValueToNaN(instance);
#endif
    }
    else
    {
        // OCC went active
        ++activeCount;

#ifdef POWER10
        if (activeCount == 1)
        {
            // First OCC went active (allow some time for all OCCs to go active)
            waitForAllOccsTimer->restartOnce(60s);
        }
#endif

        if (activeCount == statusObjects.size())
        {
#ifdef POWER10
            // Every OCC is running, so the wait timer is no longer needed
            if (waitForAllOccsTimer->isEnabled())
            {
                waitForAllOccsTimer->setEnabled(false);
            }
#endif

            // Verify master OCC and start presence monitor
            validateOccMaster();
        }

        // Kick off polling unless it is already in progress
        if (!_pollTimer->isEnabled())
        {
            log<level::INFO>(
                std::format("Manager: OCCs will be polled every {} seconds",
                            pollInterval)
                    .c_str());

            // Send poll and start OCC poll timer
            pollerTimerExpired();
        }
    }

#ifdef POWER10
    if (waitingForAllOccActiveSensors && utils::isHostRunning())
    {
        checkAllActiveSensors();
    }
#endif
}
475 
476 #ifdef I2C_OCC
477 void Manager::initStatusObjects()
478 {
479     // Make sure we have a valid path string
480     static_assert(sizeof(DEV_PATH) != 0);
481 
482     auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
483     for (auto& name : deviceNames)
484     {
485         i2c_occ::i2cToDbus(name);
486         name = std::string(OCC_NAME) + '_' + name;
487         auto path = fs::path(OCC_CONTROL_ROOT) / name;
488         statusObjects.emplace_back(
489             std::make_unique<Status>(event, path.c_str(), *this));
490     }
491     // The first device is master occ
492     pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
493         *statusObjects.front());
494 #ifdef POWER10
495     pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
496                                                    powermode::PIPS_PATH);
497     // Set the master OCC on the PowerMode object
498     pmode->setMasterOcc(path);
499 #endif
500 }
501 #endif
502 
503 #ifdef PLDM
// Handle an SBE timeout for an OCC: if that OCC has a status object and is
// active, mark the SBE not usable and request an HRESET over PLDM.
//
// @param[in] instance  OCC instance that timed out.
void Manager::sbeTimeout(unsigned int instance)
{
    const auto matchesInstance = [instance](const auto& status) {
        return instance == status->getOccInstanceID();
    };
    auto found = std::find_if(statusObjects.begin(), statusObjects.end(),
                              matchesInstance);

    // Nothing to do for an unknown or inactive OCC
    if ((found == statusObjects.end()) || !(*found)->occActive())
    {
        return;
    }

    log<level::INFO>(
        std::format("SBE timeout, requesting HRESET (OCC{})", instance)
            .c_str());

    setSBEState(instance, SBE_STATE_NOT_USABLE);

    pldmHandle->sendHRESET(instance);
}
522 
// Apply an OCC Active sensor update to the matching status object. When no
// status object exists yet, the state is queued in queuedActiveState.
//
// @param[in] instance  OCC instance the sensor belongs to.
// @param[in] status    Reported active state.
// @return Result of Status::occActive(status) when the update was applied,
//         false when it was queued or rejected because the host is not
//         running.
bool Manager::updateOCCActive(instanceID instance, bool status)
{
    const auto matchesInstance = [instance](const auto& candidate) {
        return instance == candidate->getOccInstanceID();
    };
    auto found = std::find_if(statusObjects.begin(), statusObjects.end(),
                              matchesInstance);

    const bool hostRunning = open_power::occ::utils::isHostRunning();

    if (found == statusObjects.end())
    {
        // No status object yet: trace the situation, then queue the state
        if (hostRunning)
        {
            log<level::WARNING>(
                std::format(
                    "updateOCCActive: No status object to update for OCC{} (active={})",
                    instance, status)
                    .c_str());
        }
        else if (status == true)
        {
            log<level::WARNING>(
                std::format(
                    "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})",
                    instance, status)
                    .c_str());
        }

        if (status == true)
        {
            // OCC went active
            queuedActiveState.insert(instance);
        }
        else
        {
            // OCC was disabled: drop a queued entry, if there is one
            auto queued = queuedActiveState.find(instance);
            if (queued != queuedActiveState.end())
            {
                queuedActiveState.erase(queued);
            }
        }
        return false;
    }

    if (hostRunning || (status != true))
    {
        log<level::INFO>(
            std::format("updateOCCActive: OCC{} active={}", instance, status)
                .c_str());
        (*found)->setPldmSensorReceived(true);
        return (*found)->occActive(status);
    }

    // Active was reported while the host is not running
    log<level::WARNING>(
        std::format(
            "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received",
            instance, status)
            .c_str());
    (*found)->setPldmSensorReceived(false);
    if (!waitingForAllOccActiveSensors)
    {
        log<level::INFO>(
            "updateOCCActive: Waiting for Host and all OCC Active Sensors");
        waitingForAllOccActiveSensors = true;
    }
#ifdef POWER10
    discoverTimer->restartOnce(30s);
#endif
    return false;
}
599 
600 // Called upon pldm event To set powermode Safe Mode State for system.
601 void Manager::updateOccSafeMode(bool safeMode)
602 {
603 #ifdef POWER10
604     pmode->updateDbusSafeMode(safeMode);
605 #endif
606     // Update the processor throttle status on dbus
607     for (auto& obj : statusObjects)
608     {
609         obj->updateThrottle(safeMode, THROTTLED_SAFE);
610     }
611 }
612 
// Handle the outcome of an HRESET request for an OCC.
//
// On success the SBE state is set to SBE_STATE_BOOTED. On failure it is set
// to SBE_STATE_FAILED and, when sbeCanDump() allows it, a PEL is created and
// an SBE dump is requested through the Dump.Create D-Bus interface.
//
// @param[in] instance  OCC instance the HRESET was issued for.
// @param[in] success   Whether the HRESET succeeded.
void Manager::sbeHRESETResult(instanceID instance, bool success)
{
    if (success)
    {
        log<level::INFO>(
            std::format("HRESET succeeded (OCC{})", instance).c_str());

        setSBEState(instance, SBE_STATE_BOOTED);

        return;
    }

    setSBEState(instance, SBE_STATE_FAILED);

    if (sbeCanDump(instance))
    {
        log<level::INFO>(
            std::format("HRESET failed (OCC{}), triggering SBE dump", instance)
                .c_str());

        auto& bus = utils::getBus();
        // The instance is placed in the upper 16 bits of the PEL's src6 word
        uint32_t src6 = instance << 16;
        uint32_t logId =
            FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
                            src6, "SBE command timeout");

        try
        {
            constexpr auto path = "/org/openpower/dump";
            constexpr auto interface = "xyz.openbmc_project.Dump.Create";
            constexpr auto function = "CreateDump";

            std::string service = utils::getService(path, interface);
            auto method = bus.new_method_call(service.c_str(), path, interface,
                                              function);

            std::map<std::string, std::variant<std::string, uint64_t>>
                createParams{
                    {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
                     uint64_t(logId)},
                    {"com.ibm.Dump.Create.CreateParameters.DumpType",
                     "com.ibm.Dump.Create.DumpType.SBE"},
                    {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
                     uint64_t(instance)},
                };

            method.append(createParams);

            auto response = bus.call(method);
        }
        catch (const sdbusplus::exception_t& e)
        {
            constexpr auto ERROR_DUMP_DISABLED =
                "xyz.openbmc_project.Dump.Create.Error.Disabled";
            // name() yields a C string; compare the characters, not the
            // pointers (a pointer comparison would never match).
            const char* errName = e.name();
            if ((errName != nullptr) &&
                (std::strcmp(errName, ERROR_DUMP_DISABLED) == 0))
            {
                log<level::INFO>("Dump is disabled, skipping");
            }
            else
            {
                log<level::ERR>("Dump failed");
            }
        }
    }
}
678 
679 bool Manager::sbeCanDump(unsigned int instance)
680 {
681     struct pdbg_target* proc = getPdbgTarget(instance);
682 
683     if (!proc)
684     {
685         // allow the dump in the error case
686         return true;
687     }
688 
689     try
690     {
691         if (!openpower::phal::sbe::isDumpAllowed(proc))
692         {
693             return false;
694         }
695 
696         if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
697         {
698             return false;
699         }
700     }
701     catch (openpower::phal::exception::SbeError& e)
702     {
703         log<level::INFO>("Failed to query SBE state");
704     }
705 
706     // allow the dump in the error case
707     return true;
708 }
709 
710 void Manager::setSBEState(unsigned int instance, enum sbe_state state)
711 {
712     struct pdbg_target* proc = getPdbgTarget(instance);
713 
714     if (!proc)
715     {
716         return;
717     }
718 
719     try
720     {
721         openpower::phal::sbe::setState(proc, state);
722     }
723     catch (const openpower::phal::exception::SbeError& e)
724     {
725         log<level::ERR>("Failed to set SBE state");
726     }
727 }
728 
729 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
730 {
731     if (!pdbgInitialized)
732     {
733         try
734         {
735             openpower::phal::pdbg::init();
736             pdbgInitialized = true;
737         }
738         catch (const openpower::phal::exception::PdbgError& e)
739         {
740             log<level::ERR>("pdbg initialization failed");
741             return nullptr;
742         }
743     }
744 
745     struct pdbg_target* proc = nullptr;
746     pdbg_for_each_class_target("proc", proc)
747     {
748         if (pdbg_target_index(proc) == instance)
749         {
750             return proc;
751         }
752     }
753 
754     log<level::ERR>("Failed to get pdbg target");
755     return nullptr;
756 }
757 #endif
758 
759 void Manager::pollerTimerExpired()
760 {
761     if (!_pollTimer)
762     {
763         log<level::ERR>(
764             "Manager::pollerTimerExpired() ERROR: Timer not defined");
765         return;
766     }
767 
768     for (auto& obj : statusObjects)
769     {
770         if (!obj->occActive())
771         {
772             // OCC is not running yet
773 #ifdef READ_OCC_SENSORS
774             auto id = obj->getOccInstanceID();
775             setSensorValueToNaN(id);
776 #endif
777             continue;
778         }
779 
780         // Read sysfs to force kernel to poll OCC
781         obj->readOccState();
782 
783 #ifdef READ_OCC_SENSORS
784         // Read occ sensor values
785         getSensorValues(obj);
786 #endif
787     }
788 
789     if (activeCount > 0)
790     {
791         // Restart OCC poll timer
792         _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
793     }
794     else
795     {
796         // No OCCs running, so poll timer will not be restarted
797         log<level::INFO>(
798             std::format(
799                 "Manager::pollerTimerExpired: poll timer will not be restarted")
800                 .c_str());
801     }
802 }
803 
804 #ifdef READ_OCC_SENSORS
805 void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
806 {
807     // There may be more than one sensor with the same FRU type
808     // and label so make two passes: the first to read the temps
809     // from sysfs, and the second to put them on D-Bus after
810     // resolving any conflicts.
811     std::map<std::string, double> sensorData;
812 
813     std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
814     for (auto& file : fs::directory_iterator(path))
815     {
816         if (!std::regex_search(file.path().string(), expr))
817         {
818             continue;
819         }
820 
821         uint32_t labelValue{0};
822 
823         try
824         {
825             labelValue = readFile<uint32_t>(file.path());
826         }
827         catch (const std::system_error& e)
828         {
829             log<level::DEBUG>(
830                 std::format("readTempSensors: Failed reading {}, errno = {}",
831                             file.path().string(), e.code().value())
832                     .c_str());
833             continue;
834         }
835 
836         const std::string& tempLabel = "label";
837         const std::string filePathString = file.path().string().substr(
838             0, file.path().string().length() - tempLabel.length());
839 
840         uint32_t fruTypeValue{0};
841         try
842         {
843             fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
844         }
845         catch (const std::system_error& e)
846         {
847             log<level::DEBUG>(
848                 std::format("readTempSensors: Failed reading {}, errno = {}",
849                             filePathString + fruTypeSuffix, e.code().value())
850                     .c_str());
851             continue;
852         }
853 
854         std::string sensorPath = OCC_SENSORS_ROOT +
855                                  std::string("/temperature/");
856 
857         std::string dvfsTempPath;
858 
859         if (fruTypeValue == VRMVdd)
860         {
861             sensorPath.append("vrm_vdd" + std::to_string(occInstance) +
862                               "_temp");
863         }
864         else if (fruTypeValue == processorIoRing)
865         {
866             sensorPath.append("proc" + std::to_string(occInstance) +
867                               "_ioring_temp");
868             dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
869                            std::to_string(occInstance) + "_ioring_dvfs_temp";
870         }
871         else
872         {
873             uint16_t type = (labelValue & 0xFF000000) >> 24;
874             uint16_t instanceID = labelValue & 0x0000FFFF;
875 
876             if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
877             {
878                 if (fruTypeValue == fruTypeNotAvailable)
879                 {
880                     // Not all DIMM related temps are available to read
881                     // (no _input file in this case)
882                     continue;
883                 }
884                 auto iter = dimmTempSensorName.find(fruTypeValue);
885                 if (iter == dimmTempSensorName.end())
886                 {
887                     log<level::ERR>(
888                         std::format(
889                             "readTempSensors: Fru type error! fruTypeValue = {}) ",
890                             fruTypeValue)
891                             .c_str());
892                     continue;
893                 }
894 
895                 sensorPath.append("dimm" + std::to_string(instanceID) +
896                                   iter->second);
897 
898                 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
899                                dimmDVFSSensorName.at(fruTypeValue);
900             }
901             else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
902             {
903                 if (fruTypeValue == processorCore)
904                 {
905                     // The OCC reports small core temps, of which there are
906                     // two per big core.  All current P10 systems are in big
907                     // core mode, so use a big core name.
908                     uint16_t coreNum = instanceID / 2;
909                     uint16_t tempNum = instanceID % 2;
910                     sensorPath.append("proc" + std::to_string(occInstance) +
911                                       "_core" + std::to_string(coreNum) + "_" +
912                                       std::to_string(tempNum) + "_temp");
913 
914                     dvfsTempPath =
915                         std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
916                         std::to_string(occInstance) + "_core_dvfs_temp";
917                 }
918                 else
919                 {
920                     continue;
921                 }
922             }
923             else
924             {
925                 continue;
926             }
927         }
928 
929         // The dvfs temp file only needs to be read once per chip per type.
930         if (!dvfsTempPath.empty() &&
931             !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
932         {
933             try
934             {
935                 auto dvfsValue = readFile<double>(filePathString + maxSuffix);
936 
937                 dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
938                     dvfsTempPath, dvfsValue * std::pow(10, -3));
939             }
940             catch (const std::system_error& e)
941             {
942                 log<level::DEBUG>(
943                     std::format(
944                         "readTempSensors: Failed reading {}, errno = {}",
945                         filePathString + maxSuffix, e.code().value())
946                         .c_str());
947             }
948         }
949 
950         uint32_t faultValue{0};
951         try
952         {
953             faultValue = readFile<uint32_t>(filePathString + faultSuffix);
954         }
955         catch (const std::system_error& e)
956         {
957             log<level::DEBUG>(
958                 std::format("readTempSensors: Failed reading {}, errno = {}",
959                             filePathString + faultSuffix, e.code().value())
960                     .c_str());
961             continue;
962         }
963 
964         double tempValue{0};
        // NOTE: if OCC sends back 0xFF, kernel sets this fault value to 1.
966         if (faultValue != 0)
967         {
968             tempValue = std::numeric_limits<double>::quiet_NaN();
969         }
970         else
971         {
972             // Read the temperature
973             try
974             {
975                 tempValue = readFile<double>(filePathString + inputSuffix);
976             }
977             catch (const std::system_error& e)
978             {
979                 log<level::DEBUG>(
980                     std::format(
981                         "readTempSensors: Failed reading {}, errno = {}",
982                         filePathString + inputSuffix, e.code().value())
983                         .c_str());
984 
985                 // if errno == EAGAIN(Resource temporarily unavailable) then set
986                 // temp to 0, to avoid using old temp, and affecting FAN
987                 // Control.
988                 if (e.code().value() == EAGAIN)
989                 {
990                     tempValue = 0;
991                 }
992                 // else the errno would be something like
993                 //     EBADF(Bad file descriptor)
994                 // or ENOENT(No such file or directory)
995                 else
996                 {
997                     continue;
998                 }
999             }
1000         }
1001 
        // If this object path already has a value, only overwrite
        // it if the previous one was a NaN or a smaller value.
1004         auto existing = sensorData.find(sensorPath);
1005         if (existing != sensorData.end())
1006         {
1007             // Multiple sensors found for this FRU type
1008             if ((std::isnan(existing->second) && (tempValue == 0)) ||
1009                 ((existing->second == 0) && std::isnan(tempValue)))
1010             {
1011                 // One of the redundant sensors has failed (0xFF/nan), and the
1012                 // other sensor has no reading (0), so set the FRU to NaN to
1013                 // force fan increase
1014                 tempValue = std::numeric_limits<double>::quiet_NaN();
1015                 existing->second = tempValue;
1016             }
1017             if (std::isnan(existing->second) || (tempValue > existing->second))
1018             {
1019                 existing->second = tempValue;
1020             }
1021         }
1022         else
1023         {
1024             // First sensor for this FRU type
1025             sensorData[sensorPath] = tempValue;
1026         }
1027     }
1028 
1029     // Now publish the values on D-Bus.
1030     for (const auto& [objectPath, value] : sensorData)
1031     {
1032         dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
1033                                                     value * std::pow(10, -3));
1034 
1035         dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1036             objectPath, !std::isnan(value));
1037 
1038         if (existingSensors.find(objectPath) == existingSensors.end())
1039         {
1040             dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1041                 objectPath);
1042         }
1043 
1044         existingSensors[objectPath] = occInstance;
1045     }
1046 }
1047 
1048 std::optional<std::string>
1049     Manager::getPowerLabelFunctionID(const std::string& value)
1050 {
1051     // If the value is "system", then the FunctionID is "system".
1052     if (value == "system")
1053     {
1054         return value;
1055     }
1056 
1057     // If the value is not "system", then the label value have 3 numbers, of
1058     // which we only care about the middle one:
1059     // <sensor id>_<function id>_<apss channel>
1060     // eg: The value is "0_10_5" , then the FunctionID is "10".
1061     if (value.find("_") == std::string::npos)
1062     {
1063         return std::nullopt;
1064     }
1065 
1066     auto powerLabelValue = value.substr((value.find("_") + 1));
1067 
1068     if (powerLabelValue.find("_") == std::string::npos)
1069     {
1070         return std::nullopt;
1071     }
1072 
1073     return powerLabelValue.substr(0, powerLabelValue.find("_"));
1074 }
1075 
/**
 * Scan a hwmon directory for power sensors and publish them on D-Bus.
 *
 * For every "powerN_label" file whose function ID is known in
 * powerSensorName, the matching "powerN_input" file is read and the reading
 * is published with unit Watts and an operational status of true. Files that
 * cannot be read, or labels that cannot be mapped, are skipped.
 *
 * @param path  directory to iterate over
 * @param id    value recorded in existingSensors for each published sensor
 */
void Manager::readPowerSensors(const fs::path& path, uint32_t id)
{
    // Example of a matching file name: power5_label
    const std::regex labelFileExpr{"power\\d+_label$"};
    const std::string labelSuffix{"label"};
    // Applied twice to the raw reading before it is published as Watts.
    const double scale = std::pow(10, -3);

    for (const auto& entry : fs::directory_iterator(path))
    {
        const std::string labelFile = entry.path().string();
        if (!std::regex_search(labelFile, labelFileExpr))
        {
            continue;
        }

        std::string label;
        try
        {
            label = readFile<std::string>(labelFile);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            labelFile, e.code().value())
                    .c_str());
            continue;
        }

        const auto functionID = getPowerLabelFunctionID(label);
        if (!functionID.has_value())
        {
            continue;
        }

        // Only function IDs with a known sensor name are published.
        const auto nameEntry = powerSensorName.find(*functionID);
        if (nameEntry == powerSensorName.end())
        {
            continue;
        }

        std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
        sensorPath.append(nameEntry->second);

        // Strip the trailing "label" to get the common "powerN_" prefix, then
        // address the input file that sits next to the label file.
        const std::string filePrefix =
            labelFile.substr(0, labelFile.length() - labelSuffix.length());
        const std::string inputFile = filePrefix + inputSuffix;

        double rawPower{0};
        try
        {
            rawPower = readFile<double>(inputFile);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            inputFile, e.code().value())
                    .c_str());
            continue;
        }

        auto& occDbus = dbus::OccDBusSensors::getOccDBus();
        occDbus.setUnit(sensorPath,
                        "xyz.openbmc_project.Sensor.Value.Unit.Watts");
        occDbus.setValue(sensorPath, rawPower * scale * scale);
        occDbus.setOperationalStatus(sensorPath, true);

        // The chassis association is only created the first time a sensor
        // path is seen.
        const bool firstSighting =
            (existingSensors.find(sensorPath) == existingSensors.end());
        if (firstSighting)
        {
            occDbus.setChassisAssociation(sensorPath);
        }

        existingSensors[sensorPath] = id;
    }
}
1153 
1154 void Manager::setSensorValueToNaN(uint32_t id) const
1155 {
1156     for (const auto& [sensorPath, occId] : existingSensors)
1157     {
1158         if (occId == id)
1159         {
1160             dbus::OccDBusSensors::getOccDBus().setValue(
1161                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1162 
1163             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
1164                                                                     true);
1165         }
1166     }
1167     return;
1168 }
1169 
1170 void Manager::setSensorValueToNonFunctional(uint32_t id) const
1171 {
1172     for (const auto& [sensorPath, occId] : existingSensors)
1173     {
1174         if (occId == id)
1175         {
1176             dbus::OccDBusSensors::getOccDBus().setValue(
1177                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1178 
1179             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
1180                                                                     false);
1181         }
1182     }
1183     return;
1184 }
1185 
/**
 * Read the sensors of one OCC and publish them.
 *
 * When the OCC's hwmon path exists, the temperature sensors are read and,
 * for the master OCC only, the power sensors as well. When the path is
 * missing, an error is traced once per OCC until the path shows up again.
 *
 * @param occ  status object of the OCC to collect sensors from
 */
void Manager::getSensorValues(std::unique_ptr<Status>& occ)
{
    // One "missing path already traced" flag per OCC instance.
    constexpr uint32_t maxTracedOccs = 8;
    static bool tracedError[maxTracedOccs] = {false};

    const fs::path sensorPath = occ->getHwmonPath();
    const uint32_t id = occ->getOccInstanceID();

    // Never index the flag table with an instance ID it has no slot for;
    // such an OCC is still processed, its trace is just not throttled.
    const bool canThrottle = (id < maxTracedOccs);

    if (fs::exists(sensorPath))
    {
        // Read temperature sensors
        readTempSensors(sensorPath, id);

        if (occ->isMasterOcc())
        {
            // Read power sensors
            readPowerSensors(sensorPath, id);
        }

        // Path is back: re-arm the trace for the next time it goes missing.
        if (canThrottle)
        {
            tracedError[id] = false;
        }
        return;
    }

    if (!canThrottle || !tracedError[id])
    {
        log<level::ERR>(
            std::format(
                "Manager::getSensorValues: OCC{} sensor path missing: {}", id,
                sensorPath.c_str())
                .c_str());
        if (canThrottle)
        {
            tracedError[id] = true;
        }
    }
}
1219 #endif
1220 
// Read the altitude from D-Bus and cache it in `altitude`.
//  - Readings below 0xFFFF are rounded to the nearest meter (negative
//    readings become 0).
//  - Any other reading leaves `altitude` unchanged.
//  - A D-Bus failure sets `altitude` to 0xFFFF (not available).
// A failure is traced only once until the next valid reading.
void Manager::readAltitude()
{
    // true => the next invalid reading / D-Bus failure will be traced
    static bool traceAltitudeErr = true;

    utils::PropertyValue altitudeProperty{};
    try
    {
        altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE,
                                              ALTITUDE_PROP);
        const auto sensorVal = std::get<double>(altitudeProperty);

        // Written as a negated '<' so that a NaN reading is also rejected.
        if (!(sensorVal < 0xFFFF))
        {
            if (traceAltitudeErr)
            {
                traceAltitudeErr = false;
                log<level::DEBUG>(
                    std::format("Invalid altitude value: {}", sensorVal)
                        .c_str());
            }
            return;
        }

        // Clamp negatives to 0, otherwise round to nearest meter
        altitude =
            static_cast<uint16_t>((sensorVal < 0) ? 0.0 : (sensorVal + 0.5));

        log<level::DEBUG>(
            std::format("readAltitude: sensor={} ({}m)", sensorVal, altitude)
                .c_str());
        traceAltitudeErr = true;
    }
    catch (const sdbusplus::exception_t& e)
    {
        if (traceAltitudeErr)
        {
            traceAltitudeErr = false;
            log<level::INFO>(
                std::format("Unable to read Altitude: {}", e.what()).c_str());
        }
        altitude = 0xFFFF; // not available
    }
}
1270 
// Callback function when ambient temperature changes.
// The new reading is reduced to a whole degree C (NaN -> 0xFF, negative
// -> 0). Only when that differs from the cached `ambient` is the cache
// updated, the altitude re-read if it is still unavailable (0xFFFF) and, on
// POWER10, the new data sent to every active OCC.
void Manager::ambientCallback(sdbusplus::message_t& msg)
{
    std::string msgSensor;
    std::map<std::string, std::variant<double>> msgData;
    msg.read(msgSensor, msgData);

    const auto ambientEntry = msgData.find(AMBIENT_PROP);
    if (ambientEntry == msgData.end())
    {
        log<level::DEBUG>("ambientCallback: Unknown ambient property changed");
        return;
    }

    const double currentTemp = std::get<double>(ambientEntry->second);

    // 0xFF is what a NaN reading maps to
    uint8_t truncatedTemp = 0xFF;
    if (!std::isnan(currentTemp))
    {
        // Clamp negatives to 0, otherwise round to nearest degree C
        truncatedTemp =
            static_cast<uint8_t>((currentTemp < 0) ? 0.0 : (currentTemp + 0.5));
    }

    // Nothing to do unless the (rounded) ambient actually changed
    if (truncatedTemp == ambient)
    {
        return;
    }

    log<level::DEBUG>(
        std::format("ambientCallback: Ambient change from {} to {}C", ambient,
                    currentTemp)
            .c_str());

    ambient = truncatedTemp;
    if (altitude == 0xFFFF)
    {
        // No altitude yet, try reading again
        readAltitude();
    }

    log<level::DEBUG>(
        std::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient,
                    altitude)
            .c_str());
#ifdef POWER10
    // Send ambient and altitude to all OCCs
    for (auto& statusObj : statusObjects)
    {
        if (!statusObj->occActive())
        {
            continue;
        }
        statusObj->sendAmbient(ambient, altitude);
    }
#endif // POWER10
}
1335 
1336 // return the current ambient and altitude readings
1337 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1338                              uint16_t& altitudeValue) const
1339 {
1340     ambientValid = true;
1341     ambientTemp = ambient;
1342     altitudeValue = altitude;
1343 
1344     if (ambient == 0xFF)
1345     {
1346         ambientValid = false;
1347     }
1348 }
1349 
1350 #ifdef POWER10
// Called when waitForAllOccsTimer expires
// After the first OCC goes active, this timer will be started (60 seconds)
// Traces a warning when the active count differs from the number of status
// objects, then always validates the master OCC.
void Manager::occsNotAllRunning()
{
    const auto expectedCount = statusObjects.size();
    if (activeCount != expectedCount)
    {
        // Not all OCCs went active.
        // Procs may be garded, so may be expected
        log<level::WARNING>(
            std::format(
                "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})",
                activeCount, expectedCount)
                .c_str());
    }

    validateOccMaster();
}
1368 
1369 #ifdef PLDM
1370 // Called when throttleTraceTimer expires.
1371 // If this timer expires, that indicates there are no OCC active sensor PDRs
1372 // found which will trigger pldm traces to be throttled and PEL to be created
1373 void Manager::throttleTraceExpired()
1374 {
1375     if (utils::isHostRunning())
1376     {
1377         // Throttle traces
1378         pldmHandle->setTraceThrottle(true);
1379         // Create PEL
1380         createPldmSensorPEL();
1381     }
1382     else
1383     {
1384         // Make sure traces are not throttled
1385         pldmHandle->setTraceThrottle(false);
1386         log<level::INFO>(
1387             "throttleTraceExpired(): host it not running ignoring sensor timer");
1388     }
1389 }
1390 
// Create a Warning-level PEL (MISSING_OCC_SENSORS_PATH) through the PEL
// logging service, attaching recent occ-control journal entries as FFDC.
// A D-Bus failure while creating the PEL is traced and otherwise ignored.
void Manager::createPldmSensorPEL()
{
    static constexpr auto loggingObjectPath = "/xyz/openbmc_project/logging";
    static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL";
    namespace logging_server =
        sdbusplus::xyz::openbmc_project::Logging::server;

    Error::Descriptor descriptor(MISSING_OCC_SENSORS_PATH);
    std::map<std::string, std::string> additionalData{
        {"_PID", std::to_string(getpid())}};

    log<level::INFO>(
        "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs");

    auto& bus = utils::getBus();

    try
    {
        FFDCFiles ffdc;
        // Add occ-control journal traces to PEL FFDC.
        // NOTE(review): the returned object is otherwise unused; presumably it
        // must stay in scope until the call below has completed - confirm.
        auto occJournalFile =
            FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40);

        const std::string service = utils::getService(loggingObjectPath,
                                                      opLoggingInterface);
        auto method = bus.new_method_call(service.c_str(), loggingObjectPath,
                                          opLoggingInterface,
                                          "CreatePELWithFFDCFiles");

        // Set level to Warning (Predictive).
        auto severity = logging_server::convertForMessage(
            logging_server::Entry::Level::Warning);

        method.append(descriptor.path, severity, additionalData, ffdc);
        bus.call(method);
    }
    catch (const sdbusplus::exception_t& e)
    {
        log<level::ERR>(
            std::format("Failed to create MISSING_OCC_SENSORS PEL: {}",
                        e.what())
                .c_str());
    }
}
1438 #endif // PLDM
1439 #endif // POWER10
1440 
// Verify single master OCC and start presence monitor
//
// Walks all status objects:
//  - POWER10 only: an OCC that is not active is marked active when an active
//    state was queued for it; otherwise (with PLDM) its active sensor is
//    checked. If the host is not running the validation is abandoned.
//  - every master OCC gets a presence watch; a second master triggers a
//    device error (reset request) on that OCC.
// When no master was found, a device error is raised on the first status
// object (if there is one). When a master was found, POWER10 clears the safe
// mode indication on D-Bus.
void Manager::validateOccMaster()
{
    int masterInstance = -1;
    for (auto& obj : statusObjects)
    {
        auto instance = obj->getOccInstanceID();
#ifdef POWER10
        if (!obj->occActive())
        {
            if (utils::isHostRunning())
            {
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    log<level::INFO>(
                        std::format(
                            "validateOccMaster: OCC{} is ACTIVE (queued)",
                            instance)
                            .c_str());
                    obj->occActive(true);
                }
                else
                {
                    // OCC does not appear to be active yet, check active sensor
#ifdef PLDM
                    pldmHandle->checkActiveSensor(instance);
#endif
                    if (obj->occActive())
                    {
                        log<level::INFO>(
                            std::format(
                                "validateOccMaster: OCC{} is ACTIVE after reading sensor",
                                instance)
                                .c_str());
                    }
                }
            }
            else
            {
                log<level::WARNING>(
                    std::format(
                        "validateOccMaster: HOST is not running (OCC{})",
                        instance)
                        .c_str());
                return;
            }
        }
#endif // POWER10

        if (obj->isMasterOcc())
        {
            obj->addPresenceWatchMaster();

            if (masterInstance == -1)
            {
                // First master seen, remember it
                masterInstance = instance;
            }
            else
            {
                log<level::ERR>(
                    std::format(
                        "validateOccMaster: Multiple OCC masters! ({} and {})",
                        masterInstance, instance)
                        .c_str());
                // request reset
                obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
            }
        }
    }

    if (masterInstance < 0)
    {
        log<level::ERR>(
            std::format("validateOccMaster: Master OCC not found! (of {} OCCs)",
                        statusObjects.size())
                .c_str());
        // request reset; front() must not be called on an empty container,
        // so there is nothing to reset when no status objects exist.
        if (!statusObjects.empty())
        {
            statusObjects.front()->deviceError(
                Error::Descriptor(PRESENCE_ERROR_PATH));
        }
    }
    else
    {
        log<level::INFO>(
            std::format("validateOccMaster: OCC{} is master of {} OCCs",
                        masterInstance, activeCount)
                .c_str());
#ifdef POWER10
        pmode->updateDbusSafeMode(false);
#endif
    }
}
1535 
1536 void Manager::updatePcapBounds() const
1537 {
1538     if (pcap)
1539     {
1540         pcap->updatePcapBounds();
1541     }
1542 }
1543 
1544 } // namespace occ
1545 } // namespace open_power
1546