1 #include "config.h"
2 
3 #include "occ_manager.hpp"
4 
5 #include "i2c_occ.hpp"
6 #include "occ_dbus.hpp"
7 #include "occ_errors.hpp"
8 #include "utils.hpp"
9 
10 #include <phosphor-logging/elog-errors.hpp>
11 #include <phosphor-logging/log.hpp>
12 #include <xyz/openbmc_project/Common/error.hpp>
13 
14 #include <chrono>
15 #include <cmath>
16 #include <filesystem>
17 #include <fstream>
18 #include <regex>
19 
20 namespace open_power
21 {
22 namespace occ
23 {
24 
// fru_type value for a sensor that has no FRU type (such sensors are
// skipped when reading DIMM temperatures).
constexpr uint32_t fruTypeNotAvailable = 0xFF;

// File name suffixes appended to a sensor's sysfs prefix (the part of the
// "..._label" file name that precedes "label") to locate its other files.
constexpr auto fruTypeSuffix = "fru_type";
constexpr auto faultSuffix = "fault";
constexpr auto inputSuffix = "input";
constexpr auto maxSuffix = "max";

// While this file exists, findAndCreateObjects() postpones OCC discovery.
constexpr auto HOST_ON_FILE = "/run/openbmc/host@0-on";
32 
33 using namespace phosphor::logging;
34 using namespace std::literals::chrono_literals;
35 
/**
 * @brief Read one value of type T from the start of a file.
 *
 * The value is extracted with operator>>, so leading whitespace is skipped
 * and extraction stops at the first character that is not part of the value.
 *
 * @tparam T - type to extract from the file
 * @param[in] path - file to read
 *
 * @return the extracted value
 *
 * @throws std::system_error (generic category) if the file cannot be opened
 *         or no value can be extracted. The code is the errno left behind by
 *         the failing operation, or std::errc::io_error when the failure did
 *         not set errno (for example, malformed contents).
 */
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    // eofbit is deliberately left out of the mask: reaching end-of-file
    // right after a value (file without a trailing newline) is a successful
    // read.  An empty file still fails because extraction sets failbit.
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
    T data;

    // Start from a clean errno so a failure that does not touch it is not
    // reported with a stale code left over from some earlier call.
    errno = 0;

    try
    {
        ifs.open(path);
        ifs >> data;
        ifs.close();
    }
    catch (const std::exception& e)
    {
        const auto err = errno;
        // Never throw a system_error that carries "success" as its code
        throw std::system_error(
            (err != 0) ? err : static_cast<int>(std::errc::io_error),
            std::generic_category());
    }

    return data;
}
58 
// findAndCreateObjects():
// Discovers the available OCC devices/processors and creates the objects
// needed to manage them.
// (called every time the discoverTimer expires)
// - create the PowerMode object used to control OCC modes
// - create a status object for each OCC device found
// - wait for the OCC Active sensor PDRs to become available
// - restart discoverTimer while any of that data is still missing
void Manager::findAndCreateObjects()
{
#ifndef POWER10
    // One OCC per possible CPU
    for (auto cpu = 0; cpu < MAX_CPUS; ++cpu)
    {
        createObjects(std::string(OCC_NAME) + std::to_string(cpu));
    }
#else
    // The power mode object only needs to be created once
    if (!pmode)
    {
        pmode = std::make_unique<powermode::PowerMode>(
            *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
    }

    // Nothing else is attempted while HOST_ON_FILE exists; look again later
    if (fs::exists(HOST_ON_FILE))
    {
        log<level::INFO>(
            std::format(
                "Manager::findAndCreateObjects(): Waiting for {} to complete...",
                HOST_ON_FILE)
                .c_str());
        discoverTimer->restartOnce(10s);
        return;
    }

    static bool statusObjCreated = false;
    if (!statusObjCreated)
    {
        // The OCCs are derived from the /dev/occX devices
        auto foundOccs = findOCCsInDev();

        // The search is considered settled once it returns a non-empty
        // result of the same size as the previous search.
        const bool searchSettled =
            !foundOccs.empty() && (prevOCCSearch.size() == foundOccs.size());

        if (searchSettled)
        {
            // createObjects requires OCC0 first.
            std::sort(foundOccs.begin(), foundOccs.end());

            log<level::INFO>(
                std::format(
                    "Manager::findAndCreateObjects(): Creating {} OCC Status Objects",
                    foundOccs.size())
                    .c_str());
            for (const auto occId : foundOccs)
            {
                createObjects(std::string(OCC_NAME) + std::to_string(occId));
            }
            statusObjCreated = true;
            waitingForAllOccActiveSensors = true;

            // Find/update the processor path associated with each OCC
            for (auto& status : statusObjects)
            {
                status->updateProcAssociation();
            }
        }
        else
        {
            // Something changed or no OCCs yet, try again in 10s.
            // prevOCCSearch is empty on the first pass, so there is always
            // at least one delay to give things a chance to settle.
            prevOCCSearch = foundOccs;

            log<level::INFO>(
                std::format(
                    "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})",
                    foundOccs.size())
                    .c_str());

            discoverTimer->restartOnce(10s);
        }
    }

    if (!statusObjCreated || !waitingForAllOccActiveSensors)
    {
        return;
    }

    // Remembers whether the "waiting for host" trace was already written
    static bool tracedHostWait = false;

    if (utils::isHostRunning())
    {
        if (tracedHostWait)
        {
            log<level::INFO>(
                "Manager::findAndCreateObjects(): Host is running");
            tracedHostWait = false;
        }
        checkAllActiveSensors();
        return;
    }

    // Host is not running: trace once and check again in 30s
    if (!tracedHostWait)
    {
        log<level::INFO>(
            "Manager::findAndCreateObjects(): Waiting for host to start");
        tracedHostWait = true;
    }
    discoverTimer->restartOnce(30s);
#ifdef PLDM
    if (throttlePldmTraceTimer->isEnabled())
    {
        // Host is no longer running, disable throttle timer and
        // make sure traces are not throttled
        log<level::INFO>("findAndCreateObjects(): disabling sensor timer");
        throttlePldmTraceTimer->setEnabled(false);
        pldmHandle->setTraceThrottle(false);
    }
#endif
#endif
}
182 
#ifdef POWER10
// Determine whether the OCC Active sensor of every status object is
// available.  The discover timer is stopped once they all are; otherwise it
// is restarted so the check runs again in 10 seconds.
void Manager::checkAllActiveSensors()
{
    // These persist across calls: the last result and the "already traced"
    // flags that keep the journal from being flooded.
    static bool allActiveSensorAvailable = false;
    static bool tracedSensorWait = false;
    static bool waitingForHost = false;

    if (!open_power::occ::utils::isHostRunning())
    {
        if (!waitingForHost)
        {
            waitingForHost = true;
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for host to start");
#ifdef PLDM
            if (throttlePldmTraceTimer->isEnabled())
            {
                // Host is no longer running, disable throttle timer and
                // make sure traces are not throttled
                log<level::INFO>(
                    "checkAllActiveSensors(): disabling sensor timer");
                throttlePldmTraceTimer->setEnabled(false);
                pldmHandle->setTraceThrottle(false);
            }
#endif
        }
    }
    else
    {
        if (waitingForHost)
        {
            waitingForHost = false;
            log<level::INFO>("checkAllActiveSensors(): Host is now running");
        }

        // Assume every sensor is available until one is found missing
        allActiveSensorAvailable = true;
        for (auto& status : statusObjects)
        {
            if (status->occActive() || status->getPldmSensorReceived())
            {
                // Nothing outstanding for this OCC
                continue;
            }

            const auto instance = status->getOccInstanceID();

            // The sensor may have been queued while waiting for discovery
            const auto queued = queuedActiveState.find(instance);
            if (queued != queuedActiveState.end())
            {
                queuedActiveState.erase(queued);
                log<level::INFO>(
                    std::format(
                        "checkAllActiveSensors(): OCC{} is ACTIVE (queued)",
                        instance)
                        .c_str());
                status->occActive(true);
                continue;
            }

            // This sensor is still missing; stop at the first one found
            allActiveSensorAvailable = false;
            if (!tracedSensorWait)
            {
                log<level::INFO>(
                    std::format(
                        "checkAllActiveSensors(): Waiting on OCC{} Active sensor",
                        instance)
                        .c_str());
                tracedSensorWait = true;
#ifdef PLDM
                // Make sure PLDM traces are not throttled
                pldmHandle->setTraceThrottle(false);
                // Start timer to throttle PLDM traces when timer
                // expires
                onPldmTimeoutCreatePel = false;
                throttlePldmTraceTimer->restartOnce(5min);
#endif
            }
#ifdef PLDM
            pldmHandle->checkActiveSensor(status->getOccInstanceID());
#endif
            break;
        }
    }

    if (!allActiveSensorAvailable)
    {
        // Not all sensors were available, so keep waiting
        if (!tracedSensorWait)
        {
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
            tracedSensorWait = true;
        }
        discoverTimer->restartOnce(10s);
        return;
    }

    // All sensors were found, disable the discovery timer
    if (discoverTimer->isEnabled())
    {
        discoverTimer->setEnabled(false);
    }
#ifdef PLDM
    if (throttlePldmTraceTimer->isEnabled())
    {
        // Disable throttle timer and make sure traces are not throttled
        throttlePldmTraceTimer->setEnabled(false);
        pldmHandle->setTraceThrottle(false);
    }
#endif
    if (waitingForAllOccActiveSensors)
    {
        log<level::INFO>(
            "checkAllActiveSensors(): OCC Active sensors are available");
        waitingForAllOccActiveSensors = false;
    }
    queuedActiveState.clear();
    tracedSensorWait = false;
}
#endif
304 
305 std::vector<int> Manager::findOCCsInDev()
306 {
307     std::vector<int> occs;
308     std::regex expr{R"(occ(\d+)$)"};
309 
310     for (auto& file : fs::directory_iterator("/dev"))
311     {
312         std::smatch match;
313         std::string path{file.path().string()};
314         if (std::regex_search(path, match, expr))
315         {
316             auto num = std::stoi(match[1].str());
317 
318             // /dev numbering starts at 1, ours starts at 0.
319             occs.push_back(num - 1);
320         }
321     }
322 
323     return occs;
324 }
325 
/**
 * @brief Create the OCC objects for a CPU object reported in a message.
 *
 * Reads an object path from the message, takes its last path element and
 * replaces CPU_NAME in it with OCC_NAME to form the OCC name that is handed
 * to createObjects().
 *
 * @param[in] msg - message whose first argument is the CPU object path
 *
 * @return 0 in all cases
 */
int Manager::cpuCreated(sdbusplus::message_t& msg)
{
    namespace fs = std::filesystem;

    sdbusplus::message::object_path o;
    msg.read(o);
    fs::path cpuPath(std::string(std::move(o)));

    auto name = cpuPath.filename().string();
    const auto index = name.find(CPU_NAME);
    if (index == std::string::npos)
    {
        // replace() would throw std::out_of_range for npos; an object whose
        // name does not contain CPU_NAME cannot be mapped to an OCC.
        log<level::ERR>(
            std::format(
                "Manager::cpuCreated(): {} not found in object name {}",
                CPU_NAME, name)
                .c_str());
        return 0;
    }
    name.replace(index, std::strlen(CPU_NAME), OCC_NAME);

    createObjects(name);

    return 0;
}
342 
// Create the Status and PassThrough objects for the named OCC under
// OCC_CONTROL_ROOT.  The power cap monitor is created along with the first
// status object, and a master OCC is recorded on the PowerMode object.
void Manager::createObjects(const std::string& occ)
{
    const auto occPath = fs::path(OCC_CONTROL_ROOT) / occ;

    statusObjects.emplace_back(std::make_unique<Status>(
        event, occPath.c_str(), *this,
#ifdef POWER10
        pmode,
#endif
        std::bind(&Manager::statusCallBack, this, std::placeholders::_1,
                  std::placeholders::_2)
#ifdef PLDM
            ,
        std::bind(&pldm::Interface::resetOCC, pldmHandle.get(),
                  std::placeholders::_1)
#endif
            ));

    auto& newStatus = *statusObjects.back();

    // Only one power cap monitor object is ever created
    if (!pcap)
    {
        pcap =
            std::make_unique<open_power::occ::powercap::PowerCap>(newStatus);
    }

    if (newStatus.isMasterOcc())
    {
        log<level::INFO>(
            std::format("Manager::createObjects(): OCC{} is the master",
                        newStatus.getOccInstanceID())
                .c_str());
        _pollTimer->setEnabled(false);

#ifdef POWER10
        // Set the master OCC on the PowerMode object
        pmode->setMasterOcc(occPath);
#endif
    }

    passThroughObjects.emplace_back(std::make_unique<PassThrough>(
        occPath.c_str()
#ifdef POWER10
            ,
        pmode
#endif
        ));
}
390 
// Called with the new active state of an OCC.  Keeps activeCount current
// and starts/stops the poll timer (and, on POWER10, the timer that waits
// for the remaining OCCs) to match.
void Manager::statusCallBack(instanceID instance, bool status)
{
    if (!status)
    {
        // OCC went away
        if (activeCount > 0)
        {
            --activeCount;
        }
        else
        {
            log<level::ERR>(
                std::format("OCC{} disabled, but currently no active OCCs",
                            instance)
                    .c_str());
        }

        if (activeCount == 0)
        {
            // No OCCs are running, so polling is no longer needed
            if (_pollTimer->isEnabled())
            {
                log<level::INFO>(
                    "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
                _pollTimer->setEnabled(false);
            }

#ifdef POWER10
            // stop wait timer
            if (waitForAllOccsTimer->isEnabled())
            {
                waitForAllOccsTimer->setEnabled(false);
            }
#endif
        }
#ifdef READ_OCC_SENSORS
        // Clear OCC sensors
        setSensorValueToNaN(instance);
#endif
    }
    else
    {
        // OCC went active
        ++activeCount;

#ifdef POWER10
        if (activeCount == 1)
        {
            // First OCC went active (allow some time for all OCCs to go active)
            waitForAllOccsTimer->restartOnce(60s);
        }
#endif

        if (activeCount == statusObjects.size())
        {
#ifdef POWER10
            // All OCCs are now running, so the wait timer can be stopped
            if (waitForAllOccsTimer->isEnabled())
            {
                waitForAllOccsTimer->setEnabled(false);
            }
#endif

            // Verify master OCC and start presence monitor
            validateOccMaster();
        }

        // Start poll timer if not already started
        if (!_pollTimer->isEnabled())
        {
            log<level::INFO>(
                std::format("Manager: OCCs will be polled every {} seconds",
                            pollInterval)
                    .c_str());

            // Send poll and start OCC poll timer
            pollerTimerExpired();
        }
    }

#ifdef POWER10
    // While sensors are still outstanding, re-check them on every state
    // change (only meaningful when the host is running)
    if (waitingForAllOccActiveSensors && utils::isHostRunning())
    {
        checkAllActiveSensors();
    }
#endif
}
484 
#ifdef I2C_OCC
/**
 * @brief Create a Status object for every OCC hwmon device found under
 *        DEV_PATH, then create the power cap monitor (and, on POWER10, the
 *        PowerMode object) for the first status object.
 *
 * Nothing beyond an error trace happens when no status object exists after
 * the device scan.
 */
void Manager::initStatusObjects()
{
    // Make sure we have a valid path string
    static_assert(sizeof(DEV_PATH) != 0);

#ifdef POWER10
    // Path of the first device created below.
    // NOTE(review): assumes statusObjects is empty on entry, so that this
    // path belongs to statusObjects.front() -- confirm against the caller.
    fs::path masterPath;
#endif

    auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
    for (auto& name : deviceNames)
    {
        i2c_occ::i2cToDbus(name);
        name = std::string(OCC_NAME) + '_' + name;
        auto path = fs::path(OCC_CONTROL_ROOT) / name;
        statusObjects.emplace_back(
            std::make_unique<Status>(event, path.c_str(), *this));
#ifdef POWER10
        if (masterPath.empty())
        {
            masterPath = path;
        }
#endif
    }

    if (statusObjects.empty())
    {
        // front() on an empty container is undefined behavior
        log<level::ERR>(
            "Manager::initStatusObjects(): No OCC devices found, power cap object not created");
        return;
    }

    // The first device is master occ
    pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
        *statusObjects.front());
#ifdef POWER10
    pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
                                                   powermode::PIPS_PATH);
    // Set the master OCC on the PowerMode object
    pmode->setMasterOcc(masterPath);
#endif
}
#endif
511 
512 #ifdef PLDM
// Handle an SBE timeout for the given OCC instance: when that OCC has a
// status object and is active, mark the SBE not usable and request an
// HRESET.  Otherwise nothing is done.
void Manager::sbeTimeout(unsigned int instance)
{
    const auto isInstance = [instance](const auto& status) {
        return instance == status->getOccInstanceID();
    };
    const auto found =
        std::find_if(statusObjects.begin(), statusObjects.end(), isInstance);

    if (found == statusObjects.end() || !(*found)->occActive())
    {
        return;
    }

    log<level::INFO>(
        std::format("SBE timeout, requesting HRESET (OCC{})", instance)
            .c_str());

    setSBEState(instance, SBE_STATE_NOT_USABLE);

    pldmHandle->sendHRESET(instance);
}
531 
// Apply a reported OCC active state.
// - No status object for the instance: an active report is queued (an
//   inactive one removes any queued entry) and false is returned.
// - Status object exists, host not running and status is true: the sensor
//   is marked as not received, discovery is rescheduled (POWER10) and
//   false is returned.
// - Otherwise the state is passed to the status object and its result is
//   returned.
bool Manager::updateOCCActive(instanceID instance, bool status)
{
    const auto isInstance = [instance](const auto& entry) {
        return instance == entry->getOccInstanceID();
    };
    const auto found =
        std::find_if(statusObjects.begin(), statusObjects.end(), isInstance);

    const bool hostRunning = open_power::occ::utils::isHostRunning();

    if (found == statusObjects.end())
    {
        if (hostRunning)
        {
            log<level::WARNING>(
                std::format(
                    "updateOCCActive: No status object to update for OCC{} (active={})",
                    instance, status)
                    .c_str());
        }
        else if (status)
        {
            log<level::WARNING>(
                std::format(
                    "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})",
                    instance, status)
                    .c_str());
        }

        if (status)
        {
            // OCC went active
            queuedActiveState.insert(instance);
        }
        else if (const auto queued = queuedActiveState.find(instance);
                 queued != queuedActiveState.end())
        {
            // OCC was disabled
            queuedActiveState.erase(queued);
        }
        return false;
    }

    if (hostRunning || !status)
    {
        log<level::INFO>(
            std::format("updateOCCActive: OCC{} active={}", instance, status)
                .c_str());
        (*found)->setPldmSensorReceived(true);
        return (*found)->occActive(status);
    }

    // An active report arrived while the host is not running
    log<level::WARNING>(
        std::format(
            "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received",
            instance, status)
            .c_str());
    (*found)->setPldmSensorReceived(false);
    if (!waitingForAllOccActiveSensors)
    {
        log<level::INFO>(
            "updateOCCActive: Waiting for Host and all OCC Active Sensors");
        waitingForAllOccActiveSensors = true;
    }
#ifdef POWER10
    discoverTimer->restartOnce(30s);
#endif
    return false;
}
608 
609 // Called upon pldm event To set powermode Safe Mode State for system.
610 void Manager::updateOccSafeMode(bool safeMode)
611 {
612 #ifdef POWER10
613     pmode->updateDbusSafeMode(safeMode);
614 #endif
615     // Update the processor throttle status on dbus
616     for (auto& obj : statusObjects)
617     {
618         obj->updateThrottle(safeMode, THROTTLED_SAFE);
619     }
620 }
621 
/**
 * @brief Handle the result of an HRESET request for an OCC instance.
 *
 * On success the SBE state is set to SBE_STATE_BOOTED.  On failure the SBE
 * state is set to SBE_STATE_FAILED and, when sbeCanDump() allows it, a PEL
 * is created and an SBE dump is requested through the dump manager.
 *
 * @param[in] instance - OCC instance the HRESET was requested for
 * @param[in] success - true if the HRESET succeeded
 */
void Manager::sbeHRESETResult(instanceID instance, bool success)
{
    if (success)
    {
        log<level::INFO>(
            std::format("HRESET succeeded (OCC{})", instance).c_str());

        setSBEState(instance, SBE_STATE_BOOTED);

        return;
    }

    setSBEState(instance, SBE_STATE_FAILED);

    if (sbeCanDump(instance))
    {
        log<level::INFO>(
            std::format("HRESET failed (OCC{}), triggering SBE dump", instance)
                .c_str());

        auto& bus = utils::getBus();
        uint32_t src6 = instance << 16;
        uint32_t logId =
            FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
                            src6, "SBE command timeout");

        try
        {
            constexpr auto interface = "xyz.openbmc_project.Dump.Create";
            constexpr auto function = "CreateDump";

            std::string service =
                utils::getService(OP_DUMP_OBJ_PATH, interface);
            auto method = bus.new_method_call(service.c_str(), OP_DUMP_OBJ_PATH,
                                              interface, function);

            std::map<std::string, std::variant<std::string, uint64_t>>
                createParams{
                    {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
                     uint64_t(logId)},
                    {"com.ibm.Dump.Create.CreateParameters.DumpType",
                     "com.ibm.Dump.Create.DumpType.SBE"},
                    {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
                     uint64_t(instance)},
                };

            method.append(createParams);

            auto response = bus.call(method);
        }
        catch (const sdbusplus::exception_t& e)
        {
            constexpr auto ERROR_DUMP_DISABLED =
                "xyz.openbmc_project.Dump.Create.Error.Disabled";

            // name() returns a C string, so the contents have to be
            // compared; comparing the pointers would never match.
            const char* errorName = e.name();
            if ((errorName != nullptr) &&
                (std::string{errorName} == ERROR_DUMP_DISABLED))
            {
                log<level::INFO>("Dump is disabled, skipping");
            }
            else
            {
                log<level::ERR>("Dump failed");
            }
        }
    }
}
687 
688 bool Manager::sbeCanDump(unsigned int instance)
689 {
690     struct pdbg_target* proc = getPdbgTarget(instance);
691 
692     if (!proc)
693     {
694         // allow the dump in the error case
695         return true;
696     }
697 
698     try
699     {
700         if (!openpower::phal::sbe::isDumpAllowed(proc))
701         {
702             return false;
703         }
704 
705         if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
706         {
707             return false;
708         }
709     }
710     catch (openpower::phal::exception::SbeError& e)
711     {
712         log<level::INFO>("Failed to query SBE state");
713     }
714 
715     // allow the dump in the error case
716     return true;
717 }
718 
719 void Manager::setSBEState(unsigned int instance, enum sbe_state state)
720 {
721     struct pdbg_target* proc = getPdbgTarget(instance);
722 
723     if (!proc)
724     {
725         return;
726     }
727 
728     try
729     {
730         openpower::phal::sbe::setState(proc, state);
731     }
732     catch (const openpower::phal::exception::SbeError& e)
733     {
734         log<level::ERR>("Failed to set SBE state");
735     }
736 }
737 
738 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
739 {
740     if (!pdbgInitialized)
741     {
742         try
743         {
744             openpower::phal::pdbg::init();
745             pdbgInitialized = true;
746         }
747         catch (const openpower::phal::exception::PdbgError& e)
748         {
749             log<level::ERR>("pdbg initialization failed");
750             return nullptr;
751         }
752     }
753 
754     struct pdbg_target* proc = nullptr;
755     pdbg_for_each_class_target("proc", proc)
756     {
757         if (pdbg_target_index(proc) == instance)
758         {
759             return proc;
760         }
761     }
762 
763     log<level::ERR>("Failed to get pdbg target");
764     return nullptr;
765 }
766 #endif
767 
768 void Manager::pollerTimerExpired()
769 {
770     if (!_pollTimer)
771     {
772         log<level::ERR>(
773             "Manager::pollerTimerExpired() ERROR: Timer not defined");
774         return;
775     }
776 
777     for (auto& obj : statusObjects)
778     {
779         if (!obj->occActive())
780         {
781             // OCC is not running yet
782 #ifdef READ_OCC_SENSORS
783             auto id = obj->getOccInstanceID();
784             setSensorValueToNaN(id);
785 #endif
786             continue;
787         }
788 
789         // Read sysfs to force kernel to poll OCC
790         obj->readOccState();
791 
792 #ifdef READ_OCC_SENSORS
793         // Read occ sensor values
794         getSensorValues(obj);
795 #endif
796     }
797 
798     if (activeCount > 0)
799     {
800         // Restart OCC poll timer
801         _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
802     }
803     else
804     {
805         // No OCCs running, so poll timer will not be restarted
806         log<level::INFO>(
807             std::format(
808                 "Manager::pollerTimerExpired: poll timer will not be restarted")
809                 .c_str());
810     }
811 }
812 
813 #ifdef READ_OCC_SENSORS
814 void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
815 {
816     // There may be more than one sensor with the same FRU type
817     // and label so make two passes: the first to read the temps
818     // from sysfs, and the second to put them on D-Bus after
819     // resolving any conflicts.
820     std::map<std::string, double> sensorData;
821 
822     std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
823     for (auto& file : fs::directory_iterator(path))
824     {
825         if (!std::regex_search(file.path().string(), expr))
826         {
827             continue;
828         }
829 
830         uint32_t labelValue{0};
831 
832         try
833         {
834             labelValue = readFile<uint32_t>(file.path());
835         }
836         catch (const std::system_error& e)
837         {
838             log<level::DEBUG>(
839                 std::format("readTempSensors: Failed reading {}, errno = {}",
840                             file.path().string(), e.code().value())
841                     .c_str());
842             continue;
843         }
844 
845         const std::string& tempLabel = "label";
846         const std::string filePathString = file.path().string().substr(
847             0, file.path().string().length() - tempLabel.length());
848 
849         uint32_t fruTypeValue{0};
850         try
851         {
852             fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
853         }
854         catch (const std::system_error& e)
855         {
856             log<level::DEBUG>(
857                 std::format("readTempSensors: Failed reading {}, errno = {}",
858                             filePathString + fruTypeSuffix, e.code().value())
859                     .c_str());
860             continue;
861         }
862 
863         std::string sensorPath =
864             OCC_SENSORS_ROOT + std::string("/temperature/");
865 
866         std::string dvfsTempPath;
867 
868         if (fruTypeValue == VRMVdd)
869         {
870             sensorPath.append(
871                 "vrm_vdd" + std::to_string(occInstance) + "_temp");
872         }
873         else if (fruTypeValue == processorIoRing)
874         {
875             sensorPath.append(
876                 "proc" + std::to_string(occInstance) + "_ioring_temp");
877             dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
878                            std::to_string(occInstance) + "_ioring_dvfs_temp";
879         }
880         else
881         {
882             uint16_t type = (labelValue & 0xFF000000) >> 24;
883             uint16_t instanceID = labelValue & 0x0000FFFF;
884 
885             if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
886             {
887                 if (fruTypeValue == fruTypeNotAvailable)
888                 {
889                     // Not all DIMM related temps are available to read
890                     // (no _input file in this case)
891                     continue;
892                 }
893                 auto iter = dimmTempSensorName.find(fruTypeValue);
894                 if (iter == dimmTempSensorName.end())
895                 {
896                     log<level::ERR>(
897                         std::format(
898                             "readTempSensors: Fru type error! fruTypeValue = {}) ",
899                             fruTypeValue)
900                             .c_str());
901                     continue;
902                 }
903 
904                 sensorPath.append(
905                     "dimm" + std::to_string(instanceID) + iter->second);
906 
907                 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
908                                dimmDVFSSensorName.at(fruTypeValue);
909             }
910             else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
911             {
912                 if (fruTypeValue == processorCore)
913                 {
914                     // The OCC reports small core temps, of which there are
915                     // two per big core.  All current P10 systems are in big
916                     // core mode, so use a big core name.
917                     uint16_t coreNum = instanceID / 2;
918                     uint16_t tempNum = instanceID % 2;
919                     sensorPath.append("proc" + std::to_string(occInstance) +
920                                       "_core" + std::to_string(coreNum) + "_" +
921                                       std::to_string(tempNum) + "_temp");
922 
923                     dvfsTempPath =
924                         std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
925                         std::to_string(occInstance) + "_core_dvfs_temp";
926                 }
927                 else
928                 {
929                     continue;
930                 }
931             }
932             else
933             {
934                 continue;
935             }
936         }
937 
938         // The dvfs temp file only needs to be read once per chip per type.
939         if (!dvfsTempPath.empty() &&
940             !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
941         {
942             try
943             {
944                 auto dvfsValue = readFile<double>(filePathString + maxSuffix);
945 
946                 dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
947                     dvfsTempPath, dvfsValue * std::pow(10, -3));
948             }
949             catch (const std::system_error& e)
950             {
951                 log<level::DEBUG>(
952                     std::format(
953                         "readTempSensors: Failed reading {}, errno = {}",
954                         filePathString + maxSuffix, e.code().value())
955                         .c_str());
956             }
957         }
958 
959         uint32_t faultValue{0};
960         try
961         {
962             faultValue = readFile<uint32_t>(filePathString + faultSuffix);
963         }
964         catch (const std::system_error& e)
965         {
966             log<level::DEBUG>(
967                 std::format("readTempSensors: Failed reading {}, errno = {}",
968                             filePathString + faultSuffix, e.code().value())
969                     .c_str());
970             continue;
971         }
972 
973         double tempValue{0};
        // NOTE: if OCC sends back 0xFF, kernel sets this fault value to 1.
975         if (faultValue != 0)
976         {
977             tempValue = std::numeric_limits<double>::quiet_NaN();
978         }
979         else
980         {
981             // Read the temperature
982             try
983             {
984                 tempValue = readFile<double>(filePathString + inputSuffix);
985             }
986             catch (const std::system_error& e)
987             {
988                 log<level::DEBUG>(
989                     std::format(
990                         "readTempSensors: Failed reading {}, errno = {}",
991                         filePathString + inputSuffix, e.code().value())
992                         .c_str());
993 
994                 // if errno == EAGAIN(Resource temporarily unavailable) then set
995                 // temp to 0, to avoid using old temp, and affecting FAN
996                 // Control.
997                 if (e.code().value() == EAGAIN)
998                 {
999                     tempValue = 0;
1000                 }
1001                 // else the errno would be something like
1002                 //     EBADF(Bad file descriptor)
1003                 // or ENOENT(No such file or directory)
1004                 else
1005                 {
1006                     continue;
1007                 }
1008             }
1009         }
1010 
        // If this object path already has a value, only overwrite
        // it if the previous one was a NaN or a smaller value.
1013         auto existing = sensorData.find(sensorPath);
1014         if (existing != sensorData.end())
1015         {
1016             // Multiple sensors found for this FRU type
1017             if ((std::isnan(existing->second) && (tempValue == 0)) ||
1018                 ((existing->second == 0) && std::isnan(tempValue)))
1019             {
1020                 // One of the redundant sensors has failed (0xFF/nan), and the
1021                 // other sensor has no reading (0), so set the FRU to NaN to
1022                 // force fan increase
1023                 tempValue = std::numeric_limits<double>::quiet_NaN();
1024                 existing->second = tempValue;
1025             }
1026             if (std::isnan(existing->second) || (tempValue > existing->second))
1027             {
1028                 existing->second = tempValue;
1029             }
1030         }
1031         else
1032         {
1033             // First sensor for this FRU type
1034             sensorData[sensorPath] = tempValue;
1035         }
1036     }
1037 
1038     // Now publish the values on D-Bus.
1039     for (const auto& [objectPath, value] : sensorData)
1040     {
1041         dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
1042                                                     value * std::pow(10, -3));
1043 
1044         dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1045             objectPath, !std::isnan(value));
1046 
1047         if (existingSensors.find(objectPath) == existingSensors.end())
1048         {
1049             dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1050                 objectPath);
1051         }
1052 
1053         existingSensors[objectPath] = occInstance;
1054     }
1055 }
1056 
1057 std::optional<std::string>
1058     Manager::getPowerLabelFunctionID(const std::string& value)
1059 {
1060     // If the value is "system", then the FunctionID is "system".
1061     if (value == "system")
1062     {
1063         return value;
1064     }
1065 
1066     // If the value is not "system", then the label value have 3 numbers, of
1067     // which we only care about the middle one:
1068     // <sensor id>_<function id>_<apss channel>
1069     // eg: The value is "0_10_5" , then the FunctionID is "10".
1070     if (value.find("_") == std::string::npos)
1071     {
1072         return std::nullopt;
1073     }
1074 
1075     auto powerLabelValue = value.substr((value.find("_") + 1));
1076 
1077     if (powerLabelValue.find("_") == std::string::npos)
1078     {
1079         return std::nullopt;
1080     }
1081 
1082     return powerLabelValue.substr(0, powerLabelValue.find("_"));
1083 }
1084 
// readPowerSensors():
// Walks the given directory looking for power<N>_label files.  For each
// label whose function ID has an entry in powerSensorName, the matching
// power<N>_input file is read and the reading is published on D-Bus with
// unit Watts and operational status true.  Every published sensor is
// recorded in existingSensors against the supplied OCC id.
// Files that cannot be read, or labels that cannot be mapped to a sensor
// name, are skipped.
void Manager::readPowerSensors(const fs::path& path, uint32_t id)
{
    const std::regex labelPattern{"power\\d+_label$"}; // Example: power5_label
    const std::string labelSuffix{"label"};

    for (auto& entry : fs::directory_iterator(path))
    {
        const std::string labelFile = entry.path().string();
        if (!std::regex_search(labelFile, labelPattern))
        {
            continue;
        }

        std::string label;
        try
        {
            label = readFile<std::string>(labelFile);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            labelFile, e.code().value())
                    .c_str());
            continue;
        }

        const auto functionID = getPowerLabelFunctionID(label);
        if (!functionID)
        {
            continue;
        }

        // Only function IDs with a known sensor name are published
        const auto nameEntry = powerSensorName.find(*functionID);
        if (nameEntry == powerSensorName.end())
        {
            continue;
        }

        std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
        sensorPath.append(nameEntry->second);

        // Strip "label" so the other files of this sensor can be addressed
        // by appending their suffix (eg: ".../power5_" + "input")
        const std::string filePrefix =
            labelFile.substr(0, labelFile.length() - labelSuffix.length());

        double reading{0};
        try
        {
            reading = readFile<double>(filePrefix + inputSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            filePrefix + inputSuffix, e.code().value())
                    .c_str());
            continue;
        }

        auto& occDBus = dbus::OccDBusSensors::getOccDBus();

        occDBus.setUnit(sensorPath,
                        "xyz.openbmc_project.Sensor.Value.Unit.Watts");

        // The raw reading is scaled down by 10^-3 twice before publishing
        occDBus.setValue(sensorPath,
                         reading * std::pow(10, -3) * std::pow(10, -3));

        occDBus.setOperationalStatus(sensorPath, true);

        // The chassis association is only needed the first time a sensor
        // is seen
        if (existingSensors.find(sensorPath) == existingSensors.end())
        {
            occDBus.setChassisAssociation(sensorPath);
        }

        existingSensors[sensorPath] = id;
    }
}
1162 
1163 void Manager::setSensorValueToNaN(uint32_t id) const
1164 {
1165     for (const auto& [sensorPath, occId] : existingSensors)
1166     {
1167         if (occId == id)
1168         {
1169             dbus::OccDBusSensors::getOccDBus().setValue(
1170                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1171 
1172             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1173                 sensorPath, true);
1174         }
1175     }
1176     return;
1177 }
1178 
1179 void Manager::setSensorValueToNonFunctional(uint32_t id) const
1180 {
1181     for (const auto& [sensorPath, occId] : existingSensors)
1182     {
1183         if (occId == id)
1184         {
1185             dbus::OccDBusSensors::getOccDBus().setValue(
1186                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1187 
1188             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1189                 sensorPath, false);
1190         }
1191     }
1192     return;
1193 }
1194 
// getSensorValues():
// Reads the sensors of the given OCC.
// - if the OCC's hwmon path exists, the temperature sensors are read and,
//   for the master OCC only, the power sensors as well
// - if the hwmon path is missing, an error is traced; the trace is issued
//   once per OCC and re-armed when the path has been seen again
void Manager::getSensorValues(std::unique_ptr<Status>& occ)
{
    // "Missing path already traced" flags, indexed by OCC instance ID
    constexpr uint32_t maxTracedOccs = 8;
    static bool tracedError[maxTracedOccs] = {false};

    const fs::path sensorPath = occ->getHwmonPath();
    const uint32_t id = occ->getOccInstanceID();

    // An instance ID beyond the end of the table must never be used as an
    // index.  Such an OCC is still processed; its error trace simply cannot
    // be throttled.
    const bool hasTraceSlot = (id < maxTracedOccs);

    if (fs::exists(sensorPath))
    {
        // Read temperature sensors
        readTempSensors(sensorPath, id);

        if (occ->isMasterOcc())
        {
            // Read power sensors
            readPowerSensors(sensorPath, id);
        }

        if (hasTraceSlot)
        {
            // Path is present, so a later disappearance gets traced again
            tracedError[id] = false;
        }
        return;
    }

    if (!hasTraceSlot || !tracedError[id])
    {
        log<level::ERR>(
            std::format(
                "Manager::getSensorValues: OCC{} sensor path missing: {}",
                id, sensorPath.c_str())
                .c_str());

        if (hasTraceSlot)
        {
            tracedError[id] = true;
        }
    }
}
1228 #endif
1229 
// readAltitude():
// Fetches the altitude property from D-Bus and stores it, rounded to the
// nearest meter, in the altitude member.
// - negative readings are stored as 0
// - a reading that is not below 0xFFFF is rejected and altitude is left
//   untouched
// - if the D-Bus read fails, altitude is set to 0xFFFF (not available)
// Failures are traced once and re-armed by the next good reading.
void Manager::readAltitude()
{
    static bool traceAltitudeErr = true;

    try
    {
        utils::PropertyValue altitudeProperty = utils::getProperty(
            ALTITUDE_PATH, ALTITUDE_INTERFACE, ALTITUDE_PROP);
        const auto sensorVal = std::get<double>(altitudeProperty);

        // Written as a negated "<" so a NaN reading is rejected as well
        if (!(sensorVal < 0xFFFF))
        {
            if (traceAltitudeErr)
            {
                traceAltitudeErr = false;
                log<level::DEBUG>(
                    std::format("Invalid altitude value: {}", sensorVal)
                        .c_str());
            }
            return;
        }

        if (sensorVal < 0)
        {
            altitude = 0;
        }
        else
        {
            // Round to nearest meter
            altitude = uint16_t(sensorVal + 0.5);
        }

        log<level::DEBUG>(
            std::format("readAltitude: sensor={} ({}m)", sensorVal, altitude)
                .c_str());

        // Good reading, so the next failure gets traced again
        traceAltitudeErr = true;
    }
    catch (const sdbusplus::exception_t& e)
    {
        if (traceAltitudeErr)
        {
            traceAltitudeErr = false;
            log<level::INFO>(
                std::format("Unable to read Altitude: {}", e.what()).c_str());
        }
        altitude = 0xFFFF; // not available
    }
}
1279 
// ambientCallback():
// Handles a property change message for the ambient temperature.
// The new temperature is rounded to the nearest degree C (negative values
// become 0, NaN becomes 0xFF).  When the rounded value differs from the
// stored ambient, the stored value is updated, the altitude is re-read if
// it is still unavailable (0xFFFF) and, on POWER10, the ambient and
// altitude are sent to every active OCC.
void Manager::ambientCallback(sdbusplus::message_t& msg)
{
    std::string msgSensor;
    std::map<std::string, std::variant<double>> msgData;
    msg.read(msgSensor, msgData);

    const auto ambientEntry = msgData.find(AMBIENT_PROP);
    if (ambientEntry == msgData.end())
    {
        log<level::DEBUG>("ambientCallback: Unknown ambient property changed");
        return;
    }

    const double currentTemp = std::get<double>(ambientEntry->second);

    // 0xFF is kept when the reading is NaN
    uint8_t truncatedTemp = 0xFF;
    if (!std::isnan(currentTemp))
    {
        // Negative temperatures are reported as 0, everything else is
        // rounded to the nearest degree C
        truncatedTemp =
            static_cast<uint8_t>((currentTemp < 0) ? 0.0 : currentTemp + 0.5);
    }

    // Nothing to do unless the (rounded) ambient changed
    if (truncatedTemp == ambient)
    {
        return;
    }

    log<level::DEBUG>(
        std::format("ambientCallback: Ambient change from {} to {}C", ambient,
                    currentTemp)
            .c_str());

    ambient = truncatedTemp;
    if (altitude == 0xFFFF)
    {
        // No altitude yet, try reading again
        readAltitude();
    }

    log<level::DEBUG>(
        std::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient,
                    altitude)
            .c_str());
#ifdef POWER10
    // Send ambient and altitude to all OCCs
    for (auto& obj : statusObjects)
    {
        if (obj->occActive())
        {
            obj->sendAmbient(ambient, altitude);
        }
    }
#endif // POWER10
}
1344 
1345 // return the current ambient and altitude readings
1346 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1347                              uint16_t& altitudeValue) const
1348 {
1349     ambientValid = true;
1350     ambientTemp = ambient;
1351     altitudeValue = altitude;
1352 
1353     if (ambient == 0xFF)
1354     {
1355         ambientValid = false;
1356     }
1357 }
1358 
1359 #ifdef POWER10
// occsNotAllRunning():
// Called when waitForAllOccsTimer expires
// (the timer is started, for 60 seconds, after the first OCC goes active).
// Traces a warning if the number of active OCCs differs from the number of
// status objects and then validates the master OCC.
void Manager::occsNotAllRunning()
{
    const bool allOccsActive = (activeCount == statusObjects.size());
    if (!allOccsActive)
    {
        // Not all OCCs went active.
        // Procs may be garded, so may be expected
        log<level::WARNING>(
            std::format(
                "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})",
                activeCount, statusObjects.size())
                .c_str());
    }

    validateOccMaster();
}
1377 
1378 #ifdef PLDM
1379 // Called when throttlePldmTraceTimer expires.
1380 // If this timer expires, that indicates there are no OCC active sensor PDRs
1381 // found which will trigger pldm traces to be throttled.
1382 // The second time this timer expires, a PEL will get created.
1383 void Manager::throttlePldmTraceExpired()
1384 {
1385     if (utils::isHostRunning())
1386     {
1387         if (!onPldmTimeoutCreatePel)
1388         {
1389             // Throttle traces
1390             pldmHandle->setTraceThrottle(true);
1391             // Restart timer to log a PEL when timer expires
1392             onPldmTimeoutCreatePel = true;
1393             throttlePldmTraceTimer->restartOnce(40min);
1394         }
1395         else
1396         {
1397             log<level::ERR>(
1398                 "throttlePldmTraceExpired(): OCC active sensors still not available!");
1399             // Create PEL
1400             createPldmSensorPEL();
1401         }
1402     }
1403     else
1404     {
1405         // Make sure traces are not throttled
1406         pldmHandle->setTraceThrottle(false);
1407         log<level::INFO>(
1408             "throttlePldmTraceExpired(): host it not running ignoring sensor timer");
1409     }
1410 }
1411 
// createPldmSensorPEL():
// Creates a PEL (severity Warning) for the missing OCC sensors error path
// by calling CreatePELWithFFDCFiles on the logging service.  The PEL
// carries this process' PID as additional data and occ-control journal
// traces as FFDC.  A D-Bus failure while creating the PEL is only traced.
void Manager::createPldmSensorPEL()
{
    std::map<std::string, std::string> additionalData;
    additionalData.emplace("_PID", std::to_string(getpid()));

    auto descriptor = Error::Descriptor(MISSING_OCC_SENSORS_PATH);

    log<level::INFO>(
        "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs");

    auto& bus = utils::getBus();

    try
    {
        namespace pelLogging =
            sdbusplus::xyz::openbmc_project::Logging::server;

        static constexpr auto loggingObjectPath =
            "/xyz/openbmc_project/logging";
        static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL";

        FFDCFiles ffdc;
        // Add occ-control journal traces to PEL FFDC.
        // The returned object is held until the end of this scope.
        auto occJournalFile =
            FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40);

        const std::string service =
            utils::getService(loggingObjectPath, opLoggingInterface);
        auto createPel =
            bus.new_method_call(service.c_str(), loggingObjectPath,
                                opLoggingInterface, "CreatePELWithFFDCFiles");

        // Set level to Warning (Predictive).
        auto severity =
            pelLogging::convertForMessage(pelLogging::Entry::Level::Warning);

        createPel.append(descriptor.path, severity, additionalData, ffdc);
        bus.call(createPel);
    }
    catch (const sdbusplus::exception_t& e)
    {
        log<level::ERR>(
            std::format("Failed to create MISSING_OCC_SENSORS PEL: {}",
                        e.what())
                .c_str());
    }
}
1459 #endif // PLDM
1460 #endif // POWER10
1461 
// validateOccMaster():
// Verify single master OCC and start presence monitor
// - (POWER10) an OCC that is not yet active is marked active if its active
//   state was queued during discovery, otherwise its active sensor is
//   checked (PLDM); if the host is not running the validation is abandoned
// - every master OCC gets its presence watch added
// - a second master, or no master at all, is traced as an error and a
//   device error (PRESENCE_ERROR_PATH) is raised to request a reset
// - when exactly one master was found, safe mode is cleared on D-Bus
//   (POWER10)
void Manager::validateOccMaster()
{
    // Instance of the first master found, -1 while none has been seen
    int masterInstance = -1;
    for (auto& obj : statusObjects)
    {
        auto instance = obj->getOccInstanceID();
#ifdef POWER10
        if (!obj->occActive())
        {
            if (utils::isHostRunning())
            {
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    log<level::INFO>(
                        std::format(
                            "validateOccMaster: OCC{} is ACTIVE (queued)",
                            instance)
                            .c_str());
                    obj->occActive(true);
                }
                else
                {
                    // OCC does not appear to be active yet, check active sensor
#ifdef PLDM
                    pldmHandle->checkActiveSensor(instance);
#endif
                    if (obj->occActive())
                    {
                        log<level::INFO>(
                            std::format(
                                "validateOccMaster: OCC{} is ACTIVE after reading sensor",
                                instance)
                                .c_str());
                    }
                }
            }
            else
            {
                log<level::WARNING>(
                    std::format(
                        "validateOccMaster: HOST is not running (OCC{})",
                        instance)
                        .c_str());
                return;
            }
        }
#endif // POWER10

        if (obj->isMasterOcc())
        {
            obj->addPresenceWatchMaster();

            if (masterInstance == -1)
            {
                masterInstance = instance;
            }
            else
            {
                log<level::ERR>(
                    std::format(
                        "validateOccMaster: Multiple OCC masters! ({} and {})",
                        masterInstance, instance)
                        .c_str());
                // request reset
                obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
            }
        }
    }

    if (masterInstance < 0)
    {
        log<level::ERR>(
            std::format("validateOccMaster: Master OCC not found! (of {} OCCs)",
                        statusObjects.size())
                .c_str());
        // request reset
        // (front() must not be called on an empty container, so the request
        // is only made when there is a status object to raise it on)
        if (!statusObjects.empty())
        {
            statusObjects.front()->deviceError(
                Error::Descriptor(PRESENCE_ERROR_PATH));
        }
    }
    else
    {
        log<level::INFO>(
            std::format("validateOccMaster: OCC{} is master of {} OCCs",
                        masterInstance, activeCount)
                .c_str());
#ifdef POWER10
        pmode->updateDbusSafeMode(false);
#endif
    }
}
1556 
1557 void Manager::updatePcapBounds() const
1558 {
1559     if (pcap)
1560     {
1561         pcap->updatePcapBounds();
1562     }
1563 }
1564 
1565 } // namespace occ
1566 } // namespace open_power
1567