1 #include "config.h"
2 
3 #include "occ_manager.hpp"
4 
5 #include "i2c_occ.hpp"
6 #include "occ_dbus.hpp"
7 #include "occ_errors.hpp"
8 #include "utils.hpp"
9 
10 #include <phosphor-logging/elog-errors.hpp>
11 #include <phosphor-logging/log.hpp>
12 #include <xyz/openbmc_project/Common/error.hpp>
13 
#include <cerrno>
#include <chrono>
#include <cmath>
#include <filesystem>
#include <fstream>
#include <regex>
#include <string_view>
#include <system_error>
19 
20 namespace open_power
21 {
22 namespace occ
23 {
24 
// FRU type value for which DIMM temperature entries are skipped (per the
// reader below, such entries have no _input file to read).
constexpr uint32_t fruTypeNotAvailable{0xFF};

// Attribute suffixes appended to a sensor's "temp<N>_" sysfs prefix.
constexpr const char* fruTypeSuffix{"fru_type"};
constexpr const char* faultSuffix{"fault"};
constexpr const char* inputSuffix{"input"};
constexpr const char* maxSuffix{"max"};

// While this file exists, OCC discovery is postponed and retried later.
const char* const HOST_ON_FILE{"/run/openbmc/host@0-on"};
32 
33 using namespace phosphor::logging;
34 using namespace std::literals::chrono_literals;
35 
/**
 * @brief Read one value of type T from the start of a file.
 *
 * @tparam T - Type extracted with operator>> (e.g. uint32_t, double)
 * @param[in] path - File to read
 *
 * @return The extracted value
 *
 * @throw std::system_error (generic_category) when the file cannot be opened
 *        or the value cannot be extracted. The code is the errno left by the
 *        failing call, or EIO when the failure did not set errno (for
 *        example a parse error), so it is never 0 and never a stale value
 *        from an unrelated earlier call.
 */
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    // NOTE: eofbit is part of the mask, so reaching end-of-file during the
    // extraction (e.g. no whitespace after the value) also throws.
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit |
                   std::ifstream::eofbit);
    T data{};

    // Stream-level failures (such as a failed conversion) do not set errno;
    // clear it so an old value cannot be mistaken for the cause.
    errno = 0;

    try
    {
        ifs.open(path);
        ifs >> data;
        ifs.close();
    }
    catch (const std::exception&)
    {
        const int err = (errno != 0) ? errno : EIO;
        throw std::system_error(err, std::generic_category());
    }

    return data;
}
58 
59 // findAndCreateObjects():
60 // Takes care of getting the required objects created and
61 // finds the available devices/processors.
62 // (function is called everytime the discoverTimer expires)
63 // - create the PowerMode object to control OCC modes
64 // - create statusObjects for each OCC device found
65 // - waits for OCC Active sensors PDRs to become available
66 // - restart discoverTimer if all data is not available yet
findAndCreateObjects()67 void Manager::findAndCreateObjects()
68 {
69 #ifndef POWER10
70     for (auto id = 0; id < MAX_CPUS; ++id)
71     {
72         // Create one occ per cpu
73         auto occ = std::string(OCC_NAME) + std::to_string(id);
74         createObjects(occ);
75     }
76 #else
77     if (!pmode)
78     {
79         // Create the power mode object
80         pmode = std::make_unique<powermode::PowerMode>(
81             *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
82     }
83 
84     if (!fs::exists(HOST_ON_FILE))
85     {
86         static bool statusObjCreated = false;
87         if (!statusObjCreated)
88         {
89             // Create the OCCs based on on the /dev/occX devices
90             auto occs = findOCCsInDev();
91 
92             if (occs.empty() || (prevOCCSearch.size() != occs.size()))
93             {
94                 // Something changed or no OCCs yet, try again in 10s.
95                 // Note on the first pass prevOCCSearch will be empty,
96                 // so there will be at least one delay to give things
97                 // a chance to settle.
98                 prevOCCSearch = occs;
99 
100                 log<level::INFO>(
101                     std::format(
102                         "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})",
103                         occs.size())
104                         .c_str());
105 
106                 discoverTimer->restartOnce(10s);
107             }
108             else
109             {
110                 // All OCCs appear to be available, create status objects
111 
112                 // createObjects requires OCC0 first.
113                 std::sort(occs.begin(), occs.end());
114 
115                 log<level::INFO>(
116                     std::format(
117                         "Manager::findAndCreateObjects(): Creating {} OCC Status Objects",
118                         occs.size())
119                         .c_str());
120                 for (auto id : occs)
121                 {
122                     createObjects(std::string(OCC_NAME) + std::to_string(id));
123                 }
124                 statusObjCreated = true;
125                 waitingForAllOccActiveSensors = true;
126 
127                 // Find/update the processor path associated with each OCC
128                 for (auto& obj : statusObjects)
129                 {
130                     obj->updateProcAssociation();
131                 }
132             }
133         }
134 
135         if (statusObjCreated && waitingForAllOccActiveSensors)
136         {
137             static bool tracedHostWait = false;
138             if (utils::isHostRunning())
139             {
140                 if (tracedHostWait)
141                 {
142                     log<level::INFO>(
143                         "Manager::findAndCreateObjects(): Host is running");
144                     tracedHostWait = false;
145                 }
146                 checkAllActiveSensors();
147             }
148             else
149             {
150                 if (!tracedHostWait)
151                 {
152                     log<level::INFO>(
153                         "Manager::findAndCreateObjects(): Waiting for host to start");
154                     tracedHostWait = true;
155                 }
156                 discoverTimer->restartOnce(30s);
157 #ifdef PLDM
158                 if (throttlePldmTraceTimer->isEnabled())
159                 {
160                     // Host is no longer running, disable throttle timer and
161                     // make sure traces are not throttled
162                     log<level::INFO>(
163                         "findAndCreateObjects(): disabling sensor timer");
164                     throttlePldmTraceTimer->setEnabled(false);
165                     pldmHandle->setTraceThrottle(false);
166                 }
167 #endif
168             }
169         }
170     }
171     else
172     {
173         log<level::INFO>(
174             std::format(
175                 "Manager::findAndCreateObjects(): Waiting for {} to complete...",
176                 HOST_ON_FILE)
177                 .c_str());
178         discoverTimer->restartOnce(10s);
179     }
180 #endif
181 }
182 
183 #ifdef POWER10
184 // Check if all occActive sensors are available
checkAllActiveSensors()185 void Manager::checkAllActiveSensors()
186 {
187     static bool allActiveSensorAvailable = false;
188     static bool tracedSensorWait = false;
189     static bool waitingForHost = false;
190 
191     if (open_power::occ::utils::isHostRunning())
192     {
193         if (waitingForHost)
194         {
195             waitingForHost = false;
196             log<level::INFO>("checkAllActiveSensors(): Host is now running");
197         }
198 
199         // Start with the assumption that all are available
200         allActiveSensorAvailable = true;
201         for (auto& obj : statusObjects)
202         {
203             if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
204             {
205                 auto instance = obj->getOccInstanceID();
206                 // Check if sensor was queued while waiting for discovery
207                 auto match = queuedActiveState.find(instance);
208                 if (match != queuedActiveState.end())
209                 {
210                     queuedActiveState.erase(match);
211                     log<level::INFO>(
212                         std::format(
213                             "checkAllActiveSensors(): OCC{} is ACTIVE (queued)",
214                             instance)
215                             .c_str());
216                     obj->occActive(true);
217                 }
218                 else
219                 {
220                     allActiveSensorAvailable = false;
221                     if (!tracedSensorWait)
222                     {
223                         log<level::INFO>(
224                             std::format(
225                                 "checkAllActiveSensors(): Waiting on OCC{} Active sensor",
226                                 instance)
227                                 .c_str());
228                         tracedSensorWait = true;
229 #ifdef PLDM
230                         // Make sure PLDM traces are not throttled
231                         pldmHandle->setTraceThrottle(false);
232                         // Start timer to throttle PLDM traces when timer
233                         // expires
234                         onPldmTimeoutCreatePel = false;
235                         throttlePldmTraceTimer->restartOnce(5min);
236 #endif
237                     }
238 #ifdef PLDM
239                     pldmHandle->checkActiveSensor(obj->getOccInstanceID());
240 #endif
241                     break;
242                 }
243             }
244         }
245     }
246     else
247     {
248         if (!waitingForHost)
249         {
250             waitingForHost = true;
251             log<level::INFO>(
252                 "checkAllActiveSensors(): Waiting for host to start");
253 #ifdef PLDM
254             if (throttlePldmTraceTimer->isEnabled())
255             {
256                 // Host is no longer running, disable throttle timer and
257                 // make sure traces are not throttled
258                 log<level::INFO>(
259                     "checkAllActiveSensors(): disabling sensor timer");
260                 throttlePldmTraceTimer->setEnabled(false);
261                 pldmHandle->setTraceThrottle(false);
262             }
263 #endif
264         }
265     }
266 
267     if (allActiveSensorAvailable)
268     {
269         // All sensors were found, disable the discovery timer
270         if (discoverTimer->isEnabled())
271         {
272             discoverTimer->setEnabled(false);
273         }
274 #ifdef PLDM
275         if (throttlePldmTraceTimer->isEnabled())
276         {
277             // Disable throttle timer and make sure traces are not throttled
278             throttlePldmTraceTimer->setEnabled(false);
279             pldmHandle->setTraceThrottle(false);
280         }
281 #endif
282         if (waitingForAllOccActiveSensors)
283         {
284             log<level::INFO>(
285                 "checkAllActiveSensors(): OCC Active sensors are available");
286             waitingForAllOccActiveSensors = false;
287         }
288         queuedActiveState.clear();
289         tracedSensorWait = false;
290     }
291     else
292     {
293         // Not all sensors were available, so keep waiting
294         if (!tracedSensorWait)
295         {
296             log<level::INFO>(
297                 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
298             tracedSensorWait = true;
299         }
300         discoverTimer->restartOnce(10s);
301     }
302 }
303 #endif
304 
findOCCsInDev()305 std::vector<int> Manager::findOCCsInDev()
306 {
307     std::vector<int> occs;
308     std::regex expr{R"(occ(\d+)$)"};
309 
310     for (auto& file : fs::directory_iterator("/dev"))
311     {
312         std::smatch match;
313         std::string path{file.path().string()};
314         if (std::regex_search(path, match, expr))
315         {
316             auto num = std::stoi(match[1].str());
317 
318             // /dev numbering starts at 1, ours starts at 0.
319             occs.push_back(num - 1);
320         }
321     }
322 
323     return occs;
324 }
325 
cpuCreated(sdbusplus::message_t & msg)326 int Manager::cpuCreated(sdbusplus::message_t& msg)
327 {
328     namespace fs = std::filesystem;
329 
330     sdbusplus::message::object_path o;
331     msg.read(o);
332     fs::path cpuPath(std::string(std::move(o)));
333 
334     auto name = cpuPath.filename().string();
335     auto index = name.find(CPU_NAME);
336     name.replace(index, std::strlen(CPU_NAME), OCC_NAME);
337 
338     createObjects(name);
339 
340     return 0;
341 }
342 
createObjects(const std::string & occ)343 void Manager::createObjects(const std::string& occ)
344 {
345     auto path = fs::path(OCC_CONTROL_ROOT) / occ;
346 
347     statusObjects.emplace_back(std::make_unique<Status>(
348         event, path.c_str(), *this,
349 #ifdef POWER10
350         pmode,
351 #endif
352         std::bind(std::mem_fn(&Manager::statusCallBack), this,
353                   std::placeholders::_1, std::placeholders::_2)
354 #ifdef PLDM
355             ,
356         std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(),
357                   std::placeholders::_1)
358 #endif
359             ));
360 
361     // Create the power cap monitor object
362     if (!pcap)
363     {
364         pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
365             *statusObjects.back());
366     }
367 
368     if (statusObjects.back()->isMasterOcc())
369     {
370         log<level::INFO>(
371             std::format("Manager::createObjects(): OCC{} is the master",
372                         statusObjects.back()->getOccInstanceID())
373                 .c_str());
374         _pollTimer->setEnabled(false);
375 
376 #ifdef POWER10
377         // Set the master OCC on the PowerMode object
378         pmode->setMasterOcc(path);
379 #endif
380     }
381 
382     passThroughObjects.emplace_back(std::make_unique<PassThrough>(path.c_str()
383 #ifdef POWER10
384                                                                       ,
385                                                                   pmode
386 #endif
387                                                                   ));
388 }
389 
statusCallBack(instanceID instance,bool status)390 void Manager::statusCallBack(instanceID instance, bool status)
391 {
392     if (status == true)
393     {
394         // OCC went active
395         ++activeCount;
396 
397 #ifdef POWER10
398         if (activeCount == 1)
399         {
400             // First OCC went active (allow some time for all OCCs to go active)
401             waitForAllOccsTimer->restartOnce(60s);
402         }
403 #endif
404 
405         if (activeCount == statusObjects.size())
406         {
407 #ifdef POWER10
408             // All OCCs are now running
409             if (waitForAllOccsTimer->isEnabled())
410             {
411                 // stop occ wait timer
412                 waitForAllOccsTimer->setEnabled(false);
413             }
414 #endif
415 
416             // Verify master OCC and start presence monitor
417             validateOccMaster();
418         }
419 
420         // Start poll timer if not already started
421         if (!_pollTimer->isEnabled())
422         {
423             log<level::INFO>(
424                 std::format("Manager: OCCs will be polled every {} seconds",
425                             pollInterval)
426                     .c_str());
427 
428             // Send poll and start OCC poll timer
429             pollerTimerExpired();
430         }
431     }
432     else
433     {
434         // OCC went away
435         if (activeCount > 0)
436         {
437             --activeCount;
438         }
439         else
440         {
441             log<level::ERR>(
442                 std::format("OCC{} disabled, but currently no active OCCs",
443                             instance)
444                     .c_str());
445         }
446 
447         if (activeCount == 0)
448         {
449             // No OCCs are running
450 
451             // Stop OCC poll timer
452             if (_pollTimer->isEnabled())
453             {
454                 log<level::INFO>(
455                     "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
456                 _pollTimer->setEnabled(false);
457             }
458 
459 #ifdef POWER10
460             // stop wait timer
461             if (waitForAllOccsTimer->isEnabled())
462             {
463                 waitForAllOccsTimer->setEnabled(false);
464             }
465 #endif
466         }
467 #ifdef READ_OCC_SENSORS
468         // Clear OCC sensors
469         setSensorValueToNaN(instance);
470 #endif
471     }
472 
473 #ifdef POWER10
474     if (waitingForAllOccActiveSensors)
475     {
476         if (utils::isHostRunning())
477         {
478             checkAllActiveSensors();
479         }
480     }
481 #endif
482 }
483 
484 #ifdef I2C_OCC
initStatusObjects()485 void Manager::initStatusObjects()
486 {
487     // Make sure we have a valid path string
488     static_assert(sizeof(DEV_PATH) != 0);
489 
490     auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
491     for (auto& name : deviceNames)
492     {
493         i2c_occ::i2cToDbus(name);
494         name = std::string(OCC_NAME) + '_' + name;
495         auto path = fs::path(OCC_CONTROL_ROOT) / name;
496         statusObjects.emplace_back(
497             std::make_unique<Status>(event, path.c_str(), *this));
498     }
499     // The first device is master occ
500     pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
501         *statusObjects.front());
502 #ifdef POWER10
503     pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
504                                                    powermode::PIPS_PATH);
505     // Set the master OCC on the PowerMode object
506     pmode->setMasterOcc(path);
507 #endif
508 }
509 #endif
510 
511 #ifdef PLDM
sbeTimeout(unsigned int instance)512 void Manager::sbeTimeout(unsigned int instance)
513 {
514     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
515                             [instance](const auto& obj) {
516         return instance == obj->getOccInstanceID();
517     });
518 
519     if (obj != statusObjects.end() && (*obj)->occActive())
520     {
521         log<level::INFO>(
522             std::format("SBE timeout, requesting HRESET (OCC{})", instance)
523                 .c_str());
524 
525         setSBEState(instance, SBE_STATE_NOT_USABLE);
526 
527         pldmHandle->sendHRESET(instance);
528     }
529 }
530 
updateOCCActive(instanceID instance,bool status)531 bool Manager::updateOCCActive(instanceID instance, bool status)
532 {
533     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
534                             [instance](const auto& obj) {
535         return instance == obj->getOccInstanceID();
536     });
537 
538     const bool hostRunning = open_power::occ::utils::isHostRunning();
539     if (obj != statusObjects.end())
540     {
541         if (!hostRunning && (status == true))
542         {
543             log<level::WARNING>(
544                 std::format(
545                     "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received",
546                     instance, status)
547                     .c_str());
548             (*obj)->setPldmSensorReceived(false);
549             if (!waitingForAllOccActiveSensors)
550             {
551                 log<level::INFO>(
552                     "updateOCCActive: Waiting for Host and all OCC Active Sensors");
553                 waitingForAllOccActiveSensors = true;
554             }
555 #ifdef POWER10
556             discoverTimer->restartOnce(30s);
557 #endif
558             return false;
559         }
560         else
561         {
562             log<level::INFO>(std::format("updateOCCActive: OCC{} active={}",
563                                          instance, status)
564                                  .c_str());
565             (*obj)->setPldmSensorReceived(true);
566             return (*obj)->occActive(status);
567         }
568     }
569     else
570     {
571         if (hostRunning)
572         {
573             log<level::WARNING>(
574                 std::format(
575                     "updateOCCActive: No status object to update for OCC{} (active={})",
576                     instance, status)
577                     .c_str());
578         }
579         else
580         {
581             if (status == true)
582             {
583                 log<level::WARNING>(
584                     std::format(
585                         "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})",
586                         instance, status)
587                         .c_str());
588             }
589         }
590         if (status == true)
591         {
592             // OCC went active
593             queuedActiveState.insert(instance);
594         }
595         else
596         {
597             auto match = queuedActiveState.find(instance);
598             if (match != queuedActiveState.end())
599             {
600                 // OCC was disabled
601                 queuedActiveState.erase(match);
602             }
603         }
604         return false;
605     }
606 }
607 
608 // Called upon pldm event To set powermode Safe Mode State for system.
updateOccSafeMode(bool safeMode)609 void Manager::updateOccSafeMode(bool safeMode)
610 {
611 #ifdef POWER10
612     pmode->updateDbusSafeMode(safeMode);
613 #endif
614     // Update the processor throttle status on dbus
615     for (auto& obj : statusObjects)
616     {
617         obj->updateThrottle(safeMode, THROTTLED_SAFE);
618     }
619 }
620 
sbeHRESETResult(instanceID instance,bool success)621 void Manager::sbeHRESETResult(instanceID instance, bool success)
622 {
623     if (success)
624     {
625         log<level::INFO>(
626             std::format("HRESET succeeded (OCC{})", instance).c_str());
627 
628         setSBEState(instance, SBE_STATE_BOOTED);
629 
630         return;
631     }
632 
633     setSBEState(instance, SBE_STATE_FAILED);
634 
635     if (sbeCanDump(instance))
636     {
637         log<level::INFO>(
638             std::format("HRESET failed (OCC{}), triggering SBE dump", instance)
639                 .c_str());
640 
641         auto& bus = utils::getBus();
642         uint32_t src6 = instance << 16;
643         uint32_t logId =
644             FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
645                             src6, "SBE command timeout");
646 
647         try
648         {
649             constexpr auto interface = "xyz.openbmc_project.Dump.Create";
650             constexpr auto function = "CreateDump";
651 
652             std::string service = utils::getService(OP_DUMP_OBJ_PATH,
653                                                     interface);
654             auto method = bus.new_method_call(service.c_str(), OP_DUMP_OBJ_PATH,
655                                               interface, function);
656 
657             std::map<std::string, std::variant<std::string, uint64_t>>
658                 createParams{
659                     {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
660                      uint64_t(logId)},
661                     {"com.ibm.Dump.Create.CreateParameters.DumpType",
662                      "com.ibm.Dump.Create.DumpType.SBE"},
663                     {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
664                      uint64_t(instance)},
665                 };
666 
667             method.append(createParams);
668 
669             auto response = bus.call(method);
670         }
671         catch (const sdbusplus::exception_t& e)
672         {
673             constexpr auto ERROR_DUMP_DISABLED =
674                 "xyz.openbmc_project.Dump.Create.Error.Disabled";
675             if (e.name() == ERROR_DUMP_DISABLED)
676             {
677                 log<level::INFO>("Dump is disabled, skipping");
678             }
679             else
680             {
681                 log<level::ERR>("Dump failed");
682             }
683         }
684     }
685 }
686 
sbeCanDump(unsigned int instance)687 bool Manager::sbeCanDump(unsigned int instance)
688 {
689     struct pdbg_target* proc = getPdbgTarget(instance);
690 
691     if (!proc)
692     {
693         // allow the dump in the error case
694         return true;
695     }
696 
697     try
698     {
699         if (!openpower::phal::sbe::isDumpAllowed(proc))
700         {
701             return false;
702         }
703 
704         if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
705         {
706             return false;
707         }
708     }
709     catch (openpower::phal::exception::SbeError& e)
710     {
711         log<level::INFO>("Failed to query SBE state");
712     }
713 
714     // allow the dump in the error case
715     return true;
716 }
717 
setSBEState(unsigned int instance,enum sbe_state state)718 void Manager::setSBEState(unsigned int instance, enum sbe_state state)
719 {
720     struct pdbg_target* proc = getPdbgTarget(instance);
721 
722     if (!proc)
723     {
724         return;
725     }
726 
727     try
728     {
729         openpower::phal::sbe::setState(proc, state);
730     }
731     catch (const openpower::phal::exception::SbeError& e)
732     {
733         log<level::ERR>("Failed to set SBE state");
734     }
735 }
736 
getPdbgTarget(unsigned int instance)737 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
738 {
739     if (!pdbgInitialized)
740     {
741         try
742         {
743             openpower::phal::pdbg::init();
744             pdbgInitialized = true;
745         }
746         catch (const openpower::phal::exception::PdbgError& e)
747         {
748             log<level::ERR>("pdbg initialization failed");
749             return nullptr;
750         }
751     }
752 
753     struct pdbg_target* proc = nullptr;
754     pdbg_for_each_class_target("proc", proc)
755     {
756         if (pdbg_target_index(proc) == instance)
757         {
758             return proc;
759         }
760     }
761 
762     log<level::ERR>("Failed to get pdbg target");
763     return nullptr;
764 }
765 #endif
766 
pollerTimerExpired()767 void Manager::pollerTimerExpired()
768 {
769     if (!_pollTimer)
770     {
771         log<level::ERR>(
772             "Manager::pollerTimerExpired() ERROR: Timer not defined");
773         return;
774     }
775 
776     for (auto& obj : statusObjects)
777     {
778         if (!obj->occActive())
779         {
780             // OCC is not running yet
781 #ifdef READ_OCC_SENSORS
782             auto id = obj->getOccInstanceID();
783             setSensorValueToNaN(id);
784 #endif
785             continue;
786         }
787 
788         // Read sysfs to force kernel to poll OCC
789         obj->readOccState();
790 
791 #ifdef READ_OCC_SENSORS
792         // Read occ sensor values
793         getSensorValues(obj);
794 #endif
795     }
796 
797     if (activeCount > 0)
798     {
799         // Restart OCC poll timer
800         _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
801     }
802     else
803     {
804         // No OCCs running, so poll timer will not be restarted
805         log<level::INFO>(
806             std::format(
807                 "Manager::pollerTimerExpired: poll timer will not be restarted")
808                 .c_str());
809     }
810 }
811 
812 #ifdef READ_OCC_SENSORS
readTempSensors(const fs::path & path,uint32_t occInstance)813 void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
814 {
815     // There may be more than one sensor with the same FRU type
816     // and label so make two passes: the first to read the temps
817     // from sysfs, and the second to put them on D-Bus after
818     // resolving any conflicts.
819     std::map<std::string, double> sensorData;
820 
821     std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
822     for (auto& file : fs::directory_iterator(path))
823     {
824         if (!std::regex_search(file.path().string(), expr))
825         {
826             continue;
827         }
828 
829         uint32_t labelValue{0};
830 
831         try
832         {
833             labelValue = readFile<uint32_t>(file.path());
834         }
835         catch (const std::system_error& e)
836         {
837             log<level::DEBUG>(
838                 std::format("readTempSensors: Failed reading {}, errno = {}",
839                             file.path().string(), e.code().value())
840                     .c_str());
841             continue;
842         }
843 
844         const std::string& tempLabel = "label";
845         const std::string filePathString = file.path().string().substr(
846             0, file.path().string().length() - tempLabel.length());
847 
848         uint32_t fruTypeValue{0};
849         try
850         {
851             fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
852         }
853         catch (const std::system_error& e)
854         {
855             log<level::DEBUG>(
856                 std::format("readTempSensors: Failed reading {}, errno = {}",
857                             filePathString + fruTypeSuffix, e.code().value())
858                     .c_str());
859             continue;
860         }
861 
862         std::string sensorPath = OCC_SENSORS_ROOT +
863                                  std::string("/temperature/");
864 
865         std::string dvfsTempPath;
866 
867         if (fruTypeValue == VRMVdd)
868         {
869             sensorPath.append("vrm_vdd" + std::to_string(occInstance) +
870                               "_temp");
871         }
872         else if (fruTypeValue == processorIoRing)
873         {
874             sensorPath.append("proc" + std::to_string(occInstance) +
875                               "_ioring_temp");
876             dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
877                            std::to_string(occInstance) + "_ioring_dvfs_temp";
878         }
879         else
880         {
881             uint16_t type = (labelValue & 0xFF000000) >> 24;
882             uint16_t instanceID = labelValue & 0x0000FFFF;
883 
884             if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
885             {
886                 if (fruTypeValue == fruTypeNotAvailable)
887                 {
888                     // Not all DIMM related temps are available to read
889                     // (no _input file in this case)
890                     continue;
891                 }
892                 auto iter = dimmTempSensorName.find(fruTypeValue);
893                 if (iter == dimmTempSensorName.end())
894                 {
895                     log<level::ERR>(
896                         std::format(
897                             "readTempSensors: Fru type error! fruTypeValue = {}) ",
898                             fruTypeValue)
899                             .c_str());
900                     continue;
901                 }
902 
903                 sensorPath.append("dimm" + std::to_string(instanceID) +
904                                   iter->second);
905 
906                 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
907                                dimmDVFSSensorName.at(fruTypeValue);
908             }
909             else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
910             {
911                 if (fruTypeValue == processorCore)
912                 {
913                     // The OCC reports small core temps, of which there are
914                     // two per big core.  All current P10 systems are in big
915                     // core mode, so use a big core name.
916                     uint16_t coreNum = instanceID / 2;
917                     uint16_t tempNum = instanceID % 2;
918                     sensorPath.append("proc" + std::to_string(occInstance) +
919                                       "_core" + std::to_string(coreNum) + "_" +
920                                       std::to_string(tempNum) + "_temp");
921 
922                     dvfsTempPath =
923                         std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
924                         std::to_string(occInstance) + "_core_dvfs_temp";
925                 }
926                 else
927                 {
928                     continue;
929                 }
930             }
931             else
932             {
933                 continue;
934             }
935         }
936 
937         // The dvfs temp file only needs to be read once per chip per type.
938         if (!dvfsTempPath.empty() &&
939             !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
940         {
941             try
942             {
943                 auto dvfsValue = readFile<double>(filePathString + maxSuffix);
944 
945                 dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
946                     dvfsTempPath, dvfsValue * std::pow(10, -3));
947             }
948             catch (const std::system_error& e)
949             {
950                 log<level::DEBUG>(
951                     std::format(
952                         "readTempSensors: Failed reading {}, errno = {}",
953                         filePathString + maxSuffix, e.code().value())
954                         .c_str());
955             }
956         }
957 
958         uint32_t faultValue{0};
959         try
960         {
961             faultValue = readFile<uint32_t>(filePathString + faultSuffix);
962         }
963         catch (const std::system_error& e)
964         {
965             log<level::DEBUG>(
966                 std::format("readTempSensors: Failed reading {}, errno = {}",
967                             filePathString + faultSuffix, e.code().value())
968                     .c_str());
969             continue;
970         }
971 
972         double tempValue{0};
973         // NOTE: if OCC sends back 0xFF, kernal sets this fault value to 1.
974         if (faultValue != 0)
975         {
976             tempValue = std::numeric_limits<double>::quiet_NaN();
977         }
978         else
979         {
980             // Read the temperature
981             try
982             {
983                 tempValue = readFile<double>(filePathString + inputSuffix);
984             }
985             catch (const std::system_error& e)
986             {
987                 log<level::DEBUG>(
988                     std::format(
989                         "readTempSensors: Failed reading {}, errno = {}",
990                         filePathString + inputSuffix, e.code().value())
991                         .c_str());
992 
993                 // if errno == EAGAIN(Resource temporarily unavailable) then set
994                 // temp to 0, to avoid using old temp, and affecting FAN
995                 // Control.
996                 if (e.code().value() == EAGAIN)
997                 {
998                     tempValue = 0;
999                 }
1000                 // else the errno would be something like
1001                 //     EBADF(Bad file descriptor)
1002                 // or ENOENT(No such file or directory)
1003                 else
1004                 {
1005                     continue;
1006                 }
1007             }
1008         }
1009 
1010         // If this object path already has a value, only overwite
1011         // it if the previous one was an NaN or a smaller value.
1012         auto existing = sensorData.find(sensorPath);
1013         if (existing != sensorData.end())
1014         {
1015             // Multiple sensors found for this FRU type
1016             if ((std::isnan(existing->second) && (tempValue == 0)) ||
1017                 ((existing->second == 0) && std::isnan(tempValue)))
1018             {
1019                 // One of the redundant sensors has failed (0xFF/nan), and the
1020                 // other sensor has no reading (0), so set the FRU to NaN to
1021                 // force fan increase
1022                 tempValue = std::numeric_limits<double>::quiet_NaN();
1023                 existing->second = tempValue;
1024             }
1025             if (std::isnan(existing->second) || (tempValue > existing->second))
1026             {
1027                 existing->second = tempValue;
1028             }
1029         }
1030         else
1031         {
1032             // First sensor for this FRU type
1033             sensorData[sensorPath] = tempValue;
1034         }
1035     }
1036 
1037     // Now publish the values on D-Bus.
1038     for (const auto& [objectPath, value] : sensorData)
1039     {
1040         dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
1041                                                     value * std::pow(10, -3));
1042 
1043         dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1044             objectPath, !std::isnan(value));
1045 
1046         if (existingSensors.find(objectPath) == existingSensors.end())
1047         {
1048             dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1049                 objectPath);
1050         }
1051 
1052         existingSensors[objectPath] = occInstance;
1053     }
1054 }
1055 
1056 std::optional<std::string>
getPowerLabelFunctionID(const std::string & value)1057     Manager::getPowerLabelFunctionID(const std::string& value)
1058 {
1059     // If the value is "system", then the FunctionID is "system".
1060     if (value == "system")
1061     {
1062         return value;
1063     }
1064 
1065     // If the value is not "system", then the label value have 3 numbers, of
1066     // which we only care about the middle one:
1067     // <sensor id>_<function id>_<apss channel>
1068     // eg: The value is "0_10_5" , then the FunctionID is "10".
1069     if (value.find("_") == std::string::npos)
1070     {
1071         return std::nullopt;
1072     }
1073 
1074     auto powerLabelValue = value.substr((value.find("_") + 1));
1075 
1076     if (powerLabelValue.find("_") == std::string::npos)
1077     {
1078         return std::nullopt;
1079     }
1080 
1081     return powerLabelValue.substr(0, powerLabelValue.find("_"));
1082 }
1083 
// Publish the power sensors found in an OCC hwmon directory on D-Bus.
//
// path - directory that is scanned for "power<N>_label" files
// id   - OCC instance recorded in existingSensors as owner of each sensor
//
// A label file is skipped when it cannot be read, when no function ID can be
// extracted from its contents, when that function ID has no entry in
// powerSensorName, or when the matching input file cannot be read.
void Manager::readPowerSensors(const fs::path& path, uint32_t id)
{
    const std::regex labelPattern{"power\\d+_label$"}; // Example: power5_label
    const std::string labelSuffix{"label"};

    for (const auto& entry : fs::directory_iterator(path))
    {
        const std::string labelFile = entry.path().string();
        if (!std::regex_search(labelFile, labelPattern))
        {
            continue;
        }

        std::string labelValue;
        try
        {
            labelValue = readFile<std::string>(labelFile);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            labelFile, e.code().value())
                    .c_str());
            continue;
        }

        const auto functionID = getPowerLabelFunctionID(labelValue);
        if (!functionID.has_value())
        {
            continue;
        }

        // Only function IDs with a known sensor name are published.
        const auto nameEntry = powerSensorName.find(*functionID);
        if (nameEntry == powerSensorName.end())
        {
            continue;
        }

        // Strip the trailing "label" so that another suffix can be attached,
        // e.g. ".../power5_label" -> ".../power5_" -> ".../power5_input".
        const std::string inputFile =
            labelFile.substr(0, labelFile.length() - labelSuffix.length()) +
            inputSuffix;

        double inputValue{0};
        try
        {
            inputValue = readFile<double>(inputFile);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            inputFile, e.code().value())
                    .c_str());
            continue;
        }

        std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
        sensorPath.append(nameEntry->second);

        auto& occDBus = dbus::OccDBusSensors::getOccDBus();

        occDBus.setUnit(sensorPath,
                        "xyz.openbmc_project.Sensor.Value.Unit.Watts");

        // The raw reading is scaled by 10^-3 twice before it is published
        // with the Watts unit (presumably the file reports microwatts --
        // confirm against the hwmon documentation).
        occDBus.setValue(sensorPath,
                         inputValue * std::pow(10, -3) * std::pow(10, -3));

        occDBus.setOperationalStatus(sensorPath, true);

        // The chassis association is only created the first time a sensor
        // path is seen.
        if (existingSensors.find(sensorPath) == existingSensors.end())
        {
            occDBus.setChassisAssociation(sensorPath);
        }

        existingSensors[sensorPath] = id;
    }
}
1161 
setSensorValueToNaN(uint32_t id) const1162 void Manager::setSensorValueToNaN(uint32_t id) const
1163 {
1164     for (const auto& [sensorPath, occId] : existingSensors)
1165     {
1166         if (occId == id)
1167         {
1168             dbus::OccDBusSensors::getOccDBus().setValue(
1169                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1170 
1171             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
1172                                                                     true);
1173         }
1174     }
1175     return;
1176 }
1177 
setSensorValueToNonFunctional(uint32_t id) const1178 void Manager::setSensorValueToNonFunctional(uint32_t id) const
1179 {
1180     for (const auto& [sensorPath, occId] : existingSensors)
1181     {
1182         if (occId == id)
1183         {
1184             dbus::OccDBusSensors::getOccDBus().setValue(
1185                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1186 
1187             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
1188                                                                     false);
1189         }
1190     }
1191     return;
1192 }
1193 
// Read the sensors of one OCC from its hwmon directory.
//
// occ - status object of the OCC; supplies the hwmon path, the instance ID
//       and whether the OCC is the master
//
// Temperature sensors are read for every OCC, power sensors only for the
// master OCC. When the hwmon path does not exist an error is logged, but only
// once per OCC until the path has been seen again.
void Manager::getSensorValues(std::unique_ptr<Status>& occ)
{
    // One "already traced" flag per OCC instance, used to throttle the
    // missing-path error below.
    constexpr uint32_t maxTrackedOccs = 8;
    static bool tracedError[maxTrackedOccs] = {};

    const fs::path sensorPath = occ->getHwmonPath();
    const uint32_t id = occ->getOccInstanceID();

    // An instance ID outside the table must never be used as an index; such
    // an OCC simply gets no throttling of the error trace.
    const bool canThrottle = (id < maxTrackedOccs);

    if (fs::exists(sensorPath))
    {
        // Read temperature sensors
        readTempSensors(sensorPath, id);

        if (occ->isMasterOcc())
        {
            // Read power sensors
            readPowerSensors(sensorPath, id);
        }

        // The path is back, so a future disappearance is traced again.
        if (canThrottle)
        {
            tracedError[id] = false;
        }
    }
    else
    {
        if (!canThrottle || !tracedError[id])
        {
            log<level::ERR>(
                std::format(
                    "Manager::getSensorValues: OCC{} sensor path missing: {}",
                    id, sensorPath.c_str())
                    .c_str());
            if (canThrottle)
            {
                tracedError[id] = true;
            }
        }
    }

    return;
}
1227 #endif
1228 
// Read the altitude property from D-Bus and cache it in `altitude`.
//
// - A reading below 0xFFFF is stored rounded to the nearest meter, with
//   negative readings stored as 0.
// - Any other reading is traced as invalid and `altitude` is left unchanged.
// - A D-Bus failure sets `altitude` to 0xFFFF (not available).
void Manager::readAltitude()
{
    // True while the next problem should be traced. It is cleared after a
    // problem has been traced and set again by the next good reading, so a
    // persistent problem is only traced once.
    static bool traceAltitudeErr = true;

    utils::PropertyValue altitudeProperty{};
    try
    {
        altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE,
                                              ALTITUDE_PROP);
        const auto sensorVal = std::get<double>(altitudeProperty);

        // Written as a negated '<' so that a NaN reading is rejected too.
        if (!(sensorVal < 0xFFFF))
        {
            if (traceAltitudeErr)
            {
                traceAltitudeErr = false;
                log<level::DEBUG>(
                    std::format("Invalid altitude value: {}", sensorVal)
                        .c_str());
            }
            return;
        }

        if (sensorVal < 0)
        {
            altitude = 0;
        }
        else
        {
            // Round to nearest meter
            altitude = uint16_t(sensorVal + 0.5);
        }

        log<level::DEBUG>(
            std::format("readAltitude: sensor={} ({}m)", sensorVal, altitude)
                .c_str());
        traceAltitudeErr = true;
    }
    catch (const sdbusplus::exception_t& e)
    {
        if (traceAltitudeErr)
        {
            traceAltitudeErr = false;
            log<level::INFO>(
                std::format("Unable to read Altitude: {}", e.what()).c_str());
        }
        altitude = 0xFFFF; // not available
    }
}
1278 
// Callback for a property change on the ambient temperature sensor.
//
// msg - D-Bus message; a string followed by a map of changed properties is
//       read from it
//
// The new temperature is converted to a whole number of degrees C (NaN maps
// to 0xFF, negative values to 0). When that differs from the cached `ambient`
// the cache is updated and, on POWER10 builds, the ambient temperature and
// the altitude are sent to every active OCC.
void Manager::ambientCallback(sdbusplus::message_t& msg)
{
    std::string msgSensor;
    std::map<std::string, std::variant<double>> msgData;
    msg.read(msgSensor, msgData);

    const auto ambientEntry = msgData.find(AMBIENT_PROP);
    if (ambientEntry == msgData.end())
    {
        log<level::DEBUG>("ambientCallback: Unknown ambient property changed");
        return;
    }

    const double currentTemp = std::get<double>(ambientEntry->second);

    // 0xFF is kept when the reading is NaN.
    uint8_t truncatedTemp = 0xFF;
    if (!std::isnan(currentTemp))
    {
        // Negative readings become 0, everything else is rounded to the
        // nearest degree C.
        truncatedTemp = (currentTemp < 0) ? uint8_t(0)
                                          : uint8_t(currentTemp + 0.5);
    }

    // Nothing to do unless the (whole degree) ambient actually changed.
    if (truncatedTemp == ambient)
    {
        return;
    }

    log<level::DEBUG>(
        std::format("ambientCallback: Ambient change from {} to {}C", ambient,
                    currentTemp)
            .c_str());

    ambient = truncatedTemp;

    // 0xFFFF means no altitude is available yet, so try reading it again.
    if (altitude == 0xFFFF)
    {
        readAltitude();
    }

    log<level::DEBUG>(
        std::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient,
                    altitude)
            .c_str());
#ifdef POWER10
    // Send ambient and altitude to all OCCs that are currently active
    for (auto& obj : statusObjects)
    {
        if (obj->occActive())
        {
            obj->sendAmbient(ambient, altitude);
        }
    }
#endif // POWER10
}
1343 
1344 // return the current ambient and altitude readings
getAmbientData(bool & ambientValid,uint8_t & ambientTemp,uint16_t & altitudeValue) const1345 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1346                              uint16_t& altitudeValue) const
1347 {
1348     ambientValid = true;
1349     ambientTemp = ambient;
1350     altitudeValue = altitude;
1351 
1352     if (ambient == 0xFF)
1353     {
1354         ambientValid = false;
1355     }
1356 }
1357 
1358 #ifdef POWER10
// Called when waitForAllOccsTimer expires.
// After the first OCC goes active, this timer will be started (60 seconds).
// Logs a warning when the number of active OCCs differs from the number of
// status objects, then validates the master OCC.
void Manager::occsNotAllRunning()
{
    const auto expectedCount = statusObjects.size();

    // Not all OCCs went active. Procs may be garded, so this may be
    // expected; it is only worth a warning.
    if (activeCount != expectedCount)
    {
        log<level::WARNING>(
            std::format(
                "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})",
                activeCount, expectedCount)
                .c_str());
    }

    validateOccMaster();
}
1376 
1377 #ifdef PLDM
1378 // Called when throttlePldmTraceTimer expires.
1379 // If this timer expires, that indicates there are no OCC active sensor PDRs
1380 // found which will trigger pldm traces to be throttled.
1381 // The second time this timer expires, a PEL will get created.
throttlePldmTraceExpired()1382 void Manager::throttlePldmTraceExpired()
1383 {
1384     if (utils::isHostRunning())
1385     {
1386         if (!onPldmTimeoutCreatePel)
1387         {
1388             // Throttle traces
1389             pldmHandle->setTraceThrottle(true);
1390             // Restart timer to log a PEL when timer expires
1391             onPldmTimeoutCreatePel = true;
1392             throttlePldmTraceTimer->restartOnce(40min);
1393         }
1394         else
1395         {
1396             log<level::ERR>(
1397                 "throttlePldmTraceExpired(): OCC active sensors still not available!");
1398             // Create PEL
1399             createPldmSensorPEL();
1400         }
1401     }
1402     else
1403     {
1404         // Make sure traces are not throttled
1405         pldmHandle->setTraceThrottle(false);
1406         log<level::INFO>(
1407             "throttlePldmTraceExpired(): host it not running ignoring sensor timer");
1408     }
1409 }
1410 
// Create a PEL reporting that the PLDM sensors for the OCCs were not found.
//
// The PEL is requested through the CreatePELWithFFDCFiles D-Bus method with
// severity Warning, the process ID as additional data and journal entries of
// openpower-occ-control as FFDC. A D-Bus failure is logged, not propagated.
void Manager::createPldmSensorPEL()
{
    Error::Descriptor d = Error::Descriptor(MISSING_OCC_SENSORS_PATH);
    std::map<std::string, std::string> additionalData{
        {"_PID", std::to_string(getpid())}};

    log<level::INFO>(
        "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs");

    auto& bus = utils::getBus();

    try
    {
        namespace logging_server =
            sdbusplus::xyz::openbmc_project::Logging::server;

        static constexpr auto loggingObjectPath =
            "/xyz/openbmc_project/logging";
        static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL";

        // Add occ-control journal traces to PEL FFDC.
        // NOTE(review): the returned object is otherwise unused here;
        // presumably it has to stay alive until the call below has been made
        // -- confirm against FFDC::addJournalEntries.
        FFDCFiles ffdc;
        auto occJournalFile =
            FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40);

        std::string service = utils::getService(loggingObjectPath,
                                                opLoggingInterface);
        auto method = bus.new_method_call(service.c_str(), loggingObjectPath,
                                          opLoggingInterface,
                                          "CreatePELWithFFDCFiles");

        // Set level to Warning (Predictive).
        auto level = logging_server::convertForMessage(
            logging_server::Entry::Level::Warning);

        method.append(d.path, level, additionalData, ffdc);
        bus.call(method);
    }
    catch (const sdbusplus::exception_t& e)
    {
        log<level::ERR>(
            std::format("Failed to create MISSING_OCC_SENSORS PEL: {}",
                        e.what())
                .c_str());
    }
}
1458 #endif // PLDM
1459 #endif // POWER10
1460 
// Verify single master OCC and start presence monitor
//
// Walks all status objects:
// - POWER10 builds only: an OCC that is not active is marked active when its
//   active state was queued during discovery; otherwise (PLDM builds) its
//   active sensor is checked. If the host is not running the whole
//   validation is abandoned.
// - Every OCC that reports itself as master gets a presence watch. A second
//   master is an error and a reset is requested on it.
// When no master was found a reset is requested on the first status object,
// provided there is one.
void Manager::validateOccMaster()
{
    int masterInstance = -1;
    for (auto& obj : statusObjects)
    {
        auto instance = obj->getOccInstanceID();
#ifdef POWER10
        if (!obj->occActive())
        {
            if (utils::isHostRunning())
            {
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    log<level::INFO>(
                        std::format(
                            "validateOccMaster: OCC{} is ACTIVE (queued)",
                            instance)
                            .c_str());
                    obj->occActive(true);
                }
                else
                {
                    // OCC does not appear to be active yet, check active sensor
#ifdef PLDM
                    pldmHandle->checkActiveSensor(instance);
#endif
                    if (obj->occActive())
                    {
                        log<level::INFO>(
                            std::format(
                                "validateOccMaster: OCC{} is ACTIVE after reading sensor",
                                instance)
                                .c_str());
                    }
                }
            }
            else
            {
                // Leaves the function, not just this loop iteration: nothing
                // is validated while the host is not running.
                log<level::WARNING>(
                    std::format(
                        "validateOccMaster: HOST is not running (OCC{})",
                        instance)
                        .c_str());
                return;
            }
        }
#endif // POWER10

        if (obj->isMasterOcc())
        {
            obj->addPresenceWatchMaster();

            if (masterInstance == -1)
            {
                // First master seen
                masterInstance = instance;
            }
            else
            {
                log<level::ERR>(
                    std::format(
                        "validateOccMaster: Multiple OCC masters! ({} and {})",
                        masterInstance, instance)
                        .c_str());
                // request reset
                obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
            }
        }
    }

    if (masterInstance < 0)
    {
        log<level::ERR>(
            std::format("validateOccMaster: Master OCC not found! (of {} OCCs)",
                        statusObjects.size())
                .c_str());
        // request reset; with no status objects at all there is no OCC to
        // request it on, and front() must not be called on an empty container
        if (!statusObjects.empty())
        {
            statusObjects.front()->deviceError(
                Error::Descriptor(PRESENCE_ERROR_PATH));
        }
    }
    else
    {
        log<level::INFO>(
            std::format("validateOccMaster: OCC{} is master of {} OCCs",
                        masterInstance, activeCount)
                .c_str());
#ifdef POWER10
        pmode->updateDbusSafeMode(false);
#endif
    }
}
1555 
updatePcapBounds() const1556 void Manager::updatePcapBounds() const
1557 {
1558     if (pcap)
1559     {
1560         pcap->updatePcapBounds();
1561     }
1562 }
1563 
1564 } // namespace occ
1565 } // namespace open_power
1566