1 #include "config.h"
2 
3 #include "occ_manager.hpp"
4 
5 #include "i2c_occ.hpp"
6 #include "occ_dbus.hpp"
7 #include "occ_errors.hpp"
8 #include "utils.hpp"
9 
10 #include <phosphor-logging/elog-errors.hpp>
11 #include <phosphor-logging/log.hpp>
12 #include <xyz/openbmc_project/Common/error.hpp>
13 
14 #include <chrono>
15 #include <cmath>
16 #include <filesystem>
17 #include <fstream>
18 #include <regex>
19 
20 namespace open_power
21 {
22 namespace occ
23 {
24 
// fru_type value of a DIMM temperature sensor whose reading is not available
constexpr uint32_t fruTypeNotAvailable{0xFF};

// Trailing parts of the per-sensor sysfs file names (appended to "temp<N>_")
constexpr const char* fruTypeSuffix{"fru_type"};
constexpr const char* faultSuffix{"fault"};
constexpr const char* inputSuffix{"input"};
constexpr const char* maxSuffix{"max"};

// findAndCreateObjects() postpones OCC discovery while this file exists
const char* const HOST_ON_FILE{"/run/openbmc/host@0-on"};
32 
33 using namespace phosphor::logging;
34 using namespace std::literals::chrono_literals;
35 
/**
 * @brief Read one value from the start of a file using operator>>.
 *
 * @tparam T - type of the value to extract
 * @param[in] path - file to read
 *
 * @return the extracted value
 *
 * @throws std::system_error (generic category) when the file cannot be
 *         opened, read or parsed. The code is the errno left by the failing
 *         operation, or EIO when the failure did not set errno (for example
 *         when the content cannot be parsed as T).
 */
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    // Any stream error (including hitting end-of-file) is reported by throwing
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit |
                   std::ifstream::eofbit);
    T data;

    // Clear errno first: a parse failure does not set it, and a stale value
    // from an unrelated earlier call (e.g. EAGAIN) must not be reported as
    // the reason this read failed.
    errno = 0;

    try
    {
        ifs.open(path);
        ifs >> data;
        ifs.close();
    }
    catch (const std::exception&)
    {
        const auto err = errno;
        // Never throw a system_error that carries the "success" code 0
        throw std::system_error((err != 0) ? err : EIO,
                                std::generic_category());
    }

    return data;
}
58 
59 // findAndCreateObjects():
60 // Takes care of getting the required objects created and
61 // finds the available devices/processors.
62 // (function is called everytime the discoverTimer expires)
63 // - create the PowerMode object to control OCC modes
64 // - create statusObjects for each OCC device found
65 // - waits for OCC Active sensors PDRs to become available
66 // - restart discoverTimer if all data is not available yet
findAndCreateObjects()67 void Manager::findAndCreateObjects()
68 {
69 #ifndef POWER10
70     for (auto id = 0; id < MAX_CPUS; ++id)
71     {
72         // Create one occ per cpu
73         auto occ = std::string(OCC_NAME) + std::to_string(id);
74         createObjects(occ);
75     }
76 #else
77     if (!pmode)
78     {
79         // Create the power mode object
80         pmode = std::make_unique<powermode::PowerMode>(
81             *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
82     }
83 
84     if (!fs::exists(HOST_ON_FILE))
85     {
86         static bool statusObjCreated = false;
87         if (!statusObjCreated)
88         {
89             // Create the OCCs based on on the /dev/occX devices
90             auto occs = findOCCsInDev();
91 
92             if (occs.empty() || (prevOCCSearch.size() != occs.size()))
93             {
94                 // Something changed or no OCCs yet, try again in 10s.
95                 // Note on the first pass prevOCCSearch will be empty,
96                 // so there will be at least one delay to give things
97                 // a chance to settle.
98                 prevOCCSearch = occs;
99 
100                 log<level::INFO>(
101                     std::format(
102                         "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})",
103                         occs.size())
104                         .c_str());
105 
106                 discoverTimer->restartOnce(10s);
107             }
108             else
109             {
110                 // All OCCs appear to be available, create status objects
111 
112                 // createObjects requires OCC0 first.
113                 std::sort(occs.begin(), occs.end());
114 
115                 log<level::INFO>(
116                     std::format(
117                         "Manager::findAndCreateObjects(): Creating {} OCC Status Objects",
118                         occs.size())
119                         .c_str());
120                 for (auto id : occs)
121                 {
122                     createObjects(std::string(OCC_NAME) + std::to_string(id));
123                 }
124                 statusObjCreated = true;
125                 waitingForAllOccActiveSensors = true;
126 
127                 // Find/update the processor path associated with each OCC
128                 for (auto& obj : statusObjects)
129                 {
130                     obj->updateProcAssociation();
131                 }
132             }
133         }
134 
135         if (statusObjCreated && waitingForAllOccActiveSensors)
136         {
137             static bool tracedHostWait = false;
138             if (utils::isHostRunning())
139             {
140                 if (tracedHostWait)
141                 {
142                     log<level::INFO>(
143                         "Manager::findAndCreateObjects(): Host is running");
144                     tracedHostWait = false;
145                 }
146                 checkAllActiveSensors();
147             }
148             else
149             {
150                 if (!tracedHostWait)
151                 {
152                     log<level::INFO>(
153                         "Manager::findAndCreateObjects(): Waiting for host to start");
154                     tracedHostWait = true;
155                 }
156                 discoverTimer->restartOnce(30s);
157 #ifdef PLDM
158                 if (throttlePldmTraceTimer->isEnabled())
159                 {
160                     // Host is no longer running, disable throttle timer and
161                     // make sure traces are not throttled
162                     log<level::INFO>(
163                         "findAndCreateObjects(): disabling sensor timer");
164                     throttlePldmTraceTimer->setEnabled(false);
165                     pldmHandle->setTraceThrottle(false);
166                 }
167 #endif
168             }
169         }
170     }
171     else
172     {
173         log<level::INFO>(
174             std::format(
175                 "Manager::findAndCreateObjects(): Waiting for {} to complete...",
176                 HOST_ON_FILE)
177                 .c_str());
178         discoverTimer->restartOnce(10s);
179     }
180 #endif
181 }
182 
183 #ifdef POWER10
184 // Check if all occActive sensors are available
checkAllActiveSensors()185 void Manager::checkAllActiveSensors()
186 {
187     static bool allActiveSensorAvailable = false;
188     static bool tracedSensorWait = false;
189     static bool waitingForHost = false;
190 
191     if (open_power::occ::utils::isHostRunning())
192     {
193         if (waitingForHost)
194         {
195             waitingForHost = false;
196             log<level::INFO>("checkAllActiveSensors(): Host is now running");
197         }
198 
199         // Start with the assumption that all are available
200         allActiveSensorAvailable = true;
201         for (auto& obj : statusObjects)
202         {
203             if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
204             {
205                 auto instance = obj->getOccInstanceID();
206                 // Check if sensor was queued while waiting for discovery
207                 auto match = queuedActiveState.find(instance);
208                 if (match != queuedActiveState.end())
209                 {
210                     queuedActiveState.erase(match);
211                     log<level::INFO>(
212                         std::format(
213                             "checkAllActiveSensors(): OCC{} is ACTIVE (queued)",
214                             instance)
215                             .c_str());
216                     obj->occActive(true);
217                 }
218                 else
219                 {
220                     allActiveSensorAvailable = false;
221                     if (!tracedSensorWait)
222                     {
223                         log<level::INFO>(
224                             std::format(
225                                 "checkAllActiveSensors(): Waiting on OCC{} Active sensor",
226                                 instance)
227                                 .c_str());
228                         tracedSensorWait = true;
229 #ifdef PLDM
230                         // Make sure PLDM traces are not throttled
231                         pldmHandle->setTraceThrottle(false);
232                         // Start timer to throttle PLDM traces when timer
233                         // expires
234                         onPldmTimeoutCreatePel = false;
235                         throttlePldmTraceTimer->restartOnce(5min);
236 #endif
237                     }
238 #ifdef PLDM
239                     pldmHandle->checkActiveSensor(obj->getOccInstanceID());
240 #endif
241                     break;
242                 }
243             }
244         }
245     }
246     else
247     {
248         if (!waitingForHost)
249         {
250             waitingForHost = true;
251             log<level::INFO>(
252                 "checkAllActiveSensors(): Waiting for host to start");
253 #ifdef PLDM
254             if (throttlePldmTraceTimer->isEnabled())
255             {
256                 // Host is no longer running, disable throttle timer and
257                 // make sure traces are not throttled
258                 log<level::INFO>(
259                     "checkAllActiveSensors(): disabling sensor timer");
260                 throttlePldmTraceTimer->setEnabled(false);
261                 pldmHandle->setTraceThrottle(false);
262             }
263 #endif
264         }
265     }
266 
267     if (allActiveSensorAvailable)
268     {
269         // All sensors were found, disable the discovery timer
270         if (discoverTimer->isEnabled())
271         {
272             discoverTimer->setEnabled(false);
273         }
274 #ifdef PLDM
275         if (throttlePldmTraceTimer->isEnabled())
276         {
277             // Disable throttle timer and make sure traces are not throttled
278             throttlePldmTraceTimer->setEnabled(false);
279             pldmHandle->setTraceThrottle(false);
280         }
281 #endif
282         if (waitingForAllOccActiveSensors)
283         {
284             log<level::INFO>(
285                 "checkAllActiveSensors(): OCC Active sensors are available");
286             waitingForAllOccActiveSensors = false;
287         }
288         queuedActiveState.clear();
289         tracedSensorWait = false;
290     }
291     else
292     {
293         // Not all sensors were available, so keep waiting
294         if (!tracedSensorWait)
295         {
296             log<level::INFO>(
297                 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
298             tracedSensorWait = true;
299         }
300         discoverTimer->restartOnce(10s);
301     }
302 }
303 #endif
304 
findOCCsInDev()305 std::vector<int> Manager::findOCCsInDev()
306 {
307     std::vector<int> occs;
308     std::regex expr{R"(occ(\d+)$)"};
309 
310     for (auto& file : fs::directory_iterator("/dev"))
311     {
312         std::smatch match;
313         std::string path{file.path().string()};
314         if (std::regex_search(path, match, expr))
315         {
316             auto num = std::stoi(match[1].str());
317 
318             // /dev numbering starts at 1, ours starts at 0.
319             occs.push_back(num - 1);
320         }
321     }
322 
323     return occs;
324 }
325 
cpuCreated(sdbusplus::message_t & msg)326 int Manager::cpuCreated(sdbusplus::message_t& msg)
327 {
328     namespace fs = std::filesystem;
329 
330     sdbusplus::message::object_path o;
331     msg.read(o);
332     fs::path cpuPath(std::string(std::move(o)));
333 
334     auto name = cpuPath.filename().string();
335     auto index = name.find(CPU_NAME);
336     name.replace(index, std::strlen(CPU_NAME), OCC_NAME);
337 
338     createObjects(name);
339 
340     return 0;
341 }
342 
createObjects(const std::string & occ)343 void Manager::createObjects(const std::string& occ)
344 {
345     auto path = fs::path(OCC_CONTROL_ROOT) / occ;
346 
347     statusObjects.emplace_back(std::make_unique<Status>(
348         event, path.c_str(), *this,
349 #ifdef POWER10
350         pmode,
351 #endif
352         std::bind(std::mem_fn(&Manager::statusCallBack), this,
353                   std::placeholders::_1, std::placeholders::_2)
354 #ifdef PLDM
355             ,
356         std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(),
357                   std::placeholders::_1)
358 #endif
359             ));
360 
361     // Create the power cap monitor object
362     if (!pcap)
363     {
364         pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
365             *statusObjects.back());
366     }
367 
368     if (statusObjects.back()->isMasterOcc())
369     {
370         log<level::INFO>(
371             std::format("Manager::createObjects(): OCC{} is the master",
372                         statusObjects.back()->getOccInstanceID())
373                 .c_str());
374         _pollTimer->setEnabled(false);
375 
376 #ifdef POWER10
377         // Set the master OCC on the PowerMode object
378         pmode->setMasterOcc(path);
379 #endif
380     }
381 
382     passThroughObjects.emplace_back(std::make_unique<PassThrough>(
383         path.c_str()
384 #ifdef POWER10
385             ,
386         pmode
387 #endif
388         ));
389 }
390 
statusCallBack(instanceID instance,bool status)391 void Manager::statusCallBack(instanceID instance, bool status)
392 {
393     if (status == true)
394     {
395         // OCC went active
396         ++activeCount;
397 
398 #ifdef POWER10
399         if (activeCount == 1)
400         {
401             // First OCC went active (allow some time for all OCCs to go active)
402             waitForAllOccsTimer->restartOnce(60s);
403         }
404 #endif
405 
406         if (activeCount == statusObjects.size())
407         {
408 #ifdef POWER10
409             // All OCCs are now running
410             if (waitForAllOccsTimer->isEnabled())
411             {
412                 // stop occ wait timer
413                 waitForAllOccsTimer->setEnabled(false);
414             }
415 #endif
416 
417             // Verify master OCC and start presence monitor
418             validateOccMaster();
419         }
420 
421         // Start poll timer if not already started
422         if (!_pollTimer->isEnabled())
423         {
424             log<level::INFO>(
425                 std::format("Manager: OCCs will be polled every {} seconds",
426                             pollInterval)
427                     .c_str());
428 
429             // Send poll and start OCC poll timer
430             pollerTimerExpired();
431         }
432     }
433     else
434     {
435         // OCC went away
436         if (activeCount > 0)
437         {
438             --activeCount;
439         }
440         else
441         {
442             log<level::ERR>(
443                 std::format("OCC{} disabled, but currently no active OCCs",
444                             instance)
445                     .c_str());
446         }
447 
448         if (activeCount == 0)
449         {
450             // No OCCs are running
451 
452             // Stop OCC poll timer
453             if (_pollTimer->isEnabled())
454             {
455                 log<level::INFO>(
456                     "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
457                 _pollTimer->setEnabled(false);
458             }
459 
460 #ifdef POWER10
461             // stop wait timer
462             if (waitForAllOccsTimer->isEnabled())
463             {
464                 waitForAllOccsTimer->setEnabled(false);
465             }
466 #endif
467         }
468 #ifdef READ_OCC_SENSORS
469         // Clear OCC sensors
470         setSensorValueToNaN(instance);
471 #endif
472     }
473 
474 #ifdef POWER10
475     if (waitingForAllOccActiveSensors)
476     {
477         if (utils::isHostRunning())
478         {
479             checkAllActiveSensors();
480         }
481     }
482 #endif
483 }
484 
485 #ifdef I2C_OCC
initStatusObjects()486 void Manager::initStatusObjects()
487 {
488     // Make sure we have a valid path string
489     static_assert(sizeof(DEV_PATH) != 0);
490 
491     auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
492     for (auto& name : deviceNames)
493     {
494         i2c_occ::i2cToDbus(name);
495         name = std::string(OCC_NAME) + '_' + name;
496         auto path = fs::path(OCC_CONTROL_ROOT) / name;
497         statusObjects.emplace_back(
498             std::make_unique<Status>(event, path.c_str(), *this));
499     }
500     // The first device is master occ
501     pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
502         *statusObjects.front());
503 #ifdef POWER10
504     pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
505                                                    powermode::PIPS_PATH);
506     // Set the master OCC on the PowerMode object
507     pmode->setMasterOcc(path);
508 #endif
509 }
510 #endif
511 
512 #ifdef PLDM
sbeTimeout(unsigned int instance)513 void Manager::sbeTimeout(unsigned int instance)
514 {
515     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
516                             [instance](const auto& obj) {
517                                 return instance == obj->getOccInstanceID();
518                             });
519 
520     if (obj != statusObjects.end() && (*obj)->occActive())
521     {
522         log<level::INFO>(
523             std::format("SBE timeout, requesting HRESET (OCC{})", instance)
524                 .c_str());
525 
526         setSBEState(instance, SBE_STATE_NOT_USABLE);
527 
528         pldmHandle->sendHRESET(instance);
529     }
530 }
531 
updateOCCActive(instanceID instance,bool status)532 bool Manager::updateOCCActive(instanceID instance, bool status)
533 {
534     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
535                             [instance](const auto& obj) {
536                                 return instance == obj->getOccInstanceID();
537                             });
538 
539     const bool hostRunning = open_power::occ::utils::isHostRunning();
540     if (obj != statusObjects.end())
541     {
542         if (!hostRunning && (status == true))
543         {
544             log<level::WARNING>(
545                 std::format(
546                     "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received",
547                     instance, status)
548                     .c_str());
549             (*obj)->setPldmSensorReceived(false);
550             if (!waitingForAllOccActiveSensors)
551             {
552                 log<level::INFO>(
553                     "updateOCCActive: Waiting for Host and all OCC Active Sensors");
554                 waitingForAllOccActiveSensors = true;
555             }
556 #ifdef POWER10
557             discoverTimer->restartOnce(30s);
558 #endif
559             return false;
560         }
561         else
562         {
563             (*obj)->setPldmSensorReceived(true);
564             return (*obj)->occActive(status);
565         }
566     }
567     else
568     {
569         if (hostRunning)
570         {
571             log<level::WARNING>(
572                 std::format(
573                     "updateOCCActive: No status object to update for OCC{} (active={})",
574                     instance, status)
575                     .c_str());
576         }
577         else
578         {
579             if (status == true)
580             {
581                 log<level::WARNING>(
582                     std::format(
583                         "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})",
584                         instance, status)
585                         .c_str());
586             }
587         }
588         if (status == true)
589         {
590             // OCC went active
591             queuedActiveState.insert(instance);
592         }
593         else
594         {
595             auto match = queuedActiveState.find(instance);
596             if (match != queuedActiveState.end())
597             {
598                 // OCC was disabled
599                 queuedActiveState.erase(match);
600             }
601         }
602         return false;
603     }
604 }
605 
606 // Called upon pldm event To set powermode Safe Mode State for system.
updateOccSafeMode(bool safeMode)607 void Manager::updateOccSafeMode(bool safeMode)
608 {
609 #ifdef POWER10
610     pmode->updateDbusSafeMode(safeMode);
611 #endif
612     // Update the processor throttle status on dbus
613     for (auto& obj : statusObjects)
614     {
615         obj->updateThrottle(safeMode, THROTTLED_SAFE);
616     }
617 }
618 
sbeHRESETResult(instanceID instance,bool success)619 void Manager::sbeHRESETResult(instanceID instance, bool success)
620 {
621     if (success)
622     {
623         log<level::INFO>(
624             std::format("HRESET succeeded (OCC{})", instance).c_str());
625 
626         setSBEState(instance, SBE_STATE_BOOTED);
627 
628         return;
629     }
630 
631     setSBEState(instance, SBE_STATE_FAILED);
632 
633     if (sbeCanDump(instance))
634     {
635         log<level::INFO>(
636             std::format("HRESET failed (OCC{}), triggering SBE dump", instance)
637                 .c_str());
638 
639         auto& bus = utils::getBus();
640         uint32_t src6 = instance << 16;
641         uint32_t logId =
642             FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
643                             src6, "SBE command timeout");
644 
645         try
646         {
647             constexpr auto interface = "xyz.openbmc_project.Dump.Create";
648             constexpr auto function = "CreateDump";
649 
650             std::string service =
651                 utils::getService(OP_DUMP_OBJ_PATH, interface);
652             auto method = bus.new_method_call(service.c_str(), OP_DUMP_OBJ_PATH,
653                                               interface, function);
654 
655             std::map<std::string, std::variant<std::string, uint64_t>>
656                 createParams{
657                     {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
658                      uint64_t(logId)},
659                     {"com.ibm.Dump.Create.CreateParameters.DumpType",
660                      "com.ibm.Dump.Create.DumpType.SBE"},
661                     {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
662                      uint64_t(instance)},
663                 };
664 
665             method.append(createParams);
666 
667             auto response = bus.call(method);
668         }
669         catch (const sdbusplus::exception_t& e)
670         {
671             constexpr auto ERROR_DUMP_DISABLED =
672                 "xyz.openbmc_project.Dump.Create.Error.Disabled";
673             if (e.name() == ERROR_DUMP_DISABLED)
674             {
675                 log<level::INFO>("Dump is disabled, skipping");
676             }
677             else
678             {
679                 log<level::ERR>("Dump failed");
680             }
681         }
682     }
683 }
684 
sbeCanDump(unsigned int instance)685 bool Manager::sbeCanDump(unsigned int instance)
686 {
687     struct pdbg_target* proc = getPdbgTarget(instance);
688 
689     if (!proc)
690     {
691         // allow the dump in the error case
692         return true;
693     }
694 
695     try
696     {
697         if (!openpower::phal::sbe::isDumpAllowed(proc))
698         {
699             return false;
700         }
701 
702         if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
703         {
704             return false;
705         }
706     }
707     catch (openpower::phal::exception::SbeError& e)
708     {
709         log<level::INFO>("Failed to query SBE state");
710     }
711 
712     // allow the dump in the error case
713     return true;
714 }
715 
setSBEState(unsigned int instance,enum sbe_state state)716 void Manager::setSBEState(unsigned int instance, enum sbe_state state)
717 {
718     struct pdbg_target* proc = getPdbgTarget(instance);
719 
720     if (!proc)
721     {
722         return;
723     }
724 
725     try
726     {
727         openpower::phal::sbe::setState(proc, state);
728     }
729     catch (const openpower::phal::exception::SbeError& e)
730     {
731         log<level::ERR>(
732             std::format("Failed to set SBE state: {}", e.what()).c_str());
733     }
734 }
735 
getPdbgTarget(unsigned int instance)736 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
737 {
738     if (!pdbgInitialized)
739     {
740         try
741         {
742             openpower::phal::pdbg::init();
743             pdbgInitialized = true;
744         }
745         catch (const openpower::phal::exception::PdbgError& e)
746         {
747             log<level::ERR>("pdbg initialization failed");
748             return nullptr;
749         }
750     }
751 
752     struct pdbg_target* proc = nullptr;
753     pdbg_for_each_class_target("proc", proc)
754     {
755         if (pdbg_target_index(proc) == instance)
756         {
757             return proc;
758         }
759     }
760 
761     log<level::ERR>("Failed to get pdbg target");
762     return nullptr;
763 }
764 #endif
765 
pollerTimerExpired()766 void Manager::pollerTimerExpired()
767 {
768     if (!_pollTimer)
769     {
770         log<level::ERR>(
771             "Manager::pollerTimerExpired() ERROR: Timer not defined");
772         return;
773     }
774 
775     for (auto& obj : statusObjects)
776     {
777         if (!obj->occActive())
778         {
779             // OCC is not running yet
780 #ifdef READ_OCC_SENSORS
781             auto id = obj->getOccInstanceID();
782             setSensorValueToNaN(id);
783 #endif
784             continue;
785         }
786 
787         // Read sysfs to force kernel to poll OCC
788         obj->readOccState();
789 
790 #ifdef READ_OCC_SENSORS
791         // Read occ sensor values
792         getSensorValues(obj);
793 #endif
794     }
795 
796     if (activeCount > 0)
797     {
798         // Restart OCC poll timer
799         _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
800     }
801     else
802     {
803         // No OCCs running, so poll timer will not be restarted
804         log<level::INFO>(
805             std::format(
806                 "Manager::pollerTimerExpired: poll timer will not be restarted")
807                 .c_str());
808     }
809 }
810 
811 #ifdef READ_OCC_SENSORS
readTempSensors(const fs::path & path,uint32_t occInstance)812 void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
813 {
814     // There may be more than one sensor with the same FRU type
815     // and label so make two passes: the first to read the temps
816     // from sysfs, and the second to put them on D-Bus after
817     // resolving any conflicts.
818     std::map<std::string, double> sensorData;
819 
820     std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
821     for (auto& file : fs::directory_iterator(path))
822     {
823         if (!std::regex_search(file.path().string(), expr))
824         {
825             continue;
826         }
827 
828         uint32_t labelValue{0};
829 
830         try
831         {
832             labelValue = readFile<uint32_t>(file.path());
833         }
834         catch (const std::system_error& e)
835         {
836             log<level::DEBUG>(
837                 std::format("readTempSensors: Failed reading {}, errno = {}",
838                             file.path().string(), e.code().value())
839                     .c_str());
840             continue;
841         }
842 
843         const std::string& tempLabel = "label";
844         const std::string filePathString = file.path().string().substr(
845             0, file.path().string().length() - tempLabel.length());
846 
847         uint32_t fruTypeValue{0};
848         try
849         {
850             fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
851         }
852         catch (const std::system_error& e)
853         {
854             log<level::DEBUG>(
855                 std::format("readTempSensors: Failed reading {}, errno = {}",
856                             filePathString + fruTypeSuffix, e.code().value())
857                     .c_str());
858             continue;
859         }
860 
861         std::string sensorPath =
862             OCC_SENSORS_ROOT + std::string("/temperature/");
863 
864         std::string dvfsTempPath;
865 
866         if (fruTypeValue == VRMVdd)
867         {
868             sensorPath.append(
869                 "vrm_vdd" + std::to_string(occInstance) + "_temp");
870         }
871         else if (fruTypeValue == processorIoRing)
872         {
873             sensorPath.append(
874                 "proc" + std::to_string(occInstance) + "_ioring_temp");
875             dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
876                            std::to_string(occInstance) + "_ioring_dvfs_temp";
877         }
878         else
879         {
880             uint16_t type = (labelValue & 0xFF000000) >> 24;
881             uint16_t instanceID = labelValue & 0x0000FFFF;
882 
883             if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
884             {
885                 if (fruTypeValue == fruTypeNotAvailable)
886                 {
887                     // Not all DIMM related temps are available to read
888                     // (no _input file in this case)
889                     continue;
890                 }
891                 auto iter = dimmTempSensorName.find(fruTypeValue);
892                 if (iter == dimmTempSensorName.end())
893                 {
894                     log<level::ERR>(
895                         std::format(
896                             "readTempSensors: Fru type error! fruTypeValue = {}) ",
897                             fruTypeValue)
898                             .c_str());
899                     continue;
900                 }
901 
902                 sensorPath.append(
903                     "dimm" + std::to_string(instanceID) + iter->second);
904 
905                 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
906                                dimmDVFSSensorName.at(fruTypeValue);
907             }
908             else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
909             {
910                 if (fruTypeValue == processorCore)
911                 {
912                     // The OCC reports small core temps, of which there are
913                     // two per big core.  All current P10 systems are in big
914                     // core mode, so use a big core name.
915                     uint16_t coreNum = instanceID / 2;
916                     uint16_t tempNum = instanceID % 2;
917                     sensorPath.append("proc" + std::to_string(occInstance) +
918                                       "_core" + std::to_string(coreNum) + "_" +
919                                       std::to_string(tempNum) + "_temp");
920 
921                     dvfsTempPath =
922                         std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
923                         std::to_string(occInstance) + "_core_dvfs_temp";
924                 }
925                 else
926                 {
927                     continue;
928                 }
929             }
930             else
931             {
932                 continue;
933             }
934         }
935 
936         // The dvfs temp file only needs to be read once per chip per type.
937         if (!dvfsTempPath.empty() &&
938             !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
939         {
940             try
941             {
942                 auto dvfsValue = readFile<double>(filePathString + maxSuffix);
943 
944                 dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
945                     dvfsTempPath, dvfsValue * std::pow(10, -3));
946             }
947             catch (const std::system_error& e)
948             {
949                 log<level::DEBUG>(
950                     std::format(
951                         "readTempSensors: Failed reading {}, errno = {}",
952                         filePathString + maxSuffix, e.code().value())
953                         .c_str());
954             }
955         }
956 
957         uint32_t faultValue{0};
958         try
959         {
960             faultValue = readFile<uint32_t>(filePathString + faultSuffix);
961         }
962         catch (const std::system_error& e)
963         {
964             log<level::DEBUG>(
965                 std::format("readTempSensors: Failed reading {}, errno = {}",
966                             filePathString + faultSuffix, e.code().value())
967                     .c_str());
968             continue;
969         }
970 
971         double tempValue{0};
        // NOTE: if OCC sends back 0xFF, kernel sets this fault value to 1.
973         if (faultValue != 0)
974         {
975             tempValue = std::numeric_limits<double>::quiet_NaN();
976         }
977         else
978         {
979             // Read the temperature
980             try
981             {
982                 tempValue = readFile<double>(filePathString + inputSuffix);
983             }
984             catch (const std::system_error& e)
985             {
986                 log<level::DEBUG>(
987                     std::format(
988                         "readTempSensors: Failed reading {}, errno = {}",
989                         filePathString + inputSuffix, e.code().value())
990                         .c_str());
991 
992                 // if errno == EAGAIN(Resource temporarily unavailable) then set
993                 // temp to 0, to avoid using old temp, and affecting FAN
994                 // Control.
995                 if (e.code().value() == EAGAIN)
996                 {
997                     tempValue = 0;
998                 }
999                 // else the errno would be something like
1000                 //     EBADF(Bad file descriptor)
1001                 // or ENOENT(No such file or directory)
1002                 else
1003                 {
1004                     continue;
1005                 }
1006             }
1007         }
1008 
        // If this object path already has a value, only overwrite
        // it if the previous one was a NaN or a smaller value.
1011         auto existing = sensorData.find(sensorPath);
1012         if (existing != sensorData.end())
1013         {
1014             // Multiple sensors found for this FRU type
1015             if ((std::isnan(existing->second) && (tempValue == 0)) ||
1016                 ((existing->second == 0) && std::isnan(tempValue)))
1017             {
1018                 // One of the redundant sensors has failed (0xFF/nan), and the
1019                 // other sensor has no reading (0), so set the FRU to NaN to
1020                 // force fan increase
1021                 tempValue = std::numeric_limits<double>::quiet_NaN();
1022                 existing->second = tempValue;
1023             }
1024             if (std::isnan(existing->second) || (tempValue > existing->second))
1025             {
1026                 existing->second = tempValue;
1027             }
1028         }
1029         else
1030         {
1031             // First sensor for this FRU type
1032             sensorData[sensorPath] = tempValue;
1033         }
1034     }
1035 
1036     // Now publish the values on D-Bus.
1037     for (const auto& [objectPath, value] : sensorData)
1038     {
1039         dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
1040                                                     value * std::pow(10, -3));
1041 
1042         dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1043             objectPath, !std::isnan(value));
1044 
1045         if (existingSensors.find(objectPath) == existingSensors.end())
1046         {
1047             dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1048                 objectPath);
1049         }
1050 
1051         existingSensors[objectPath] = occInstance;
1052     }
1053 }
1054 
1055 std::optional<std::string>
getPowerLabelFunctionID(const std::string & value)1056     Manager::getPowerLabelFunctionID(const std::string& value)
1057 {
1058     // If the value is "system", then the FunctionID is "system".
1059     if (value == "system")
1060     {
1061         return value;
1062     }
1063 
1064     // If the value is not "system", then the label value have 3 numbers, of
1065     // which we only care about the middle one:
1066     // <sensor id>_<function id>_<apss channel>
1067     // eg: The value is "0_10_5" , then the FunctionID is "10".
1068     if (value.find("_") == std::string::npos)
1069     {
1070         return std::nullopt;
1071     }
1072 
1073     auto powerLabelValue = value.substr((value.find("_") + 1));
1074 
1075     if (powerLabelValue.find("_") == std::string::npos)
1076     {
1077         return std::nullopt;
1078     }
1079 
1080     return powerLabelValue.substr(0, powerLabelValue.find("_"));
1081 }
1082 
// readPowerSensors():
// Scans 'path' for power label files, maps each label to a sensor name and
// publishes the matching input reading on D-Bus.
// - path: directory holding the powerN_label / powerN_input files
// - id:   value stored in existingSensors for every sensor published here
// Entries that cannot be read or are not in powerSensorName are skipped.
void Manager::readPowerSensors(const fs::path& path, uint32_t id)
{
    const std::regex labelPattern{"power\\d+_label$"}; // Example: power5_label
    const std::string labelSuffix{"label"};
    // The raw reading is scaled by 10^-3 twice before being published.
    const double scale = std::pow(10, -3);

    for (const auto& entry : fs::directory_iterator(path))
    {
        const std::string labelFile = entry.path().string();
        if (!std::regex_search(labelFile, labelPattern))
        {
            continue;
        }

        std::string labelValue;
        try
        {
            labelValue = readFile<std::string>(labelFile);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            labelFile, e.code().value())
                    .c_str());
            continue;
        }

        const auto functionID = getPowerLabelFunctionID(labelValue);
        if (!functionID)
        {
            continue;
        }

        // Only function IDs with a known sensor name are published.
        const auto nameEntry = powerSensorName.find(*functionID);
        if (nameEntry == powerSensorName.end())
        {
            continue;
        }

        std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
        sensorPath.append(nameEntry->second);

        // Strip the trailing "label" to get the common "powerN_" prefix.
        const std::string filePrefix =
            labelFile.substr(0, labelFile.length() - labelSuffix.length());
        const std::string inputFile = filePrefix + inputSuffix;

        double rawPower{0};
        try
        {
            rawPower = readFile<double>(inputFile);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            inputFile, e.code().value())
                    .c_str());
            continue;
        }

        auto&& occDBus = dbus::OccDBusSensors::getOccDBus();

        occDBus.setUnit(sensorPath,
                        "xyz.openbmc_project.Sensor.Value.Unit.Watts");
        occDBus.setValue(sensorPath, rawPower * scale * scale);
        occDBus.setOperationalStatus(sensorPath, true);

        // The chassis association is only added the first time a sensor path
        // is seen.
        if (existingSensors.find(sensorPath) == existingSensors.end())
        {
            occDBus.setChassisAssociation(sensorPath);
        }

        existingSensors[sensorPath] = id;
    }
}
1160 
setSensorValueToNaN(uint32_t id) const1161 void Manager::setSensorValueToNaN(uint32_t id) const
1162 {
1163     for (const auto& [sensorPath, occId] : existingSensors)
1164     {
1165         if (occId == id)
1166         {
1167             dbus::OccDBusSensors::getOccDBus().setValue(
1168                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1169 
1170             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1171                 sensorPath, true);
1172         }
1173     }
1174     return;
1175 }
1176 
setSensorValueToNonFunctional(uint32_t id) const1177 void Manager::setSensorValueToNonFunctional(uint32_t id) const
1178 {
1179     for (const auto& [sensorPath, occId] : existingSensors)
1180     {
1181         if (occId == id)
1182         {
1183             dbus::OccDBusSensors::getOccDBus().setValue(
1184                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1185 
1186             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1187                 sensorPath, false);
1188         }
1189     }
1190     return;
1191 }
1192 
// getSensorValues():
// Reads the sensors of one OCC from its hwmon directory.
// - Temperature sensors are read for every OCC.
// - Power sensors are only read when the OCC is the master.
// - If the hwmon path does not exist, an error is traced once and not again
//   until the path has been seen to exist.
void Manager::getSensorValues(std::unique_ptr<Status>& occ)
{
    // One "already traced" flag per OCC instance, plus one shared overflow
    // slot.  The instance ID is used as the index, so any ID outside the
    // table is routed to the overflow slot instead of indexing past the end
    // of the array.
    constexpr uint32_t maxTrackedOccs = 8;
    static bool tracedError[maxTrackedOccs + 1] = {};

    const fs::path sensorPath = occ->getHwmonPath();
    const uint32_t id = occ->getOccInstanceID();
    bool& alreadyTraced =
        tracedError[(id < maxTrackedOccs) ? id : maxTrackedOccs];

    if (!fs::exists(sensorPath))
    {
        if (!alreadyTraced)
        {
            log<level::ERR>(
                std::format(
                    "Manager::getSensorValues: OCC{} sensor path missing: {}",
                    id, sensorPath.c_str())
                    .c_str());
            alreadyTraced = true;
        }
        return;
    }

    // Read temperature sensors
    readTempSensors(sensorPath, id);

    if (occ->isMasterOcc())
    {
        // Read power sensors
        readPowerSensors(sensorPath, id);
    }

    // Path is present again, so re-arm the missing-path trace.
    alreadyTraced = false;
}
1226 #endif
1227 
// Read the altitude from DBus
// - A reading below 0xFFFF is stored in 'altitude' (negative values become 0,
//   everything else is rounded to the nearest meter).
// - Any other reading is reported as invalid and 'altitude' is left as is.
// - If the D-Bus read throws, 'altitude' is set to 0xFFFF (not available).
void Manager::readAltitude()
{
    // True while the next failure should be traced; cleared after a failure
    // has been traced and set again by the next good reading.
    static bool traceAltitudeErr = true;

    try
    {
        const utils::PropertyValue property = utils::getProperty(
            ALTITUDE_PATH, ALTITUDE_INTERFACE, ALTITUDE_PROP);
        const auto sensorVal = std::get<double>(property);

        // Written as a negated '<' so that a NaN reading is also rejected.
        if (!(sensorVal < 0xFFFF))
        {
            if (traceAltitudeErr)
            {
                traceAltitudeErr = false;
                log<level::DEBUG>(
                    std::format("Invalid altitude value: {}", sensorVal)
                        .c_str());
            }
            return;
        }

        // Clamp negatives to 0, otherwise round to nearest meter
        altitude = (sensorVal < 0) ? uint16_t{0}
                                   : static_cast<uint16_t>(sensorVal + 0.5);

        log<level::DEBUG>(
            std::format("readAltitude: sensor={} ({}m)", sensorVal, altitude)
                .c_str());
        traceAltitudeErr = true;
    }
    catch (const sdbusplus::exception_t& e)
    {
        if (traceAltitudeErr)
        {
            traceAltitudeErr = false;
            log<level::INFO>(
                std::format("Unable to read Altitude: {}", e.what()).c_str());
        }
        altitude = 0xFFFF; // not available
    }
}
1277 
// Callback function when ambient temperature changes
// Reads the changed properties from 'msg', converts the ambient value to a
// whole number of degrees C (0xFF when the value is NaN) and, when that
// differs from the stored ambient, saves it and passes it on to the OCCs.
void Manager::ambientCallback(sdbusplus::message_t& msg)
{
    std::string msgSensor;
    std::map<std::string, std::variant<double>> msgData;
    msg.read(msgSensor, msgData);

    const auto valPropMap = msgData.find(AMBIENT_PROP);
    if (valPropMap == msgData.end())
    {
        log<level::DEBUG>("ambientCallback: Unknown ambient property changed");
        return;
    }

    const double currentTemp = std::get<double>(valPropMap->second);

    // 0xFF is kept when the reading is NaN.
    uint8_t truncatedTemp = 0xFF;
    if (!std::isnan(currentTemp))
    {
        // Clamp negatives to 0, otherwise round to nearest degree C
        truncatedTemp = (currentTemp < 0)
                            ? uint8_t{0}
                            : static_cast<uint8_t>(currentTemp + 0.5);
    }

    // Nothing to do unless the ambient actually changed
    if (truncatedTemp == ambient)
    {
        return;
    }

    log<level::DEBUG>(
        std::format("ambientCallback: Ambient change from {} to {}C", ambient,
                    currentTemp)
            .c_str());

    ambient = truncatedTemp;
    if (altitude == 0xFFFF)
    {
        // No altitude yet, try reading again
        readAltitude();
    }

    log<level::DEBUG>(
        std::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient,
                    altitude)
            .c_str());
#ifdef POWER10
    // Send ambient and altitude to all OCCs that are active
    for (auto& obj : statusObjects)
    {
        if (obj->occActive())
        {
            obj->sendAmbient(ambient, altitude);
        }
    }
#endif // POWER10
}
1342 
1343 // return the current ambient and altitude readings
getAmbientData(bool & ambientValid,uint8_t & ambientTemp,uint16_t & altitudeValue) const1344 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1345                              uint16_t& altitudeValue) const
1346 {
1347     ambientValid = true;
1348     ambientTemp = ambient;
1349     altitudeValue = altitude;
1350 
1351     if (ambient == 0xFF)
1352     {
1353         ambientValid = false;
1354     }
1355 }
1356 
1357 #ifdef POWER10
// Called when waitForAllOccsTimer expires
// After the first OCC goes active, this timer will be started (60 seconds)
// Traces a warning if the active count differs from the number of status
// objects, then validates the master OCC either way.
void Manager::occsNotAllRunning()
{
    const auto expectedCount = statusObjects.size();
    if (activeCount != expectedCount)
    {
        // Not all OCCs went active.
        // Procs may be garded, so may be expected
        log<level::WARNING>(
            std::format(
                "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})",
                activeCount, expectedCount)
                .c_str());
    }

    validateOccMaster();
}
1375 
1376 #ifdef PLDM
1377 // Called when throttlePldmTraceTimer expires.
1378 // If this timer expires, that indicates there are no OCC active sensor PDRs
1379 // found which will trigger pldm traces to be throttled.
1380 // The second time this timer expires, a PEL will get created.
throttlePldmTraceExpired()1381 void Manager::throttlePldmTraceExpired()
1382 {
1383     if (utils::isHostRunning())
1384     {
1385         if (!onPldmTimeoutCreatePel)
1386         {
1387             // Throttle traces
1388             pldmHandle->setTraceThrottle(true);
1389             // Restart timer to log a PEL when timer expires
1390             onPldmTimeoutCreatePel = true;
1391             throttlePldmTraceTimer->restartOnce(40min);
1392         }
1393         else
1394         {
1395             log<level::ERR>(
1396                 "throttlePldmTraceExpired(): OCC active sensors still not available!");
1397             // Create PEL
1398             createPldmSensorPEL();
1399         }
1400     }
1401     else
1402     {
1403         // Make sure traces are not throttled
1404         pldmHandle->setTraceThrottle(false);
1405         log<level::INFO>(
1406             "throttlePldmTraceExpired(): host it not running ignoring sensor timer");
1407     }
1408 }
1409 
// Create a PEL (severity Warning) reporting that the PLDM sensors for the
// OCCs could not be found.  Journal entries for openpower-occ-control are
// attached as FFDC.  A D-Bus failure is traced and otherwise ignored.
void Manager::createPldmSensorPEL()
{
    namespace LoggingServer = sdbusplus::xyz::openbmc_project::Logging::server;

    Error::Descriptor descriptor(MISSING_OCC_SENSORS_PATH);

    std::map<std::string, std::string> additionalData;
    additionalData.emplace("_PID", std::to_string(getpid()));

    log<level::INFO>(
        "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs");

    auto& bus = utils::getBus();

    try
    {
        static constexpr auto loggingObjectPath =
            "/xyz/openbmc_project/logging";
        static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL";

        FFDCFiles ffdc;
        // Add occ-control journal traces to PEL FFDC.
        // NOTE(review): the returned object is not used directly; presumably
        // it has to stay in scope until the call below completes - confirm.
        auto occJournalFile =
            FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40);

        const std::string service =
            utils::getService(loggingObjectPath, opLoggingInterface);
        auto method =
            bus.new_method_call(service.c_str(), loggingObjectPath,
                                opLoggingInterface, "CreatePELWithFFDCFiles");

        // Set level to Warning (Predictive).
        auto severity = LoggingServer::convertForMessage(
            LoggingServer::Entry::Level::Warning);

        method.append(descriptor.path, severity, additionalData, ffdc);
        bus.call(method);
    }
    catch (const sdbusplus::exception_t& e)
    {
        log<level::ERR>(
            std::format("Failed to create MISSING_OCC_SENSORS PEL: {}",
                        e.what())
                .c_str());
    }
}
1457 #endif // PLDM
1458 #endif // POWER10
1459 
// Verify single master OCC and start presence monitor
// - (POWER10) For each OCC that is not yet active while the host is running:
//   apply a queued active state if one exists, otherwise (PLDM) ask for the
//   active sensor to be checked.  If the host is not running the function
//   returns immediately.
// - A presence watch is added for every OCC that reports being the master.
// - More than one master, or no master at all, results in a reset request
//   via deviceError(); exactly one master is traced and (POWER10) clears
//   safe mode on D-Bus.
void Manager::validateOccMaster()
{
    // Instance of the first master found, or -1 while none has been found
    int masterInstance = -1;
    for (auto& obj : statusObjects)
    {
        auto instance = obj->getOccInstanceID();
#ifdef POWER10
        if (!obj->occActive())
        {
            if (utils::isHostRunning())
            {
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    log<level::INFO>(
                        std::format(
                            "validateOccMaster: OCC{} is ACTIVE (queued)",
                            instance)
                            .c_str());
                    obj->occActive(true);
                }
                else
                {
                    // OCC does not appear to be active yet, check active sensor
#ifdef PLDM
                    pldmHandle->checkActiveSensor(instance);
#endif
                    if (obj->occActive())
                    {
                        log<level::INFO>(
                            std::format(
                                "validateOccMaster: OCC{} is ACTIVE after reading sensor",
                                instance)
                                .c_str());
                    }
                }
            }
            else
            {
                log<level::WARNING>(
                    std::format(
                        "validateOccMaster: HOST is not running (OCC{})",
                        instance)
                        .c_str());
                return;
            }
        }
#endif // POWER10

        if (obj->isMasterOcc())
        {
            obj->addPresenceWatchMaster();

            if (masterInstance == -1)
            {
                masterInstance = instance;
            }
            else
            {
                log<level::ERR>(
                    std::format(
                        "validateOccMaster: Multiple OCC masters! ({} and {})",
                        masterInstance, instance)
                        .c_str());
                // request reset
                obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
            }
        }
    }

    if (masterInstance < 0)
    {
        log<level::ERR>(
            std::format("validateOccMaster: Master OCC not found! (of {} OCCs)",
                        statusObjects.size())
                .c_str());
        // request reset; front() must not be called on an empty container,
        // so the request is only made when there is an OCC to make it on
        if (!statusObjects.empty())
        {
            statusObjects.front()->deviceError(
                Error::Descriptor(PRESENCE_ERROR_PATH));
        }
    }
    else
    {
        log<level::INFO>(
            std::format("validateOccMaster: OCC{} is master of {} OCCs",
                        masterInstance, activeCount)
                .c_str());
#ifdef POWER10
        pmode->updateDbusSafeMode(false);
#endif
    }
}
1554 
updatePcapBounds() const1555 void Manager::updatePcapBounds() const
1556 {
1557     if (pcap)
1558     {
1559         pcap->updatePcapBounds();
1560     }
1561 }
1562 
1563 } // namespace occ
1564 } // namespace open_power
1565