xref: /openbmc/openpower-occ-control/occ_manager.cpp (revision ffb6321e9e5acd2823e19174c28683dee8140d95)
1 #include "config.h"
2 
3 #include "occ_manager.hpp"
4 
5 #include "occ_dbus.hpp"
6 #include "occ_errors.hpp"
7 #include "utils.hpp"
8 
9 #include <nlohmann/json.hpp>
10 #include <phosphor-logging/elog-errors.hpp>
11 #include <phosphor-logging/lg2.hpp>
12 #include <xyz/openbmc_project/Common/error.hpp>
13 
14 #include <chrono>
15 #include <cmath>
16 #include <filesystem>
17 #include <fstream>
18 #include <regex>
19 
20 namespace open_power
21 {
22 namespace occ
23 {
24 
25 constexpr uint32_t fruTypeNotAvailable = 0xFF;
26 constexpr auto fruTypeSuffix = "fru_type";
27 constexpr auto faultSuffix = "fault";
28 constexpr auto inputSuffix = "input";
29 constexpr auto maxSuffix = "max";
30 
31 const auto HOST_ON_FILE = "/run/openbmc/host@0-on";
32 const std::string Manager::dumpFile = "/tmp/occ_control_dump.json";
33 
34 using namespace phosphor::logging;
35 using namespace std::literals::chrono_literals;
36 using json = nlohmann::json;
37 
// Read one value of type T from the start of a file using operator>>.
//
// @tparam T     type to extract from the file
// @param path   file to read
// @return       the extracted value
// @throws std::system_error if the file cannot be opened or no value can
//         be extracted. The error code is errno when the failing call set
//         it, otherwise EIO (e.g. the content could not be parsed as T).
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    // eofbit is intentionally not part of the mask: reaching end-of-file
    // while extracting is normal for a file without a trailing newline and
    // must not turn a successful read into an error.
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
    T data{};

    // Start from a clean errno so a stale value left behind by an unrelated
    // earlier call is never reported as the cause of a failure here.
    errno = 0;

    try
    {
        ifs.open(path);
        ifs >> data;
        ifs.close();
    }
    catch (const std::exception&)
    {
        // Stream failures such as a parse error do not set errno; fall back
        // to a generic I/O error instead of throwing with code 0.
        const int err = errno;
        throw std::system_error((err != 0) ? err : EIO,
                                std::generic_category());
    }

    return data;
}
60 
createPldmHandle()61 void Manager::createPldmHandle()
62 {
63     pldmHandle = std::make_unique<pldm::Interface>(
64         std::bind(std::mem_fn(&Manager::updateOCCActive), this,
65                   std::placeholders::_1, std::placeholders::_2),
66         std::bind(std::mem_fn(&Manager::sbeHRESETResult), this,
67                   std::placeholders::_1, std::placeholders::_2),
68         std::bind(std::mem_fn(&Manager::updateOccSafeMode), this,
69                   std::placeholders::_1),
70         std::bind(std::mem_fn(&Manager::hostPoweredOff), this), event);
71 }
72 
73 // findAndCreateObjects():
74 // Takes care of getting the required objects created and
75 // finds the available devices/processors.
76 // (function is called everytime the discoverTimer expires)
77 // - create the PowerMode object to control OCC modes
78 // - create statusObjects for each OCC device found
79 // - waits for OCC Active sensors PDRs to become available
80 // - restart discoverTimer if all data is not available yet
findAndCreateObjects()81 void Manager::findAndCreateObjects()
82 {
83     if (!pmode)
84     {
85         // Create the power mode object
86         pmode = std::make_unique<powermode::PowerMode>(
87             *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
88     }
89 
90     if (!fs::exists(HOST_ON_FILE))
91     {
92         static bool statusObjCreated = false;
93         if (!statusObjCreated)
94         {
95             // Create the OCCs based on on the /dev/occX devices
96             auto occs = findOCCsInDev();
97 
98             if (occs.empty() || (prevOCCSearch.size() != occs.size()))
99             {
100                 // Something changed or no OCCs yet, try again in 10s.
101                 // Note on the first pass prevOCCSearch will be empty,
102                 // so there will be at least one delay to give things
103                 // a chance to settle.
104                 prevOCCSearch = occs;
105 
106                 lg2::info(
107                     "Manager::findAndCreateObjects(): Waiting for OCCs (currently {QTY})",
108                     "QTY", occs.size());
109 
110                 discoverTimer->restartOnce(10s);
111             }
112             else
113             {
114                 // All OCCs appear to be available, create status objects
115 
116                 // createObjects requires OCC0 first.
117                 std::sort(occs.begin(), occs.end());
118 
119                 lg2::info(
120                     "Manager::findAndCreateObjects(): Creating {QTY} OCC Status Objects",
121                     "QTY", occs.size());
122                 for (auto id : occs)
123                 {
124                     createObjects(std::string(OCC_NAME) + std::to_string(id));
125                 }
126                 statusObjCreated = true;
127                 waitingForAllOccActiveSensors = true;
128 
129                 // Find/update the processor path associated with each OCC
130                 for (auto& obj : statusObjects)
131                 {
132                     obj->updateProcAssociation();
133                 }
134             }
135         }
136 
137         if (statusObjCreated && waitingForAllOccActiveSensors)
138         {
139             static bool tracedHostWait = false;
140             if (utils::isHostRunning())
141             {
142                 if (tracedHostWait)
143                 {
144                     lg2::info(
145                         "Manager::findAndCreateObjects(): Host is running");
146                     tracedHostWait = false;
147                 }
148                 checkAllActiveSensors();
149             }
150             else
151             {
152                 if (!tracedHostWait)
153                 {
154                     lg2::info(
155                         "Manager::findAndCreateObjects(): Waiting for host to start");
156                     tracedHostWait = true;
157                 }
158                 discoverTimer->restartOnce(30s);
159 
160                 if (throttlePldmTraceTimer->isEnabled())
161                 {
162                     // Host is no longer running, disable throttle timer and
163                     // make sure traces are not throttled
164                     lg2::info("findAndCreateObjects(): disabling sensor timer");
165                     throttlePldmTraceTimer->setEnabled(false);
166                     pldmHandle->setTraceThrottle(false);
167                 }
168             }
169         }
170     }
171     else
172     {
173         lg2::info(
174             "Manager::findAndCreateObjects(): Waiting for {FILE} to complete...",
175             "FILE", HOST_ON_FILE);
176         discoverTimer->restartOnce(10s);
177     }
178 }
179 
180 // Check if all occActive sensors are available
checkAllActiveSensors()181 void Manager::checkAllActiveSensors()
182 {
183     static bool allActiveSensorAvailable = false;
184     static bool tracedSensorWait = false;
185     static bool waitingForHost = false;
186 
187     if (open_power::occ::utils::isHostRunning())
188     {
189         if (waitingForHost)
190         {
191             waitingForHost = false;
192             lg2::info("checkAllActiveSensors(): Host is now running");
193         }
194 
195         // Start with the assumption that all are available
196         allActiveSensorAvailable = true;
197         for (auto& obj : statusObjects)
198         {
199             if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
200             {
201                 auto instance = obj->getOccInstanceID();
202                 // Check if sensor was queued while waiting for discovery
203                 auto match = queuedActiveState.find(instance);
204                 if (match != queuedActiveState.end())
205                 {
206                     queuedActiveState.erase(match);
207                     lg2::info(
208                         "checkAllActiveSensors(): OCC{INST} is ACTIVE (queued)",
209                         "INST", instance);
210                     obj->occActive(true);
211                 }
212                 else
213                 {
214                     allActiveSensorAvailable = false;
215                     if (!tracedSensorWait)
216                     {
217                         lg2::info(
218                             "checkAllActiveSensors(): Waiting on OCC{INST} Active sensor",
219                             "INST", instance);
220                         tracedSensorWait = true;
221                         // Make sure PLDM traces are not throttled
222                         pldmHandle->setTraceThrottle(false);
223                         // Start timer to throttle PLDM traces when timer
224                         // expires
225                         onPldmTimeoutCreatePel = false;
226                         throttlePldmTraceTimer->restartOnce(5min);
227                     }
228                     // Ignore active sensor check if the OCCs are being reset
229                     if (!resetInProgress)
230                     {
231                         pldmHandle->checkActiveSensor(obj->getOccInstanceID());
232                     }
233                     break;
234                 }
235             }
236         }
237     }
238     else
239     {
240         if (!waitingForHost)
241         {
242             waitingForHost = true;
243             lg2::info("checkAllActiveSensors(): Waiting for host to start");
244             if (throttlePldmTraceTimer->isEnabled())
245             {
246                 // Host is no longer running, disable throttle timer and
247                 // make sure traces are not throttled
248                 lg2::info("checkAllActiveSensors(): disabling sensor timer");
249                 throttlePldmTraceTimer->setEnabled(false);
250                 pldmHandle->setTraceThrottle(false);
251             }
252         }
253     }
254 
255     if (allActiveSensorAvailable)
256     {
257         // All sensors were found, disable the discovery timer
258         if (discoverTimer->isEnabled())
259         {
260             discoverTimer->setEnabled(false);
261         }
262         if (throttlePldmTraceTimer->isEnabled())
263         {
264             // Disable throttle timer and make sure traces are not throttled
265             throttlePldmTraceTimer->setEnabled(false);
266             pldmHandle->setTraceThrottle(false);
267         }
268         if (waitingForAllOccActiveSensors)
269         {
270             lg2::info(
271                 "checkAllActiveSensors(): OCC Active sensors are available");
272             waitingForAllOccActiveSensors = false;
273 
274             if (resetRequired)
275             {
276                 initiateOccRequest(resetInstance);
277 
278                 if (!waitForAllOccsTimer->isEnabled())
279                 {
280                     lg2::warning(
281                         "occsNotAllRunning: Restarting waitForAllOccTimer");
282                     // restart occ wait timer to check status after reset
283                     // completes
284                     waitForAllOccsTimer->restartOnce(60s);
285                 }
286             }
287         }
288         queuedActiveState.clear();
289         tracedSensorWait = false;
290     }
291     else
292     {
293         // Not all sensors were available, so keep waiting
294         if (!tracedSensorWait)
295         {
296             lg2::info(
297                 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
298             tracedSensorWait = true;
299         }
300         discoverTimer->restartOnce(10s);
301     }
302 }
303 
findOCCsInDev()304 std::vector<int> Manager::findOCCsInDev()
305 {
306     std::vector<int> occs;
307     std::regex expr{R"(occ(\d+)$)"};
308 
309     for (auto& file : fs::directory_iterator("/dev"))
310     {
311         std::smatch match;
312         std::string path{file.path().string()};
313         if (std::regex_search(path, match, expr))
314         {
315             auto num = std::stoi(match[1].str());
316 
317             // /dev numbering starts at 1, ours starts at 0.
318             occs.push_back(num - 1);
319         }
320     }
321 
322     return occs;
323 }
324 
cpuCreated(sdbusplus::message_t & msg)325 int Manager::cpuCreated(sdbusplus::message_t& msg)
326 {
327     namespace fs = std::filesystem;
328 
329     sdbusplus::message::object_path o;
330     msg.read(o);
331     fs::path cpuPath(std::string(std::move(o)));
332 
333     auto name = cpuPath.filename().string();
334     auto index = name.find(CPU_NAME);
335     name.replace(index, std::strlen(CPU_NAME), OCC_NAME);
336 
337     createObjects(name);
338 
339     return 0;
340 }
341 
createObjects(const std::string & occ)342 void Manager::createObjects(const std::string& occ)
343 {
344     auto path = fs::path(OCC_CONTROL_ROOT) / occ;
345 
346     statusObjects.emplace_back(std::make_unique<Status>(
347         event, path.c_str(), *this, pmode,
348         std::bind(std::mem_fn(&Manager::statusCallBack), this,
349                   std::placeholders::_1, std::placeholders::_2),
350         // Callback will set flag indicating reset needs to be done
351         // instead of immediately issuing a reset via PLDM.
352         std::bind(std::mem_fn(&Manager::resetOccRequest), this,
353                   std::placeholders::_1)));
354 
355     // Create the power cap monitor object
356     if (!pcap)
357     {
358         pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
359             *statusObjects.back());
360     }
361 
362     if (statusObjects.back()->isMasterOcc())
363     {
364         lg2::info("Manager::createObjects(): OCC{INST} is the master", "INST",
365                   statusObjects.back()->getOccInstanceID());
366         _pollTimer->setEnabled(false);
367 
368         // Set the master OCC on the PowerMode object
369         pmode->setMasterOcc(path);
370     }
371 
372     passThroughObjects.emplace_back(
373         std::make_unique<PassThrough>(path.c_str(), pmode));
374 }
375 
376 // If a reset is not already outstanding, set a flag to indicate that a reset is
377 // needed.
resetOccRequest(instanceID instance)378 void Manager::resetOccRequest(instanceID instance)
379 {
380     if (!resetRequired)
381     {
382         resetRequired = true;
383         resetInstance = instance;
384         lg2::error(
385             "resetOccRequest: PM Complex reset was requested due to OCC{INST}",
386             "INST", instance);
387     }
388     else if (instance != resetInstance)
389     {
390         lg2::warning(
391             "resetOccRequest: Ignoring PM Complex reset request for OCC{INST}, because reset already outstanding for OCC{RINST}",
392             "INST", instance, "RINST", resetInstance);
393     }
394 }
395 
396 // If a reset has not been started, initiate an OCC reset via PLDM
initiateOccRequest(instanceID instance)397 void Manager::initiateOccRequest(instanceID instance)
398 {
399     if (!resetInProgress)
400     {
401         resetInProgress = true;
402         resetInstance = instance;
403         lg2::error(
404             "initiateOccRequest: Initiating PM Complex reset due to OCC{INST}",
405             "INST", instance);
406 
407         // Make sure ALL OCC comm stops to all OCCs before the reset
408         for (auto& obj : statusObjects)
409         {
410             if (obj->occActive())
411             {
412                 obj->occActive(false);
413             }
414         }
415 
416         pldmHandle->resetOCC(instance);
417         resetRequired = false;
418     }
419     else
420     {
421         lg2::warning(
422             "initiateOccRequest: Ignoring PM Complex reset request for OCC{INST}, because reset already in process for OCC{RINST}",
423             "INST", instance, "RINST", resetInstance);
424     }
425 }
426 
statusCallBack(instanceID instance,bool status)427 void Manager::statusCallBack(instanceID instance, bool status)
428 {
429     if (status == true)
430     {
431         if (resetInProgress)
432         {
433             lg2::info(
434                 "statusCallBack: Ignoring OCC{INST} activate because a reset has been initiated due to OCC{RINST}",
435                 "INST", instance, "RINST", resetInstance);
436             return;
437         }
438 
439         // OCC went active
440         ++activeCount;
441 
442         if (activeCount == 1)
443         {
444             // First OCC went active (allow some time for all OCCs to go active)
445             waitForAllOccsTimer->restartOnce(60s);
446         }
447 
448         if (activeCount == statusObjects.size())
449         {
450             // All OCCs are now running
451             if (waitForAllOccsTimer->isEnabled())
452             {
453                 // stop occ wait timer
454                 waitForAllOccsTimer->setEnabled(false);
455             }
456 
457             // All OCCs have been found, check if we need a reset
458             if (resetRequired)
459             {
460                 initiateOccRequest(resetInstance);
461 
462                 if (!waitForAllOccsTimer->isEnabled())
463                 {
464                     lg2::warning(
465                         "occsNotAllRunning: Restarting waitForAllOccTimer");
466                     // restart occ wait timer
467                     waitForAllOccsTimer->restartOnce(60s);
468                 }
469             }
470             else
471             {
472                 // Verify master OCC and start presence monitor
473                 validateOccMaster();
474             }
475         }
476 
477         // Start poll timer if not already started (since at least one OCC is
478         // running)
479         if (!_pollTimer->isEnabled())
480         {
481             // An OCC just went active, PM Complex is just coming online so
482             // clear any outstanding reset requests
483             if (resetRequired)
484             {
485                 resetRequired = false;
486                 lg2::error(
487                     "statusCallBack: clearing resetRequired (since OCC{INST} went active, resetInProgress={RIP})",
488                     "INST", instance, "RIP", resetInProgress);
489             }
490 
491             lg2::info("Manager: OCCs will be polled every {TIME} seconds",
492                       "TIME", pollInterval);
493 
494             // Send poll and start OCC poll timer
495             pollerTimerExpired();
496         }
497     }
498     else
499     {
500         // OCC went away
501         if (activeCount > 0)
502         {
503             --activeCount;
504         }
505         else
506         {
507             lg2::info("OCC{INST} disabled, and no other OCCs are active",
508                       "INST", instance);
509         }
510 
511         if (activeCount == 0)
512         {
513             // No OCCs are running
514 
515             if (resetInProgress)
516             {
517                 // All OCC active sensors are clear (reset should be in
518                 // progress)
519                 lg2::info(
520                     "statusCallBack: Clearing resetInProgress (activeCount={COUNT}, OCC{INST}, status={STATUS})",
521                     "COUNT", activeCount, "INST", instance, "STATUS", status);
522                 resetInProgress = false;
523                 resetInstance = 255;
524             }
525 
526             // Stop OCC poll timer
527             if (_pollTimer->isEnabled())
528             {
529                 lg2::info(
530                     "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
531                 _pollTimer->setEnabled(false);
532             }
533 
534             // stop wait timer
535             if (waitForAllOccsTimer->isEnabled())
536             {
537                 waitForAllOccsTimer->setEnabled(false);
538             }
539         }
540         else if (resetInProgress)
541         {
542             lg2::info(
543                 "statusCallBack: Skipping clear of resetInProgress (activeCount={COUNT}, OCC{INST}, status={STATUS})",
544                 "COUNT", activeCount, "INST", instance, "STATUS", status);
545         }
546         // Clear OCC sensors
547         setSensorValueToNaN(instance);
548     }
549 
550     if (waitingForAllOccActiveSensors)
551     {
552         if (utils::isHostRunning())
553         {
554             checkAllActiveSensors();
555         }
556     }
557 }
558 
sbeTimeout(unsigned int instance)559 void Manager::sbeTimeout(unsigned int instance)
560 {
561     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
562                             [instance](const auto& obj) {
563                                 return instance == obj->getOccInstanceID();
564                             });
565 
566     if (obj != statusObjects.end() && (*obj)->occActive())
567     {
568         lg2::info("SBE timeout, requesting HRESET (OCC{INST})", "INST",
569                   instance);
570 
571 #ifdef PHAL_SUPPORT
572         setSBEState(instance, SBE_STATE_NOT_USABLE);
573 #endif
574 
575         // Stop communication with this OCC
576         (*obj)->occActive(false);
577 
578         pldmHandle->sendHRESET(instance);
579     }
580 }
581 
updateOCCActive(instanceID instance,bool status)582 bool Manager::updateOCCActive(instanceID instance, bool status)
583 {
584     auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
585                             [instance](const auto& obj) {
586                                 return instance == obj->getOccInstanceID();
587                             });
588 
589     const bool hostRunning = open_power::occ::utils::isHostRunning();
590     if (obj != statusObjects.end())
591     {
592         if (!hostRunning && (status == true))
593         {
594             lg2::warning(
595                 "updateOCCActive: Host is not running yet (OCC{INST} active={STAT}), clearing sensor received",
596                 "INST", instance, "STAT", status);
597             (*obj)->setPldmSensorReceived(false);
598             if (!waitingForAllOccActiveSensors)
599             {
600                 lg2::info(
601                     "updateOCCActive: Waiting for Host and all OCC Active Sensors");
602                 waitingForAllOccActiveSensors = true;
603             }
604             discoverTimer->restartOnce(30s);
605             return false;
606         }
607         else
608         {
609             (*obj)->setPldmSensorReceived(true);
610             return (*obj)->occActive(status);
611         }
612     }
613     else
614     {
615         if (hostRunning)
616         {
617             lg2::warning(
618                 "updateOCCActive: No status object to update for OCC{INST} (active={STAT})",
619                 "INST", instance, "STAT", status);
620         }
621         else
622         {
623             if (status == true)
624             {
625                 lg2::warning(
626                     "updateOCCActive: No status objects and Host is not running yet (OCC{INST} active={STAT})",
627                     "INST", instance, "STAT", status);
628             }
629         }
630         if (status == true)
631         {
632             // OCC went active
633             queuedActiveState.insert(instance);
634         }
635         else
636         {
637             auto match = queuedActiveState.find(instance);
638             if (match != queuedActiveState.end())
639             {
640                 // OCC was disabled
641                 queuedActiveState.erase(match);
642             }
643         }
644         return false;
645     }
646 }
647 
648 // Called upon pldm event To set powermode Safe Mode State for system.
updateOccSafeMode(bool safeMode)649 void Manager::updateOccSafeMode(bool safeMode)
650 {
651     pmode->updateDbusSafeMode(safeMode);
652     // Update the processor throttle status on dbus
653     for (auto& obj : statusObjects)
654     {
655         obj->updateThrottle(safeMode, THROTTLED_SAFE);
656     }
657 }
658 
sbeHRESETResult(instanceID instance,bool success)659 void Manager::sbeHRESETResult(instanceID instance, bool success)
660 {
661     if (success)
662     {
663         lg2::info("HRESET succeeded (OCC{INST})", "INST", instance);
664 
665 #ifdef PHAL_SUPPORT
666         setSBEState(instance, SBE_STATE_BOOTED);
667 #endif
668 
669         // Re-enable communication with this OCC
670         auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
671                                 [instance](const auto& obj) {
672                                     return instance == obj->getOccInstanceID();
673                                 });
674         if (obj != statusObjects.end() && (!(*obj)->occActive()))
675         {
676             (*obj)->occActive(true);
677         }
678 
679         return;
680     }
681 
682 #ifdef PHAL_SUPPORT
683     setSBEState(instance, SBE_STATE_FAILED);
684 
685     if (sbeCanDump(instance))
686     {
687         lg2::info("HRESET failed (OCC{INST}), triggering SBE dump", "INST",
688                   instance);
689 
690         auto& bus = utils::getBus();
691         uint32_t src6 = instance << 16;
692         uint32_t logId =
693             FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
694                             src6, "SBE command timeout");
695 
696         try
697         {
698             constexpr auto interface = "xyz.openbmc_project.Dump.Create";
699             constexpr auto function = "CreateDump";
700 
701             std::string service =
702                 utils::getService(OP_DUMP_OBJ_PATH, interface);
703             auto method = bus.new_method_call(service.c_str(), OP_DUMP_OBJ_PATH,
704                                               interface, function);
705 
706             std::map<std::string, std::variant<std::string, uint64_t>>
707                 createParams{
708                     {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
709                      uint64_t(logId)},
710                     {"com.ibm.Dump.Create.CreateParameters.DumpType",
711                      "com.ibm.Dump.Create.DumpType.SBE"},
712                     {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
713                      uint64_t(instance)},
714                 };
715 
716             method.append(createParams);
717 
718             auto response = bus.call(method);
719         }
720         catch (const sdbusplus::exception_t& e)
721         {
722             constexpr auto ERROR_DUMP_DISABLED =
723                 "xyz.openbmc_project.Dump.Create.Error.Disabled";
724             if (e.name() == ERROR_DUMP_DISABLED)
725             {
726                 lg2::info("Dump is disabled, skipping");
727             }
728             else
729             {
730                 lg2::error("Dump failed");
731             }
732         }
733     }
734 #endif
735 
736     // SBE Reset failed, try PM Complex reset
737     lg2::error("sbeHRESETResult: Forcing PM Complex reset");
738     resetOccRequest(instance);
739 }
740 
741 #ifdef PHAL_SUPPORT
sbeCanDump(unsigned int instance)742 bool Manager::sbeCanDump(unsigned int instance)
743 {
744     struct pdbg_target* proc = getPdbgTarget(instance);
745 
746     if (!proc)
747     {
748         // allow the dump in the error case
749         return true;
750     }
751 
752     try
753     {
754         if (!openpower::phal::sbe::isDumpAllowed(proc))
755         {
756             return false;
757         }
758 
759         if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
760         {
761             return false;
762         }
763     }
764     catch (openpower::phal::exception::SbeError& e)
765     {
766         lg2::info("Failed to query SBE state");
767     }
768 
769     // allow the dump in the error case
770     return true;
771 }
772 
setSBEState(unsigned int instance,enum sbe_state state)773 void Manager::setSBEState(unsigned int instance, enum sbe_state state)
774 {
775     struct pdbg_target* proc = getPdbgTarget(instance);
776 
777     if (!proc)
778     {
779         return;
780     }
781 
782     try
783     {
784         openpower::phal::sbe::setState(proc, state);
785     }
786     catch (const openpower::phal::exception::SbeError& e)
787     {
788         lg2::error("Failed to set SBE state: {ERROR}", "ERROR", e.what());
789     }
790 }
791 
getPdbgTarget(unsigned int instance)792 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
793 {
794     if (!pdbgInitialized)
795     {
796         try
797         {
798             openpower::phal::pdbg::init();
799             pdbgInitialized = true;
800         }
801         catch (const openpower::phal::exception::PdbgError& e)
802         {
803             lg2::error("pdbg initialization failed");
804             return nullptr;
805         }
806     }
807 
808     struct pdbg_target* proc = nullptr;
809     pdbg_for_each_class_target("proc", proc)
810     {
811         if (pdbg_target_index(proc) == instance)
812         {
813             return proc;
814         }
815     }
816 
817     lg2::error("Failed to get pdbg target");
818     return nullptr;
819 }
820 #endif
821 
pollerTimerExpired()822 void Manager::pollerTimerExpired()
823 {
824     if (!_pollTimer)
825     {
826         lg2::error("pollerTimerExpired() ERROR: Timer not defined");
827         return;
828     }
829 
830     if (resetRequired)
831     {
832         lg2::error("pollerTimerExpired() - Initiating PM Complex reset");
833         initiateOccRequest(resetInstance);
834 
835         if (!waitForAllOccsTimer->isEnabled())
836         {
837             lg2::warning("pollerTimerExpired: Restarting waitForAllOccTimer");
838             // restart occ wait timer
839             waitForAllOccsTimer->restartOnce(60s);
840         }
841         return;
842     }
843 
844     for (auto& obj : statusObjects)
845     {
846         if (!obj->occActive())
847         {
848             // OCC is not running yet
849             auto id = obj->getOccInstanceID();
850             setSensorValueToNaN(id);
851             continue;
852         }
853 
854         // Read sysfs to force kernel to poll OCC
855         obj->readOccState();
856 
857         // Read occ sensor values
858         getSensorValues(obj);
859     }
860 
861     if (activeCount > 0)
862     {
863         // Restart OCC poll timer
864         _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
865     }
866     else
867     {
868         // No OCCs running, so poll timer will not be restarted
869         lg2::info(
870             "Manager::pollerTimerExpired: poll timer will not be restarted");
871     }
872 }
873 
readTempSensors(const fs::path & path,uint32_t occInstance)874 void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
875 {
876     // There may be more than one sensor with the same FRU type
877     // and label so make two passes: the first to read the temps
878     // from sysfs, and the second to put them on D-Bus after
879     // resolving any conflicts.
880     std::map<std::string, double> sensorData;
881 
882     std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
883     for (auto& file : fs::directory_iterator(path))
884     {
885         if (!std::regex_search(file.path().string(), expr))
886         {
887             continue;
888         }
889 
890         uint32_t labelValue{0};
891 
892         try
893         {
894             labelValue = readFile<uint32_t>(file.path());
895         }
896         catch (const std::system_error& e)
897         {
898             lg2::debug(
899                 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
900                 "PATH", file.path().string(), "ERROR", e.code().value());
901             continue;
902         }
903 
904         const std::string& tempLabel = "label";
905         const std::string filePathString = file.path().string().substr(
906             0, file.path().string().length() - tempLabel.length());
907 
908         uint32_t fruTypeValue{0};
909         try
910         {
911             fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
912         }
913         catch (const std::system_error& e)
914         {
915             lg2::debug(
916                 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
917                 "PATH", filePathString + fruTypeSuffix, "ERROR",
918                 e.code().value());
919             continue;
920         }
921 
922         std::string sensorPath =
923             OCC_SENSORS_ROOT + std::string("/temperature/");
924 
925         std::string dvfsTempPath;
926 
927         if (fruTypeValue == VRMVdd)
928         {
929             sensorPath.append(
930                 "vrm_vdd" + std::to_string(occInstance) + "_temp");
931         }
932         else if (fruTypeValue == processorIoRing)
933         {
934             sensorPath.append(
935                 "proc" + std::to_string(occInstance) + "_ioring_temp");
936             dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
937                            std::to_string(occInstance) + "_ioring_dvfs_temp";
938         }
939         else
940         {
941             uint16_t type = (labelValue & 0xFF000000) >> 24;
942             uint16_t instanceID = labelValue & 0x0000FFFF;
943 
944             if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
945             {
946                 if (fruTypeValue == fruTypeNotAvailable)
947                 {
948                     // Not all DIMM related temps are available to read
949                     // (no _input file in this case)
950                     continue;
951                 }
952                 auto iter = dimmTempSensorName.find(fruTypeValue);
953                 if (iter == dimmTempSensorName.end())
954                 {
955                     lg2::error(
956                         "readTempSensors: Fru type error! fruTypeValue = {FRU}) ",
957                         "FRU", fruTypeValue);
958                     continue;
959                 }
960 
961                 sensorPath.append(
962                     "dimm" + std::to_string(instanceID) + iter->second);
963 
964                 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
965                                dimmDVFSSensorName.at(fruTypeValue);
966             }
967             else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
968             {
969                 if (fruTypeValue == processorCore)
970                 {
971                     // The OCC reports small core temps, of which there are
972                     // two per big core.  All current P10 systems are in big
973                     // core mode, so use a big core name.
974                     uint16_t coreNum = instanceID / 2;
975                     uint16_t tempNum = instanceID % 2;
976                     sensorPath.append("proc" + std::to_string(occInstance) +
977                                       "_core" + std::to_string(coreNum) + "_" +
978                                       std::to_string(tempNum) + "_temp");
979 
980                     dvfsTempPath =
981                         std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
982                         std::to_string(occInstance) + "_core_dvfs_temp";
983                 }
984                 else
985                 {
986                     continue;
987                 }
988             }
989             else
990             {
991                 continue;
992             }
993         }
994 
995         // The dvfs temp file only needs to be read once per chip per type.
996         if (!dvfsTempPath.empty() &&
997             !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
998         {
999             try
1000             {
1001                 auto dvfsValue = readFile<double>(filePathString + maxSuffix);
1002 
1003                 dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
1004                     dvfsTempPath, dvfsValue * std::pow(10, -3));
1005             }
1006             catch (const std::system_error& e)
1007             {
1008                 lg2::debug(
1009                     "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1010                     "PATH", filePathString + maxSuffix, "ERROR",
1011                     e.code().value());
1012             }
1013         }
1014 
1015         uint32_t faultValue{0};
1016         try
1017         {
1018             faultValue = readFile<uint32_t>(filePathString + faultSuffix);
1019         }
1020         catch (const std::system_error& e)
1021         {
1022             lg2::debug(
1023                 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1024                 "PATH", filePathString + faultSuffix, "ERROR",
1025                 e.code().value());
1026             continue;
1027         }
1028 
1029         double tempValue{0};
1030         // NOTE: if OCC sends back 0xFF, kernal sets this fault value to 1.
1031         if (faultValue != 0)
1032         {
1033             tempValue = std::numeric_limits<double>::quiet_NaN();
1034         }
1035         else
1036         {
1037             // Read the temperature
1038             try
1039             {
1040                 tempValue = readFile<double>(filePathString + inputSuffix);
1041             }
1042             catch (const std::system_error& e)
1043             {
1044                 lg2::debug(
1045                     "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1046                     "PATH", filePathString + inputSuffix, "ERROR",
1047                     e.code().value());
1048 
1049                 // if errno == EAGAIN(Resource temporarily unavailable) then set
1050                 // temp to 0, to avoid using old temp, and affecting FAN
1051                 // Control.
1052                 if (e.code().value() == EAGAIN)
1053                 {
1054                     tempValue = 0;
1055                 }
1056                 // else the errno would be something like
1057                 //     EBADF(Bad file descriptor)
1058                 // or ENOENT(No such file or directory)
1059                 else
1060                 {
1061                     continue;
1062                 }
1063             }
1064         }
1065 
1066         // If this object path already has a value, only overwite
1067         // it if the previous one was an NaN or a smaller value.
1068         auto existing = sensorData.find(sensorPath);
1069         if (existing != sensorData.end())
1070         {
1071             // Multiple sensors found for this FRU type
1072             if ((std::isnan(existing->second) && (tempValue == 0)) ||
1073                 ((existing->second == 0) && std::isnan(tempValue)))
1074             {
1075                 // One of the redundant sensors has failed (0xFF/nan), and the
1076                 // other sensor has no reading (0), so set the FRU to NaN to
1077                 // force fan increase
1078                 tempValue = std::numeric_limits<double>::quiet_NaN();
1079                 existing->second = tempValue;
1080             }
1081             if (std::isnan(existing->second) || (tempValue > existing->second))
1082             {
1083                 existing->second = tempValue;
1084             }
1085         }
1086         else
1087         {
1088             // First sensor for this FRU type
1089             sensorData[sensorPath] = tempValue;
1090         }
1091     }
1092 
1093     // Now publish the values on D-Bus.
1094     for (const auto& [objectPath, value] : sensorData)
1095     {
1096         dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
1097                                                     value * std::pow(10, -3));
1098 
1099         dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1100             objectPath, !std::isnan(value));
1101 
1102         if (existingSensors.find(objectPath) == existingSensors.end())
1103         {
1104             dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1105                 objectPath, {"all_sensors"});
1106         }
1107         existingSensors[objectPath] = occInstance;
1108     }
1109 }
1110 
getPowerLabelFunctionID(const std::string & value)1111 std::optional<std::string> Manager::getPowerLabelFunctionID(
1112     const std::string& value)
1113 {
1114     // If the value is "system", then the FunctionID is "system".
1115     if (value == "system")
1116     {
1117         return value;
1118     }
1119 
1120     // If the value is not "system", then the label value have 3 numbers, of
1121     // which we only care about the middle one:
1122     // <sensor id>_<function id>_<apss channel>
1123     // eg: The value is "0_10_5" , then the FunctionID is "10".
1124     if (value.find("_") == std::string::npos)
1125     {
1126         return std::nullopt;
1127     }
1128 
1129     auto powerLabelValue = value.substr((value.find("_") + 1));
1130 
1131     if (powerLabelValue.find("_") == std::string::npos)
1132     {
1133         return std::nullopt;
1134     }
1135 
1136     return powerLabelValue.substr(0, powerLabelValue.find("_"));
1137 }
1138 
readPowerSensors(const fs::path & path,uint32_t id)1139 void Manager::readPowerSensors(const fs::path& path, uint32_t id)
1140 {
1141     std::regex expr{"power\\d+_label$"}; // Example: power5_label
1142     for (auto& file : fs::directory_iterator(path))
1143     {
1144         if (!std::regex_search(file.path().string(), expr))
1145         {
1146             continue;
1147         }
1148 
1149         std::string labelValue;
1150         try
1151         {
1152             labelValue = readFile<std::string>(file.path());
1153         }
1154         catch (const std::system_error& e)
1155         {
1156             lg2::debug(
1157                 "readPowerSensors: Failed reading {PATH}, errno = {ERROR}",
1158                 "PATH", file.path().string(), "ERROR", e.code().value());
1159             continue;
1160         }
1161 
1162         auto functionID = getPowerLabelFunctionID(labelValue);
1163         if (functionID == std::nullopt)
1164         {
1165             continue;
1166         }
1167 
1168         const std::string& tempLabel = "label";
1169         const std::string filePathString = file.path().string().substr(
1170             0, file.path().string().length() - tempLabel.length());
1171 
1172         std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
1173 
1174         auto iter = powerSensorName.find(*functionID);
1175         if (iter == powerSensorName.end())
1176         {
1177             continue;
1178         }
1179         sensorPath.append(iter->second);
1180 
1181         double tempValue{0};
1182 
1183         try
1184         {
1185             tempValue = readFile<double>(filePathString + inputSuffix);
1186         }
1187         catch (const std::system_error& e)
1188         {
1189             lg2::debug(
1190                 "readPowerSensors: Failed reading {PATH}, errno = {ERROR}",
1191                 "PATH", filePathString + inputSuffix, "ERROR",
1192                 e.code().value());
1193             continue;
1194         }
1195 
1196         dbus::OccDBusSensors::getOccDBus().setUnit(
1197             sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");
1198 
1199         dbus::OccDBusSensors::getOccDBus().setValue(
1200             sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3));
1201 
1202         dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1203             sensorPath, true);
1204 
1205         if (existingSensors.find(sensorPath) == existingSensors.end())
1206         {
1207             std::vector<std::string> fTypeList = {"all_sensors"};
1208             if (iter->second == "total_power")
1209             {
1210                 // Set sensor purpose as TotalPower
1211                 dbus::OccDBusSensors::getOccDBus().setPurpose(
1212                     sensorPath,
1213                     "xyz.openbmc_project.Sensor.Purpose.SensorPurpose.TotalPower");
1214             }
1215             dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1216                 sensorPath, fTypeList);
1217         }
1218         existingSensors[sensorPath] = id;
1219     }
1220     return;
1221 }
1222 
readExtnSensors(const fs::path & path,uint32_t id)1223 void Manager::readExtnSensors(const fs::path& path, uint32_t id)
1224 {
1225     std::regex expr{"extn\\d+_label$"}; // Example: extn5_label
1226     for (auto& file : fs::directory_iterator(path))
1227     {
1228         if (!std::regex_search(file.path().string(), expr))
1229         {
1230             continue;
1231         }
1232 
1233         // Read in Label value of the sensor from file.
1234         std::string labelValue;
1235         try
1236         {
1237             labelValue = readFile<std::string>(file.path());
1238         }
1239         catch (const std::system_error& e)
1240         {
1241             lg2::debug(
1242                 "readExtnSensors:label Failed reading {PATH}, errno = {ERROR}",
1243                 "PATH", file.path().string(), "ERROR", e.code().value());
1244             continue;
1245         }
1246         const std::string& tempLabel = "label";
1247         const std::string filePathString = file.path().string().substr(
1248             0, file.path().string().length() - tempLabel.length());
1249 
1250         std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
1251 
1252         // Labels of EXTN sections from OCC interface Document
1253         //     have different formats.
1254         // 0x464d494e : FMIN            0x46444953 : FDIS
1255         // 0x46424153 : FBAS            0x46555400 : FUT
1256         // 0x464d4158 : FMAX            0x434c4950 : CLIP
1257         // 0x4d4f4445 : MODE            0x574f4643 : WOFC
1258         // 0x574f4649 : WOFI            0x5057524d : PWRM
1259         // 0x50575250 : PWRP            0x45525248 : ERRH
1260         // Label indicating byte 5 and 6 is the current (mem,proc) power in
1261         //      Watts.
1262         if ((labelValue == EXTN_LABEL_PWRM_MEMORY_POWER) ||
1263             (labelValue == EXTN_LABEL_PWRP_PROCESSOR_POWER))
1264         {
1265             // Build the dbus String for this chiplet power asset.
1266             if (labelValue == EXTN_LABEL_PWRP_PROCESSOR_POWER)
1267             {
1268                 labelValue = "_power";
1269             }
1270             else // else EXTN_LABEL_PWRM_MEMORY_POWER
1271             {
1272                 labelValue = "_mem_power";
1273             }
1274             sensorPath.append("chiplet" + std::to_string(id) + labelValue);
1275 
1276             // Read in data value of the sensor from file.
1277             // Read in as string due to different format of data in sensors.
1278             std::string extnValue;
1279             try
1280             {
1281                 extnValue = readFile<std::string>(filePathString + inputSuffix);
1282             }
1283             catch (const std::system_error& e)
1284             {
1285                 lg2::debug(
1286                     "readExtnSensors:value Failed reading {PATH}, errno = {ERROR}",
1287                     "PATH", filePathString + inputSuffix, "ERROR",
1288                     e.code().value());
1289                 continue;
1290             }
1291 
1292             // For Power field, Convert last 4 bytes of hex string into number
1293             //   value.
1294             std::stringstream ssData;
1295             ssData << std::hex << extnValue.substr(extnValue.length() - 4);
1296             uint16_t MyHexNumber;
1297             ssData >> MyHexNumber;
1298 
1299             // Convert output/DC power to input/AC power in Watts (round up)
1300             MyHexNumber =
1301                 std::round(((MyHexNumber / (PS_DERATING_FACTOR / 100.0))));
1302 
1303             dbus::OccDBusSensors::getOccDBus().setUnit(
1304                 sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");
1305 
1306             dbus::OccDBusSensors::getOccDBus().setValue(sensorPath,
1307                                                         MyHexNumber);
1308 
1309             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1310                 sensorPath, true);
1311 
1312             if (existingSensors.find(sensorPath) == existingSensors.end())
1313             {
1314                 dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1315                     sensorPath, {"all_sensors"});
1316             }
1317 
1318             existingSensors[sensorPath] = id;
1319         } // End Extended Power Sensors.
1320     } // End For loop on files for Extended Sensors.
1321     return;
1322 }
1323 
setSensorValueToNaN(uint32_t id) const1324 void Manager::setSensorValueToNaN(uint32_t id) const
1325 {
1326     for (const auto& [sensorPath, occId] : existingSensors)
1327     {
1328         if (occId == id)
1329         {
1330             dbus::OccDBusSensors::getOccDBus().setValue(
1331                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1332 
1333             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1334                 sensorPath, true);
1335         }
1336     }
1337     return;
1338 }
1339 
setSensorValueToNonFunctional(uint32_t id) const1340 void Manager::setSensorValueToNonFunctional(uint32_t id) const
1341 {
1342     for (const auto& [sensorPath, occId] : existingSensors)
1343     {
1344         if (occId == id)
1345         {
1346             dbus::OccDBusSensors::getOccDBus().setValue(
1347                 sensorPath, std::numeric_limits<double>::quiet_NaN());
1348 
1349             dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1350                 sensorPath, false);
1351         }
1352     }
1353     return;
1354 }
1355 
getSensorValues(std::unique_ptr<Status> & occ)1356 void Manager::getSensorValues(std::unique_ptr<Status>& occ)
1357 {
1358     static bool tracedError[8] = {0};
1359     const fs::path sensorPath = occ->getHwmonPath();
1360     const uint32_t id = occ->getOccInstanceID();
1361 
1362     if (fs::exists(sensorPath))
1363     {
1364         // Read temperature sensors
1365         readTempSensors(sensorPath, id);
1366         // Read Extended sensors
1367         readExtnSensors(sensorPath, id);
1368 
1369         if (occ->isMasterOcc())
1370         {
1371             // Read power sensors
1372             readPowerSensors(sensorPath, id);
1373         }
1374         tracedError[id] = false;
1375     }
1376     else
1377     {
1378         if (!tracedError[id])
1379         {
1380             lg2::error(
1381                 "Manager::getSensorValues: OCC{INST} sensor path missing: {PATH}",
1382                 "INST", id, "PATH", sensorPath);
1383             tracedError[id] = true;
1384         }
1385     }
1386 
1387     return;
1388 }
1389 
1390 // Read the altitude from DBus
readAltitude()1391 void Manager::readAltitude()
1392 {
1393     static bool traceAltitudeErr = true;
1394 
1395     utils::PropertyValue altitudeProperty{};
1396     try
1397     {
1398         altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE,
1399                                               ALTITUDE_PROP);
1400         auto sensorVal = std::get<double>(altitudeProperty);
1401         if (sensorVal < 0xFFFF)
1402         {
1403             if (sensorVal < 0)
1404             {
1405                 altitude = 0;
1406             }
1407             else
1408             {
1409                 // Round to nearest meter
1410                 altitude = uint16_t(sensorVal + 0.5);
1411             }
1412             lg2::debug("readAltitude: sensor={VALUE} ({ALT}m)", "VALUE",
1413                        sensorVal, "ALT", altitude);
1414             traceAltitudeErr = true;
1415         }
1416         else
1417         {
1418             if (traceAltitudeErr)
1419             {
1420                 traceAltitudeErr = false;
1421                 lg2::debug("Invalid altitude value: {ALT}", "ALT", sensorVal);
1422             }
1423         }
1424     }
1425     catch (const sdbusplus::exception_t& e)
1426     {
1427         if (traceAltitudeErr)
1428         {
1429             traceAltitudeErr = false;
1430             lg2::info("Unable to read Altitude: {ERROR}", "ERROR", e.what());
1431         }
1432         altitude = 0xFFFF; // not available
1433     }
1434 }
1435 
1436 // Callback function when ambient temperature changes
ambientCallback(sdbusplus::message_t & msg)1437 void Manager::ambientCallback(sdbusplus::message_t& msg)
1438 {
1439     double currentTemp = 0;
1440     uint8_t truncatedTemp = 0xFF;
1441     std::string msgSensor;
1442     std::map<std::string, std::variant<double>> msgData;
1443     msg.read(msgSensor, msgData);
1444 
1445     auto valPropMap = msgData.find(AMBIENT_PROP);
1446     if (valPropMap == msgData.end())
1447     {
1448         lg2::debug("ambientCallback: Unknown ambient property changed");
1449         return;
1450     }
1451     currentTemp = std::get<double>(valPropMap->second);
1452     if (std::isnan(currentTemp))
1453     {
1454         truncatedTemp = 0xFF;
1455     }
1456     else
1457     {
1458         if (currentTemp < 0)
1459         {
1460             truncatedTemp = 0;
1461         }
1462         else
1463         {
1464             // Round to nearest degree C
1465             truncatedTemp = uint8_t(currentTemp + 0.5);
1466         }
1467     }
1468 
1469     // If ambient changes, notify OCCs
1470     if (truncatedTemp != ambient)
1471     {
1472         lg2::debug("ambientCallback: Ambient change from {OLD} to {NEW}C",
1473                    "OLD", ambient, "NEW", currentTemp);
1474 
1475         ambient = truncatedTemp;
1476         if (altitude == 0xFFFF)
1477         {
1478             // No altitude yet, try reading again
1479             readAltitude();
1480         }
1481 
1482         lg2::debug("ambientCallback: Ambient: {TEMP}C, altitude: {ALT}m",
1483                    "TEMP", ambient, "ALT", altitude);
1484 
1485         // Send ambient and altitude to all OCCs
1486         for (auto& obj : statusObjects)
1487         {
1488             if (obj->occActive())
1489             {
1490                 obj->sendAmbient(ambient, altitude);
1491             }
1492         }
1493     }
1494 }
1495 
1496 // return the current ambient and altitude readings
getAmbientData(bool & ambientValid,uint8_t & ambientTemp,uint16_t & altitudeValue) const1497 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1498                              uint16_t& altitudeValue) const
1499 {
1500     ambientValid = true;
1501     ambientTemp = ambient;
1502     altitudeValue = altitude;
1503 
1504     if (ambient == 0xFF)
1505     {
1506         ambientValid = false;
1507     }
1508 }
1509 
1510 // Called when waitForAllOccsTimer expires
1511 // After the first OCC goes active, this timer will be started (60 seconds)
occsNotAllRunning()1512 void Manager::occsNotAllRunning()
1513 {
1514     if (resetInProgress)
1515     {
1516         lg2::warning(
1517             "occsNotAllRunning: Ignoring waitForAllOccsTimer because reset is in progress");
1518         return;
1519     }
1520     if (activeCount != statusObjects.size())
1521     {
1522         // Not all OCCs went active
1523         lg2::warning(
1524             "occsNotAllRunning: Active OCC count ({COUNT}) does not match expected count ({EXP})",
1525             "COUNT", activeCount, "EXP", statusObjects.size());
1526         // Procs may be garded, so may be expected
1527     }
1528 
1529     if (resetRequired)
1530     {
1531         initiateOccRequest(resetInstance);
1532 
1533         if (!waitForAllOccsTimer->isEnabled())
1534         {
1535             lg2::warning("occsNotAllRunning: Restarting waitForAllOccTimer");
1536             // restart occ wait timer
1537             waitForAllOccsTimer->restartOnce(60s);
1538         }
1539     }
1540     else
1541     {
1542         validateOccMaster();
1543     }
1544 }
1545 
1546 // Called when throttlePldmTraceTimer expires.
1547 // If this timer expires, that indicates there are no OCC active sensor PDRs
1548 // found which will trigger pldm traces to be throttled.
1549 // The second time this timer expires, a PEL will get created.
throttlePldmTraceExpired()1550 void Manager::throttlePldmTraceExpired()
1551 {
1552     if (utils::isHostRunning())
1553     {
1554         if (!onPldmTimeoutCreatePel)
1555         {
1556             // Throttle traces
1557             pldmHandle->setTraceThrottle(true);
1558             // Restart timer to log a PEL when timer expires
1559             onPldmTimeoutCreatePel = true;
1560             throttlePldmTraceTimer->restartOnce(40min);
1561         }
1562         else
1563         {
1564             lg2::error(
1565                 "throttlePldmTraceExpired(): OCC active sensors still not available!");
1566             // Create PEL
1567             createPldmSensorPEL();
1568         }
1569     }
1570     else
1571     {
1572         // Make sure traces are not throttled
1573         pldmHandle->setTraceThrottle(false);
1574         lg2::info(
1575             "throttlePldmTraceExpired(): host it not running ignoring sensor timer");
1576     }
1577 }
1578 
createPldmSensorPEL()1579 void Manager::createPldmSensorPEL()
1580 {
1581     Error::Descriptor d = Error::Descriptor(MISSING_OCC_SENSORS_PATH);
1582     std::map<std::string, std::string> additionalData;
1583 
1584     additionalData.emplace("_PID", std::to_string(getpid()));
1585 
1586     lg2::info(
1587         "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs");
1588 
1589     auto& bus = utils::getBus();
1590 
1591     try
1592     {
1593         FFDCFiles ffdc;
1594         // Add occ-control journal traces to PEL FFDC
1595         auto occJournalFile =
1596             FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40);
1597 
1598         static constexpr auto loggingObjectPath =
1599             "/xyz/openbmc_project/logging";
1600         static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL";
1601         std::string service =
1602             utils::getService(loggingObjectPath, opLoggingInterface);
1603         auto method =
1604             bus.new_method_call(service.c_str(), loggingObjectPath,
1605                                 opLoggingInterface, "CreatePELWithFFDCFiles");
1606 
1607         // Set level to Warning (Predictive).
1608         auto level =
1609             sdbusplus::xyz::openbmc_project::Logging::server::convertForMessage(
1610                 sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level::
1611                     Warning);
1612 
1613         method.append(d.path, level, additionalData, ffdc);
1614         bus.call(method);
1615     }
1616     catch (const sdbusplus::exception_t& e)
1617     {
1618         lg2::error("Failed to create MISSING_OCC_SENSORS PEL: {ERROR}", "ERROR",
1619                    e.what());
1620     }
1621 }
1622 
1623 // Verify single master OCC and start presence monitor
validateOccMaster()1624 void Manager::validateOccMaster()
1625 {
1626     int masterInstance = -1;
1627     for (auto& obj : statusObjects)
1628     {
1629         auto instance = obj->getOccInstanceID();
1630 
1631         if (!obj->occActive())
1632         {
1633             if (utils::isHostRunning())
1634             {
1635                 // Check if sensor was queued while waiting for discovery
1636                 auto match = queuedActiveState.find(instance);
1637                 if (match != queuedActiveState.end())
1638                 {
1639                     queuedActiveState.erase(match);
1640                     lg2::info("validateOccMaster: OCC{INST} is ACTIVE (queued)",
1641                               "INST", instance);
1642                     obj->occActive(true);
1643                 }
1644                 else
1645                 {
1646                     // OCC does not appear to be active yet, check active sensor
1647                     pldmHandle->checkActiveSensor(instance);
1648                     if (obj->occActive())
1649                     {
1650                         lg2::info(
1651                             "validateOccMaster: OCC{INST} is ACTIVE after reading sensor",
1652                             "INST", instance);
1653                     }
1654                 }
1655             }
1656             else
1657             {
1658                 lg2::warning(
1659                     "validateOccMaster: HOST is not running (OCC{INST})",
1660                     "INST", instance);
1661                 return;
1662             }
1663         }
1664 
1665         if (obj->isMasterOcc())
1666         {
1667             obj->addPresenceWatchMaster();
1668 
1669             if (masterInstance == -1)
1670             {
1671                 masterInstance = instance;
1672             }
1673             else
1674             {
1675                 lg2::error(
1676                     "validateOccMaster: Multiple OCC masters! ({MAST1} and {MAST2})",
1677                     "MAST1", masterInstance, "MAST2", instance);
1678                 // request reset
1679                 obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
1680             }
1681         }
1682     }
1683 
1684     if (masterInstance < 0)
1685     {
1686         lg2::error("validateOccMaster: Master OCC not found! (of {NUM} OCCs)",
1687                    "NUM", statusObjects.size());
1688         // request reset
1689         statusObjects.front()->deviceError(
1690             Error::Descriptor(PRESENCE_ERROR_PATH));
1691     }
1692     else
1693     {
1694         lg2::info("validateOccMaster: OCC{INST} is master of {COUNT} OCCs",
1695                   "INST", masterInstance, "COUNT", activeCount);
1696 
1697         pmode->updateDbusSafeMode(false);
1698     }
1699 }
1700 
updatePcapBounds() const1701 void Manager::updatePcapBounds() const
1702 {
1703     if (pcap)
1704     {
1705         pcap->updatePcapBounds();
1706     }
1707 }
1708 
1709 // Clean up any variables since the OCC is no longer running.
1710 // Called when pldm receives an event indicating host is powered off.
hostPoweredOff()1711 void Manager::hostPoweredOff()
1712 {
1713     if (resetRequired)
1714     {
1715         lg2::info("hostPoweredOff: Clearing resetRequired for OCC{INST}",
1716                   "INST", resetInstance);
1717         resetRequired = false;
1718     }
1719     if (resetInProgress)
1720     {
1721         lg2::info("hostPoweredOff: Clearing resetInProgress for OCC{INST}",
1722                   "INST", resetInstance);
1723         resetInProgress = false;
1724     }
1725     resetInstance = 255;
1726 }
1727 
collectDumpData(sdeventplus::source::Signal &,const struct signalfd_siginfo *)1728 void Manager::collectDumpData(sdeventplus::source::Signal&,
1729                               const struct signalfd_siginfo*)
1730 {
1731     json data;
1732     lg2::info("collectDumpData()");
1733     data["objectCount"] = std::to_string(statusObjects.size()) + " OCC objects";
1734     if (statusObjects.size() > 0)
1735     {
1736         try
1737         {
1738             for (auto& occ : statusObjects)
1739             {
1740                 json occData;
1741                 auto instance = occ->getOccInstanceID();
1742                 std::string occName = "occ" + std::to_string(instance);
1743 
1744                 if (occ->occActive())
1745                 {
1746                     // OCC General Info
1747                     occData["occState"] = "ACTIVE";
1748                     occData["occRole"] =
1749                         occ->isMasterOcc() ? "MASTER" : "SECONDARY";
1750                     occData["occHwmonPath"] =
1751                         occ->getHwmonPath().generic_string();
1752 
1753                     // OCC Poll Response
1754                     std::vector<std::uint8_t> cmd = {0x00, 0x00, 0x01, 0x20};
1755                     std::vector<std::uint8_t> rsp;
1756                     std::vector<std::string> rspHex;
1757                     rsp = passThroughObjects[instance]->send(cmd);
1758                     if (rsp.size() > 5)
1759                     {
1760                         rsp.erase(rsp.begin(),
1761                                   rsp.begin() + 5); // Strip rsp header
1762                         rspHex = utils::hex_dump(rsp);
1763                         occData["pollResponse"] = rspHex;
1764                     }
1765 
1766                     // Debug Data: WOF Dynamic Data
1767                     cmd = {0x40, 0x00, 0x01, 0x01};
1768                     rsp = passThroughObjects[instance]->send(cmd);
1769                     if (rsp.size() > 5)
1770                     {
1771                         rsp.erase(rsp.begin(),
1772                                   rsp.begin() + 5); // Strip rsp header
1773                         rspHex = utils::hex_dump(rsp);
1774                         occData["wofDataDynamic"] = rspHex;
1775                     }
1776 
1777                     // Debug Data: WOF Dynamic Data
1778                     cmd = {0x40, 0x00, 0x01, 0x0A};
1779                     rsp = passThroughObjects[instance]->send(cmd);
1780                     if (rsp.size() > 5)
1781                     {
1782                         rsp.erase(rsp.begin(),
1783                                   rsp.begin() + 5); // Strip rsp header
1784                         rspHex = utils::hex_dump(rsp);
1785                         occData["wofDataStatic"] = rspHex;
1786                     }
1787                 }
1788                 else
1789                 {
1790                     occData["occState"] = "NOT ACTIVE";
1791                 }
1792 
1793                 data[occName] = occData;
1794             }
1795         }
1796         catch (const std::exception& e)
1797         {
1798             lg2::error("Failed to collect OCC dump data: {ERR}", "ERR",
1799                        e.what());
1800         }
1801     }
1802 
1803     std::ofstream file{Manager::dumpFile};
1804     if (!file)
1805     {
1806         lg2::error("Failed to open {FILE} for occ-control data", "FILE",
1807                    Manager::dumpFile);
1808         return;
1809     }
1810 
1811     file << std::setw(4) << data;
1812 }
1813 
1814 } // namespace occ
1815 } // namespace open_power
1816