1 #include "config.h"
2
3 #include "occ_manager.hpp"
4
5 #include "i2c_occ.hpp"
6 #include "occ_dbus.hpp"
7 #include "occ_errors.hpp"
8 #include "utils.hpp"
9
10 #include <phosphor-logging/elog-errors.hpp>
11 #include <phosphor-logging/log.hpp>
12 #include <xyz/openbmc_project/Common/error.hpp>
13
#include <cerrno>
#include <chrono>
#include <cmath>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <regex>
19
20 namespace open_power
21 {
22 namespace occ
23 {
24
// fru_type value for which readTempSensors() skips a DIMM temperature
// (not available to read).
constexpr uint32_t fruTypeNotAvailable = 0xFF;

// Suffixes appended to a sensor file prefix (the "..._label" path with
// "label" removed) to build the path of the related file.
constexpr const char* fruTypeSuffix = "fru_type";
constexpr const char* faultSuffix = "fault";
constexpr const char* inputSuffix = "input";
constexpr const char* maxSuffix = "max";

// While this file exists, findAndCreateObjects() postpones OCC discovery.
constexpr const char* HOST_ON_FILE = "/run/openbmc/host@0-on";
32
33 using namespace phosphor::logging;
34 using namespace std::literals::chrono_literals;
35
/**
 * @brief Read one value of type T from the start of a file.
 *
 * The value is extracted with operator>>, so leading whitespace is skipped
 * and extraction stops at the first character that is not part of the value.
 *
 * @tparam T - type to extract (for example uint32_t, double, std::string)
 * @param[in] path - file to read
 *
 * @return the extracted value
 *
 * @throws std::system_error (generic category) holding the errno seen when
 *         opening, reading or closing failed; EIO is used when the failure
 *         did not set errno (for example content that cannot be parsed as T).
 */
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    // eofbit is intentionally left out of the mask: reaching end-of-file
    // directly after the value (no trailing newline) is not an error. An
    // empty file still fails because extraction then sets failbit.
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
    T data;

    // Drop any stale errno so the code reported below belongs to this read.
    errno = 0;

    try
    {
        ifs.open(path);
        ifs >> data;
        ifs.close();
    }
    catch (const std::exception&)
    {
        // Never report "success" as the reason for a failure.
        const int err = (errno != 0) ? errno : EIO;
        throw std::system_error(err, std::generic_category());
    }

    return data;
}
58
59 // findAndCreateObjects():
60 // Takes care of getting the required objects created and
61 // finds the available devices/processors.
62 // (function is called everytime the discoverTimer expires)
63 // - create the PowerMode object to control OCC modes
64 // - create statusObjects for each OCC device found
65 // - waits for OCC Active sensors PDRs to become available
66 // - restart discoverTimer if all data is not available yet
findAndCreateObjects()67 void Manager::findAndCreateObjects()
68 {
69 #ifndef POWER10
70 for (auto id = 0; id < MAX_CPUS; ++id)
71 {
72 // Create one occ per cpu
73 auto occ = std::string(OCC_NAME) + std::to_string(id);
74 createObjects(occ);
75 }
76 #else
77 if (!pmode)
78 {
79 // Create the power mode object
80 pmode = std::make_unique<powermode::PowerMode>(
81 *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
82 }
83
84 if (!fs::exists(HOST_ON_FILE))
85 {
86 static bool statusObjCreated = false;
87 if (!statusObjCreated)
88 {
89 // Create the OCCs based on on the /dev/occX devices
90 auto occs = findOCCsInDev();
91
92 if (occs.empty() || (prevOCCSearch.size() != occs.size()))
93 {
94 // Something changed or no OCCs yet, try again in 10s.
95 // Note on the first pass prevOCCSearch will be empty,
96 // so there will be at least one delay to give things
97 // a chance to settle.
98 prevOCCSearch = occs;
99
100 log<level::INFO>(
101 std::format(
102 "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})",
103 occs.size())
104 .c_str());
105
106 discoverTimer->restartOnce(10s);
107 }
108 else
109 {
110 // All OCCs appear to be available, create status objects
111
112 // createObjects requires OCC0 first.
113 std::sort(occs.begin(), occs.end());
114
115 log<level::INFO>(
116 std::format(
117 "Manager::findAndCreateObjects(): Creating {} OCC Status Objects",
118 occs.size())
119 .c_str());
120 for (auto id : occs)
121 {
122 createObjects(std::string(OCC_NAME) + std::to_string(id));
123 }
124 statusObjCreated = true;
125 waitingForAllOccActiveSensors = true;
126
127 // Find/update the processor path associated with each OCC
128 for (auto& obj : statusObjects)
129 {
130 obj->updateProcAssociation();
131 }
132 }
133 }
134
135 if (statusObjCreated && waitingForAllOccActiveSensors)
136 {
137 static bool tracedHostWait = false;
138 if (utils::isHostRunning())
139 {
140 if (tracedHostWait)
141 {
142 log<level::INFO>(
143 "Manager::findAndCreateObjects(): Host is running");
144 tracedHostWait = false;
145 }
146 checkAllActiveSensors();
147 }
148 else
149 {
150 if (!tracedHostWait)
151 {
152 log<level::INFO>(
153 "Manager::findAndCreateObjects(): Waiting for host to start");
154 tracedHostWait = true;
155 }
156 discoverTimer->restartOnce(30s);
157 #ifdef PLDM
158 if (throttlePldmTraceTimer->isEnabled())
159 {
160 // Host is no longer running, disable throttle timer and
161 // make sure traces are not throttled
162 log<level::INFO>(
163 "findAndCreateObjects(): disabling sensor timer");
164 throttlePldmTraceTimer->setEnabled(false);
165 pldmHandle->setTraceThrottle(false);
166 }
167 #endif
168 }
169 }
170 }
171 else
172 {
173 log<level::INFO>(
174 std::format(
175 "Manager::findAndCreateObjects(): Waiting for {} to complete...",
176 HOST_ON_FILE)
177 .c_str());
178 discoverTimer->restartOnce(10s);
179 }
180 #endif
181 }
182
183 #ifdef POWER10
184 // Check if all occActive sensors are available
checkAllActiveSensors()185 void Manager::checkAllActiveSensors()
186 {
187 static bool allActiveSensorAvailable = false;
188 static bool tracedSensorWait = false;
189 static bool waitingForHost = false;
190
191 if (open_power::occ::utils::isHostRunning())
192 {
193 if (waitingForHost)
194 {
195 waitingForHost = false;
196 log<level::INFO>("checkAllActiveSensors(): Host is now running");
197 }
198
199 // Start with the assumption that all are available
200 allActiveSensorAvailable = true;
201 for (auto& obj : statusObjects)
202 {
203 if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
204 {
205 auto instance = obj->getOccInstanceID();
206 // Check if sensor was queued while waiting for discovery
207 auto match = queuedActiveState.find(instance);
208 if (match != queuedActiveState.end())
209 {
210 queuedActiveState.erase(match);
211 log<level::INFO>(
212 std::format(
213 "checkAllActiveSensors(): OCC{} is ACTIVE (queued)",
214 instance)
215 .c_str());
216 obj->occActive(true);
217 }
218 else
219 {
220 allActiveSensorAvailable = false;
221 if (!tracedSensorWait)
222 {
223 log<level::INFO>(
224 std::format(
225 "checkAllActiveSensors(): Waiting on OCC{} Active sensor",
226 instance)
227 .c_str());
228 tracedSensorWait = true;
229 #ifdef PLDM
230 // Make sure PLDM traces are not throttled
231 pldmHandle->setTraceThrottle(false);
232 // Start timer to throttle PLDM traces when timer
233 // expires
234 onPldmTimeoutCreatePel = false;
235 throttlePldmTraceTimer->restartOnce(5min);
236 #endif
237 }
238 #ifdef PLDM
239 // Ignore active sensor check if the OCCs are being reset
240 if (!resetInProgress)
241 {
242 pldmHandle->checkActiveSensor(obj->getOccInstanceID());
243 }
244 #endif
245 break;
246 }
247 }
248 }
249 }
250 else
251 {
252 if (!waitingForHost)
253 {
254 waitingForHost = true;
255 log<level::INFO>(
256 "checkAllActiveSensors(): Waiting for host to start");
257 #ifdef PLDM
258 if (throttlePldmTraceTimer->isEnabled())
259 {
260 // Host is no longer running, disable throttle timer and
261 // make sure traces are not throttled
262 log<level::INFO>(
263 "checkAllActiveSensors(): disabling sensor timer");
264 throttlePldmTraceTimer->setEnabled(false);
265 pldmHandle->setTraceThrottle(false);
266 }
267 #endif
268 }
269 }
270
271 if (allActiveSensorAvailable)
272 {
273 // All sensors were found, disable the discovery timer
274 if (discoverTimer->isEnabled())
275 {
276 discoverTimer->setEnabled(false);
277 }
278 #ifdef PLDM
279 if (throttlePldmTraceTimer->isEnabled())
280 {
281 // Disable throttle timer and make sure traces are not throttled
282 throttlePldmTraceTimer->setEnabled(false);
283 pldmHandle->setTraceThrottle(false);
284 }
285 #endif
286 if (waitingForAllOccActiveSensors)
287 {
288 log<level::INFO>(
289 "checkAllActiveSensors(): OCC Active sensors are available");
290 waitingForAllOccActiveSensors = false;
291
292 if (resetRequired)
293 {
294 initiateOccRequest(resetInstance);
295
296 if (!waitForAllOccsTimer->isEnabled())
297 {
298 log<level::WARNING>(
299 "occsNotAllRunning: Restarting waitForAllOccTimer");
300 // restart occ wait timer to check status after reset
301 // completes
302 waitForAllOccsTimer->restartOnce(60s);
303 }
304 }
305 }
306 queuedActiveState.clear();
307 tracedSensorWait = false;
308 }
309 else
310 {
311 // Not all sensors were available, so keep waiting
312 if (!tracedSensorWait)
313 {
314 log<level::INFO>(
315 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
316 tracedSensorWait = true;
317 }
318 discoverTimer->restartOnce(10s);
319 }
320 }
321 #endif
322
findOCCsInDev()323 std::vector<int> Manager::findOCCsInDev()
324 {
325 std::vector<int> occs;
326 std::regex expr{R"(occ(\d+)$)"};
327
328 for (auto& file : fs::directory_iterator("/dev"))
329 {
330 std::smatch match;
331 std::string path{file.path().string()};
332 if (std::regex_search(path, match, expr))
333 {
334 auto num = std::stoi(match[1].str());
335
336 // /dev numbering starts at 1, ours starts at 0.
337 occs.push_back(num - 1);
338 }
339 }
340
341 return occs;
342 }
343
cpuCreated(sdbusplus::message_t & msg)344 int Manager::cpuCreated(sdbusplus::message_t& msg)
345 {
346 namespace fs = std::filesystem;
347
348 sdbusplus::message::object_path o;
349 msg.read(o);
350 fs::path cpuPath(std::string(std::move(o)));
351
352 auto name = cpuPath.filename().string();
353 auto index = name.find(CPU_NAME);
354 name.replace(index, std::strlen(CPU_NAME), OCC_NAME);
355
356 createObjects(name);
357
358 return 0;
359 }
360
createObjects(const std::string & occ)361 void Manager::createObjects(const std::string& occ)
362 {
363 auto path = fs::path(OCC_CONTROL_ROOT) / occ;
364
365 statusObjects.emplace_back(std::make_unique<Status>(
366 event, path.c_str(), *this,
367 #ifdef POWER10
368 pmode,
369 #endif
370 std::bind(std::mem_fn(&Manager::statusCallBack), this,
371 std::placeholders::_1, std::placeholders::_2)
372 #ifdef PLDM
373 ,
374 // Callback will set flag indicating reset needs to be done
375 // instead of immediately issuing a reset via PLDM.
376 std::bind(std::mem_fn(&Manager::resetOccRequest), this,
377 std::placeholders::_1)
378 #endif
379 ));
380
381 // Create the power cap monitor object
382 if (!pcap)
383 {
384 pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
385 *statusObjects.back());
386 }
387
388 if (statusObjects.back()->isMasterOcc())
389 {
390 log<level::INFO>(
391 std::format("Manager::createObjects(): OCC{} is the master",
392 statusObjects.back()->getOccInstanceID())
393 .c_str());
394 _pollTimer->setEnabled(false);
395
396 #ifdef POWER10
397 // Set the master OCC on the PowerMode object
398 pmode->setMasterOcc(path);
399 #endif
400 }
401
402 passThroughObjects.emplace_back(std::make_unique<PassThrough>(
403 path.c_str()
404 #ifdef POWER10
405 ,
406 pmode
407 #endif
408 ));
409 }
410
411 // If a reset is not already outstanding, set a flag to indicate that a reset is
412 // needed.
resetOccRequest(instanceID instance)413 void Manager::resetOccRequest(instanceID instance)
414 {
415 if (!resetRequired)
416 {
417 resetRequired = true;
418 resetInstance = instance;
419 log<level::ERR>(
420 std::format(
421 "resetOccRequest: PM Complex reset was requested due to OCC{}",
422 instance)
423 .c_str());
424 }
425 else if (instance != resetInstance)
426 {
427 log<level::WARNING>(
428 std::format(
429 "resetOccRequest: Ignoring PM Complex reset request for OCC{}, because reset already outstanding for OCC{}",
430 instance, resetInstance)
431 .c_str());
432 }
433 }
434
435 // If a reset has not been started, initiate an OCC reset via PLDM
initiateOccRequest(instanceID instance)436 void Manager::initiateOccRequest(instanceID instance)
437 {
438 if (!resetInProgress)
439 {
440 resetInProgress = true;
441 resetInstance = instance;
442 log<level::ERR>(
443 std::format(
444 "initiateOccRequest: Initiating PM Complex reset due to OCC{}",
445 instance)
446 .c_str());
447 #ifdef PLDM
448 pldmHandle->resetOCC(instance);
449 #endif
450 resetRequired = false;
451 }
452 else
453 {
454 log<level::WARNING>(
455 std::format(
456 "initiateOccRequest: Ignoring PM Complex reset request for OCC{}, because reset already in process for OCC{}",
457 instance, resetInstance)
458 .c_str());
459 }
460 }
461
statusCallBack(instanceID instance,bool status)462 void Manager::statusCallBack(instanceID instance, bool status)
463 {
464 if (status == true)
465 {
466 if (resetInProgress)
467 {
468 log<level::INFO>(
469 std::format(
470 "statusCallBack: Ignoring OCC{} activate because a reset has been initiated due to OCC{}",
471 instance, resetInstance)
472 .c_str());
473 return;
474 }
475
476 // OCC went active
477 ++activeCount;
478
479 #ifdef POWER10
480 if (activeCount == 1)
481 {
482 // First OCC went active (allow some time for all OCCs to go active)
483 waitForAllOccsTimer->restartOnce(60s);
484 }
485 #endif
486
487 if (activeCount == statusObjects.size())
488 {
489 #ifdef POWER10
490 // All OCCs are now running
491 if (waitForAllOccsTimer->isEnabled())
492 {
493 // stop occ wait timer
494 waitForAllOccsTimer->setEnabled(false);
495 }
496
497 // All OCCs have been found, check if we need a reset
498 if (resetRequired)
499 {
500 initiateOccRequest(resetInstance);
501
502 if (!waitForAllOccsTimer->isEnabled())
503 {
504 log<level::WARNING>(
505 "occsNotAllRunning: Restarting waitForAllOccTimer");
506 // restart occ wait timer
507 waitForAllOccsTimer->restartOnce(60s);
508 }
509 }
510 else
511 {
512 // Verify master OCC and start presence monitor
513 validateOccMaster();
514 }
515 #else
516 // Verify master OCC and start presence monitor
517 validateOccMaster();
518 #endif
519 }
520
521 // Start poll timer if not already started
522 if (!_pollTimer->isEnabled())
523 {
524 log<level::INFO>(
525 std::format("Manager: OCCs will be polled every {} seconds",
526 pollInterval)
527 .c_str());
528
529 // Send poll and start OCC poll timer
530 pollerTimerExpired();
531 }
532 }
533 else
534 {
535 // OCC went away
536 if (activeCount > 0)
537 {
538 --activeCount;
539 }
540 else
541 {
542 log<level::INFO>(
543 std::format("OCC{} disabled, but currently no active OCCs",
544 instance)
545 .c_str());
546 }
547
548 if (activeCount == 0)
549 {
550 // No OCCs are running
551
552 if (resetInProgress)
553 {
554 // All OCC active sensors are clear (reset should be in
555 // progress)
556 log<level::INFO>(
557 std::format(
558 "statusCallBack: Clearing resetInProgress (activeCount={}, OCC{}, status={})",
559 activeCount, instance, status)
560 .c_str());
561 resetInProgress = false;
562 resetInstance = 255;
563 }
564
565 // Stop OCC poll timer
566 if (_pollTimer->isEnabled())
567 {
568 log<level::INFO>(
569 "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
570 _pollTimer->setEnabled(false);
571 }
572
573 #ifdef POWER10
574 // stop wait timer
575 if (waitForAllOccsTimer->isEnabled())
576 {
577 waitForAllOccsTimer->setEnabled(false);
578 }
579 #endif
580 }
581 else if (resetInProgress)
582 {
583 log<level::INFO>(
584 std::format(
585 "statusCallBack: Skipping clear of resetInProgress (activeCount={}, OCC{}, status={})",
586 activeCount, instance, status)
587 .c_str());
588 }
589 #ifdef READ_OCC_SENSORS
590 // Clear OCC sensors
591 setSensorValueToNaN(instance);
592 #endif
593 }
594
595 #ifdef POWER10
596 if (waitingForAllOccActiveSensors)
597 {
598 if (utils::isHostRunning())
599 {
600 checkAllActiveSensors();
601 }
602 }
603 #endif
604 }
605
606 #ifdef I2C_OCC
initStatusObjects()607 void Manager::initStatusObjects()
608 {
609 // Make sure we have a valid path string
610 static_assert(sizeof(DEV_PATH) != 0);
611
612 auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
613 for (auto& name : deviceNames)
614 {
615 i2c_occ::i2cToDbus(name);
616 name = std::string(OCC_NAME) + '_' + name;
617 auto path = fs::path(OCC_CONTROL_ROOT) / name;
618 statusObjects.emplace_back(
619 std::make_unique<Status>(event, path.c_str(), *this));
620 }
621 // The first device is master occ
622 pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
623 *statusObjects.front());
624 #ifdef POWER10
625 pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
626 powermode::PIPS_PATH);
627 // Set the master OCC on the PowerMode object
628 pmode->setMasterOcc(path);
629 #endif
630 }
631 #endif
632
633 #ifdef PLDM
sbeTimeout(unsigned int instance)634 void Manager::sbeTimeout(unsigned int instance)
635 {
636 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
637 [instance](const auto& obj) {
638 return instance == obj->getOccInstanceID();
639 });
640
641 if (obj != statusObjects.end() && (*obj)->occActive())
642 {
643 log<level::INFO>(
644 std::format("SBE timeout, requesting HRESET (OCC{})", instance)
645 .c_str());
646
647 setSBEState(instance, SBE_STATE_NOT_USABLE);
648
649 pldmHandle->sendHRESET(instance);
650 }
651 }
652
updateOCCActive(instanceID instance,bool status)653 bool Manager::updateOCCActive(instanceID instance, bool status)
654 {
655 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
656 [instance](const auto& obj) {
657 return instance == obj->getOccInstanceID();
658 });
659
660 const bool hostRunning = open_power::occ::utils::isHostRunning();
661 if (obj != statusObjects.end())
662 {
663 if (!hostRunning && (status == true))
664 {
665 log<level::WARNING>(
666 std::format(
667 "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received",
668 instance, status)
669 .c_str());
670 (*obj)->setPldmSensorReceived(false);
671 if (!waitingForAllOccActiveSensors)
672 {
673 log<level::INFO>(
674 "updateOCCActive: Waiting for Host and all OCC Active Sensors");
675 waitingForAllOccActiveSensors = true;
676 }
677 #ifdef POWER10
678 discoverTimer->restartOnce(30s);
679 #endif
680 return false;
681 }
682 else
683 {
684 (*obj)->setPldmSensorReceived(true);
685 return (*obj)->occActive(status);
686 }
687 }
688 else
689 {
690 if (hostRunning)
691 {
692 log<level::WARNING>(
693 std::format(
694 "updateOCCActive: No status object to update for OCC{} (active={})",
695 instance, status)
696 .c_str());
697 }
698 else
699 {
700 if (status == true)
701 {
702 log<level::WARNING>(
703 std::format(
704 "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})",
705 instance, status)
706 .c_str());
707 }
708 }
709 if (status == true)
710 {
711 // OCC went active
712 queuedActiveState.insert(instance);
713 }
714 else
715 {
716 auto match = queuedActiveState.find(instance);
717 if (match != queuedActiveState.end())
718 {
719 // OCC was disabled
720 queuedActiveState.erase(match);
721 }
722 }
723 return false;
724 }
725 }
726
727 // Called upon pldm event To set powermode Safe Mode State for system.
updateOccSafeMode(bool safeMode)728 void Manager::updateOccSafeMode(bool safeMode)
729 {
730 #ifdef POWER10
731 pmode->updateDbusSafeMode(safeMode);
732 #endif
733 // Update the processor throttle status on dbus
734 for (auto& obj : statusObjects)
735 {
736 obj->updateThrottle(safeMode, THROTTLED_SAFE);
737 }
738 }
739
sbeHRESETResult(instanceID instance,bool success)740 void Manager::sbeHRESETResult(instanceID instance, bool success)
741 {
742 if (success)
743 {
744 log<level::INFO>(
745 std::format("HRESET succeeded (OCC{})", instance).c_str());
746
747 setSBEState(instance, SBE_STATE_BOOTED);
748
749 return;
750 }
751
752 setSBEState(instance, SBE_STATE_FAILED);
753
754 if (sbeCanDump(instance))
755 {
756 log<level::INFO>(
757 std::format("HRESET failed (OCC{}), triggering SBE dump", instance)
758 .c_str());
759
760 auto& bus = utils::getBus();
761 uint32_t src6 = instance << 16;
762 uint32_t logId =
763 FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
764 src6, "SBE command timeout");
765
766 try
767 {
768 constexpr auto interface = "xyz.openbmc_project.Dump.Create";
769 constexpr auto function = "CreateDump";
770
771 std::string service =
772 utils::getService(OP_DUMP_OBJ_PATH, interface);
773 auto method = bus.new_method_call(service.c_str(), OP_DUMP_OBJ_PATH,
774 interface, function);
775
776 std::map<std::string, std::variant<std::string, uint64_t>>
777 createParams{
778 {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
779 uint64_t(logId)},
780 {"com.ibm.Dump.Create.CreateParameters.DumpType",
781 "com.ibm.Dump.Create.DumpType.SBE"},
782 {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
783 uint64_t(instance)},
784 };
785
786 method.append(createParams);
787
788 auto response = bus.call(method);
789 }
790 catch (const sdbusplus::exception_t& e)
791 {
792 constexpr auto ERROR_DUMP_DISABLED =
793 "xyz.openbmc_project.Dump.Create.Error.Disabled";
794 if (e.name() == ERROR_DUMP_DISABLED)
795 {
796 log<level::INFO>("Dump is disabled, skipping");
797 }
798 else
799 {
800 log<level::ERR>("Dump failed");
801 }
802 }
803 }
804
805 // SBE Reset failed, try PM Complex reset
806 log<level::ERR>("sbeHRESETResult: Forcing PM Complex reset");
807 resetOccRequest(instance);
808 }
809
sbeCanDump(unsigned int instance)810 bool Manager::sbeCanDump(unsigned int instance)
811 {
812 struct pdbg_target* proc = getPdbgTarget(instance);
813
814 if (!proc)
815 {
816 // allow the dump in the error case
817 return true;
818 }
819
820 try
821 {
822 if (!openpower::phal::sbe::isDumpAllowed(proc))
823 {
824 return false;
825 }
826
827 if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
828 {
829 return false;
830 }
831 }
832 catch (openpower::phal::exception::SbeError& e)
833 {
834 log<level::INFO>("Failed to query SBE state");
835 }
836
837 // allow the dump in the error case
838 return true;
839 }
840
setSBEState(unsigned int instance,enum sbe_state state)841 void Manager::setSBEState(unsigned int instance, enum sbe_state state)
842 {
843 struct pdbg_target* proc = getPdbgTarget(instance);
844
845 if (!proc)
846 {
847 return;
848 }
849
850 try
851 {
852 openpower::phal::sbe::setState(proc, state);
853 }
854 catch (const openpower::phal::exception::SbeError& e)
855 {
856 log<level::ERR>(
857 std::format("Failed to set SBE state: {}", e.what()).c_str());
858 }
859 }
860
getPdbgTarget(unsigned int instance)861 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
862 {
863 if (!pdbgInitialized)
864 {
865 try
866 {
867 openpower::phal::pdbg::init();
868 pdbgInitialized = true;
869 }
870 catch (const openpower::phal::exception::PdbgError& e)
871 {
872 log<level::ERR>("pdbg initialization failed");
873 return nullptr;
874 }
875 }
876
877 struct pdbg_target* proc = nullptr;
878 pdbg_for_each_class_target("proc", proc)
879 {
880 if (pdbg_target_index(proc) == instance)
881 {
882 return proc;
883 }
884 }
885
886 log<level::ERR>("Failed to get pdbg target");
887 return nullptr;
888 }
889 #endif
890
pollerTimerExpired()891 void Manager::pollerTimerExpired()
892 {
893 if (!_pollTimer)
894 {
895 log<level::ERR>("pollerTimerExpired() ERROR: Timer not defined");
896 return;
897 }
898
899 #ifdef POWER10
900 if (resetRequired)
901 {
902 log<level::ERR>("pollerTimerExpired() - Initiating PM Complex reset");
903 initiateOccRequest(resetInstance);
904
905 if (!waitForAllOccsTimer->isEnabled())
906 {
907 log<level::WARNING>(
908 "pollerTimerExpired: Restarting waitForAllOccTimer");
909 // restart occ wait timer
910 waitForAllOccsTimer->restartOnce(60s);
911 }
912 return;
913 }
914 #endif
915
916 for (auto& obj : statusObjects)
917 {
918 if (!obj->occActive())
919 {
920 // OCC is not running yet
921 #ifdef READ_OCC_SENSORS
922 auto id = obj->getOccInstanceID();
923 setSensorValueToNaN(id);
924 #endif
925 continue;
926 }
927
928 // Read sysfs to force kernel to poll OCC
929 obj->readOccState();
930
931 #ifdef READ_OCC_SENSORS
932 // Read occ sensor values
933 getSensorValues(obj);
934 #endif
935 }
936
937 if (activeCount > 0)
938 {
939 // Restart OCC poll timer
940 _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
941 }
942 else
943 {
944 // No OCCs running, so poll timer will not be restarted
945 log<level::INFO>(
946 std::format(
947 "Manager::pollerTimerExpired: poll timer will not be restarted")
948 .c_str());
949 }
950 }
951
952 #ifdef READ_OCC_SENSORS
readTempSensors(const fs::path & path,uint32_t occInstance)953 void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
954 {
955 // There may be more than one sensor with the same FRU type
956 // and label so make two passes: the first to read the temps
957 // from sysfs, and the second to put them on D-Bus after
958 // resolving any conflicts.
959 std::map<std::string, double> sensorData;
960
961 std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
962 for (auto& file : fs::directory_iterator(path))
963 {
964 if (!std::regex_search(file.path().string(), expr))
965 {
966 continue;
967 }
968
969 uint32_t labelValue{0};
970
971 try
972 {
973 labelValue = readFile<uint32_t>(file.path());
974 }
975 catch (const std::system_error& e)
976 {
977 log<level::DEBUG>(
978 std::format("readTempSensors: Failed reading {}, errno = {}",
979 file.path().string(), e.code().value())
980 .c_str());
981 continue;
982 }
983
984 const std::string& tempLabel = "label";
985 const std::string filePathString = file.path().string().substr(
986 0, file.path().string().length() - tempLabel.length());
987
988 uint32_t fruTypeValue{0};
989 try
990 {
991 fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
992 }
993 catch (const std::system_error& e)
994 {
995 log<level::DEBUG>(
996 std::format("readTempSensors: Failed reading {}, errno = {}",
997 filePathString + fruTypeSuffix, e.code().value())
998 .c_str());
999 continue;
1000 }
1001
1002 std::string sensorPath =
1003 OCC_SENSORS_ROOT + std::string("/temperature/");
1004
1005 std::string dvfsTempPath;
1006
1007 if (fruTypeValue == VRMVdd)
1008 {
1009 sensorPath.append(
1010 "vrm_vdd" + std::to_string(occInstance) + "_temp");
1011 }
1012 else if (fruTypeValue == processorIoRing)
1013 {
1014 sensorPath.append(
1015 "proc" + std::to_string(occInstance) + "_ioring_temp");
1016 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
1017 std::to_string(occInstance) + "_ioring_dvfs_temp";
1018 }
1019 else
1020 {
1021 uint16_t type = (labelValue & 0xFF000000) >> 24;
1022 uint16_t instanceID = labelValue & 0x0000FFFF;
1023
1024 if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
1025 {
1026 if (fruTypeValue == fruTypeNotAvailable)
1027 {
1028 // Not all DIMM related temps are available to read
1029 // (no _input file in this case)
1030 continue;
1031 }
1032 auto iter = dimmTempSensorName.find(fruTypeValue);
1033 if (iter == dimmTempSensorName.end())
1034 {
1035 log<level::ERR>(
1036 std::format(
1037 "readTempSensors: Fru type error! fruTypeValue = {}) ",
1038 fruTypeValue)
1039 .c_str());
1040 continue;
1041 }
1042
1043 sensorPath.append(
1044 "dimm" + std::to_string(instanceID) + iter->second);
1045
1046 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
1047 dimmDVFSSensorName.at(fruTypeValue);
1048 }
1049 else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
1050 {
1051 if (fruTypeValue == processorCore)
1052 {
1053 // The OCC reports small core temps, of which there are
1054 // two per big core. All current P10 systems are in big
1055 // core mode, so use a big core name.
1056 uint16_t coreNum = instanceID / 2;
1057 uint16_t tempNum = instanceID % 2;
1058 sensorPath.append("proc" + std::to_string(occInstance) +
1059 "_core" + std::to_string(coreNum) + "_" +
1060 std::to_string(tempNum) + "_temp");
1061
1062 dvfsTempPath =
1063 std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
1064 std::to_string(occInstance) + "_core_dvfs_temp";
1065 }
1066 else
1067 {
1068 continue;
1069 }
1070 }
1071 else
1072 {
1073 continue;
1074 }
1075 }
1076
1077 // The dvfs temp file only needs to be read once per chip per type.
1078 if (!dvfsTempPath.empty() &&
1079 !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
1080 {
1081 try
1082 {
1083 auto dvfsValue = readFile<double>(filePathString + maxSuffix);
1084
1085 dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
1086 dvfsTempPath, dvfsValue * std::pow(10, -3));
1087 }
1088 catch (const std::system_error& e)
1089 {
1090 log<level::DEBUG>(
1091 std::format(
1092 "readTempSensors: Failed reading {}, errno = {}",
1093 filePathString + maxSuffix, e.code().value())
1094 .c_str());
1095 }
1096 }
1097
1098 uint32_t faultValue{0};
1099 try
1100 {
1101 faultValue = readFile<uint32_t>(filePathString + faultSuffix);
1102 }
1103 catch (const std::system_error& e)
1104 {
1105 log<level::DEBUG>(
1106 std::format("readTempSensors: Failed reading {}, errno = {}",
1107 filePathString + faultSuffix, e.code().value())
1108 .c_str());
1109 continue;
1110 }
1111
1112 double tempValue{0};
1113 // NOTE: if OCC sends back 0xFF, kernal sets this fault value to 1.
1114 if (faultValue != 0)
1115 {
1116 tempValue = std::numeric_limits<double>::quiet_NaN();
1117 }
1118 else
1119 {
1120 // Read the temperature
1121 try
1122 {
1123 tempValue = readFile<double>(filePathString + inputSuffix);
1124 }
1125 catch (const std::system_error& e)
1126 {
1127 log<level::DEBUG>(
1128 std::format(
1129 "readTempSensors: Failed reading {}, errno = {}",
1130 filePathString + inputSuffix, e.code().value())
1131 .c_str());
1132
1133 // if errno == EAGAIN(Resource temporarily unavailable) then set
1134 // temp to 0, to avoid using old temp, and affecting FAN
1135 // Control.
1136 if (e.code().value() == EAGAIN)
1137 {
1138 tempValue = 0;
1139 }
1140 // else the errno would be something like
1141 // EBADF(Bad file descriptor)
1142 // or ENOENT(No such file or directory)
1143 else
1144 {
1145 continue;
1146 }
1147 }
1148 }
1149
1150 // If this object path already has a value, only overwite
1151 // it if the previous one was an NaN or a smaller value.
1152 auto existing = sensorData.find(sensorPath);
1153 if (existing != sensorData.end())
1154 {
1155 // Multiple sensors found for this FRU type
1156 if ((std::isnan(existing->second) && (tempValue == 0)) ||
1157 ((existing->second == 0) && std::isnan(tempValue)))
1158 {
1159 // One of the redundant sensors has failed (0xFF/nan), and the
1160 // other sensor has no reading (0), so set the FRU to NaN to
1161 // force fan increase
1162 tempValue = std::numeric_limits<double>::quiet_NaN();
1163 existing->second = tempValue;
1164 }
1165 if (std::isnan(existing->second) || (tempValue > existing->second))
1166 {
1167 existing->second = tempValue;
1168 }
1169 }
1170 else
1171 {
1172 // First sensor for this FRU type
1173 sensorData[sensorPath] = tempValue;
1174 }
1175 }
1176
1177 // Now publish the values on D-Bus.
1178 for (const auto& [objectPath, value] : sensorData)
1179 {
1180 dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
1181 value * std::pow(10, -3));
1182
1183 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1184 objectPath, !std::isnan(value));
1185
1186 if (existingSensors.find(objectPath) == existingSensors.end())
1187 {
1188 dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1189 objectPath);
1190 }
1191
1192 existingSensors[objectPath] = occInstance;
1193 }
1194 }
1195
1196 std::optional<std::string>
getPowerLabelFunctionID(const std::string & value)1197 Manager::getPowerLabelFunctionID(const std::string& value)
1198 {
1199 // If the value is "system", then the FunctionID is "system".
1200 if (value == "system")
1201 {
1202 return value;
1203 }
1204
1205 // If the value is not "system", then the label value have 3 numbers, of
1206 // which we only care about the middle one:
1207 // <sensor id>_<function id>_<apss channel>
1208 // eg: The value is "0_10_5" , then the FunctionID is "10".
1209 if (value.find("_") == std::string::npos)
1210 {
1211 return std::nullopt;
1212 }
1213
1214 auto powerLabelValue = value.substr((value.find("_") + 1));
1215
1216 if (powerLabelValue.find("_") == std::string::npos)
1217 {
1218 return std::nullopt;
1219 }
1220
1221 return powerLabelValue.substr(0, powerLabelValue.find("_"));
1222 }
1223
// readPowerSensors():
// Walks the given directory looking for power<N>_label files and, for every
// label whose function ID has an entry in powerSensorName, reads the matching
// power<N>_input file and publishes the reading on D-Bus.
// - path: directory that is scanned for the label/input files
// - id:   OCC instance that gets recorded as owner of each published sensor
// Files that cannot be read are traced at debug level and skipped.
void Manager::readPowerSensors(const fs::path& path, uint32_t id)
{
    const std::regex labelFileExpr{"power\\d+_label$"}; // Example: power5_label
    const std::string labelSuffix{"label"};

    for (const auto& entry : fs::directory_iterator(path))
    {
        const std::string labelFile = entry.path().string();
        if (!std::regex_search(labelFile, labelFileExpr))
        {
            continue;
        }

        std::string labelValue;
        try
        {
            labelValue = readFile<std::string>(labelFile);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            labelFile, e.code().value())
                    .c_str());
            continue;
        }

        const auto functionID = getPowerLabelFunctionID(labelValue);
        if (!functionID.has_value())
        {
            continue;
        }

        // Only function IDs that have a known sensor name are published
        const auto nameEntry = powerSensorName.find(*functionID);
        if (nameEntry == powerSensorName.end())
        {
            continue;
        }

        std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
        sensorPath += nameEntry->second;

        // Drop the trailing "label" so that other suffixes can be appended
        // (eg: ".../power5_label" becomes ".../power5_")
        const std::string filePrefix =
            labelFile.substr(0, labelFile.length() - labelSuffix.length());
        const std::string inputFile = filePrefix + inputSuffix;

        double rawValue{0};
        try
        {
            rawValue = readFile<double>(inputFile);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            inputFile, e.code().value())
                    .c_str());
            continue;
        }

        // The raw reading is scaled by 10^-6 before it is published as Watts
        // (presumably the input file is in microwatts - TODO confirm)
        const double scaledValue =
            rawValue * std::pow(10, -3) * std::pow(10, -3);

        dbus::OccDBusSensors::getOccDBus().setUnit(
            sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");
        dbus::OccDBusSensors::getOccDBus().setValue(sensorPath, scaledValue);
        dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
                                                                true);

        // The chassis association is only set the first time a sensor is seen
        const bool firstTimeSeen =
            (existingSensors.find(sensorPath) == existingSensors.end());
        if (firstTimeSeen)
        {
            dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
                sensorPath);
        }

        existingSensors[sensorPath] = id;
    }
}
1301
setSensorValueToNaN(uint32_t id) const1302 void Manager::setSensorValueToNaN(uint32_t id) const
1303 {
1304 for (const auto& [sensorPath, occId] : existingSensors)
1305 {
1306 if (occId == id)
1307 {
1308 dbus::OccDBusSensors::getOccDBus().setValue(
1309 sensorPath, std::numeric_limits<double>::quiet_NaN());
1310
1311 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1312 sensorPath, true);
1313 }
1314 }
1315 return;
1316 }
1317
setSensorValueToNonFunctional(uint32_t id) const1318 void Manager::setSensorValueToNonFunctional(uint32_t id) const
1319 {
1320 for (const auto& [sensorPath, occId] : existingSensors)
1321 {
1322 if (occId == id)
1323 {
1324 dbus::OccDBusSensors::getOccDBus().setValue(
1325 sensorPath, std::numeric_limits<double>::quiet_NaN());
1326
1327 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1328 sensorPath, false);
1329 }
1330 }
1331 return;
1332 }
1333
// getSensorValues():
// Reads the sensors of the given OCC from its hwmon path.
// - temperature sensors are read for every OCC
// - power sensors are only read when the OCC is the master
// If the hwmon path does not exist an error is traced once; the trace is
// re-armed as soon as the path has been seen again.
void Manager::getSensorValues(std::unique_ptr<Status>& occ)
{
    // One "missing path already traced" flag per OCC instance.
    // Instance IDs that do not fit in the table share a single overflow flag
    // so that an unexpected ID can never index outside of the array.
    constexpr std::size_t maxTracedOccs = 8;
    static bool tracedError[maxTracedOccs] = {false};
    static bool tracedErrorOutOfRange = false;

    const fs::path sensorPath = occ->getHwmonPath();
    const uint32_t id = occ->getOccInstanceID();

    bool& alreadyTraced =
        (id < maxTracedOccs) ? tracedError[id] : tracedErrorOutOfRange;

    if (fs::exists(sensorPath))
    {
        // Read temperature sensors
        readTempSensors(sensorPath, id);

        if (occ->isMasterOcc())
        {
            // Read power sensors
            readPowerSensors(sensorPath, id);
        }
        alreadyTraced = false;
    }
    else if (!alreadyTraced)
    {
        log<level::ERR>(
            std::format(
                "Manager::getSensorValues: OCC{} sensor path missing: {}", id,
                sensorPath.c_str())
                .c_str());
        alreadyTraced = true;
    }
}
1367 #endif
1368
// Read the altitude from DBus
// - a reading below 0xFFFF is stored in 'altitude' (negative readings are
//   stored as 0, all others are rounded to the nearest meter)
// - any other reading is traced and 'altitude' is left unchanged
// - if the property can not be read, 'altitude' is set to 0xFFFF
void Manager::readAltitude()
{
    // Failures are only traced while this is set. It gets cleared when a
    // failure is traced and set again by the next valid reading.
    static bool traceAltitudeErr = true;

    try
    {
        const utils::PropertyValue altitudeProperty = utils::getProperty(
            ALTITUDE_PATH, ALTITUDE_INTERFACE, ALTITUDE_PROP);
        const auto sensorVal = std::get<double>(altitudeProperty);

        // Deliberately written as a negated '<' so that a NaN reading is
        // treated as invalid as well
        if (!(sensorVal < 0xFFFF))
        {
            if (traceAltitudeErr)
            {
                traceAltitudeErr = false;
                log<level::DEBUG>(
                    std::format("Invalid altitude value: {}", sensorVal)
                        .c_str());
            }
            return;
        }

        // Negative readings become 0, anything else is rounded to the
        // nearest meter
        altitude =
            static_cast<uint16_t>((sensorVal < 0) ? 0.0 : (sensorVal + 0.5));

        log<level::DEBUG>(
            std::format("readAltitude: sensor={} ({}m)", sensorVal, altitude)
                .c_str());
        traceAltitudeErr = true;
    }
    catch (const sdbusplus::exception_t& e)
    {
        if (traceAltitudeErr)
        {
            traceAltitudeErr = false;
            log<level::INFO>(
                std::format("Unable to read Altitude: {}", e.what()).c_str());
        }
        altitude = 0xFFFF; // not available
    }
}
1418
// Callback function when ambient temperature changes
// - reads a string and a property map from the message and looks up
//   AMBIENT_PROP in that map (other property changes are ignored)
// - converts the temperature to a whole number of degrees C, where 0xFF
//   means the ambient temperature is not available
// - when the converted value differs from the cached 'ambient', the cache is
//   updated, the altitude is re-read if it is still unknown (0xFFFF) and
//   (POWER10 only) the new data is sent to all active OCCs
void Manager::ambientCallback(sdbusplus::message_t& msg)
{
    std::string msgSensor;
    std::map<std::string, std::variant<double>> msgData;
    msg.read(msgSensor, msgData);

    auto valPropMap = msgData.find(AMBIENT_PROP);
    if (valPropMap == msgData.end())
    {
        log<level::DEBUG>("ambientCallback: Unknown ambient property changed");
        return;
    }

    const double currentTemp = std::get<double>(valPropMap->second);

    // Start out as "not available"; NaN readings keep this value
    uint8_t truncatedTemp = 0xFF;
    if (!std::isnan(currentTemp))
    {
        if (currentTemp < 0)
        {
            truncatedTemp = 0;
        }
        else if ((currentTemp + 0.5) < 0xFF)
        {
            // Round to nearest degree C
            truncatedTemp = static_cast<uint8_t>(currentTemp + 0.5);
        }
        // else: the rounded temperature does not fit below the 0xFF
        // sentinel. Converting such a value (or infinity) to uint8_t would
        // be undefined behavior, so it is left as 0xFF (not available).
    }

    // If ambient changes, notify OCCs
    if (truncatedTemp == ambient)
    {
        return;
    }

    log<level::DEBUG>(
        std::format("ambientCallback: Ambient change from {} to {}C", ambient,
                    currentTemp)
            .c_str());

    ambient = truncatedTemp;
    if (altitude == 0xFFFF)
    {
        // No altitude yet, try reading again
        readAltitude();
    }

    log<level::DEBUG>(
        std::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient,
                    altitude)
            .c_str());
#ifdef POWER10
    // Send ambient and altitude to all OCCs
    for (auto& obj : statusObjects)
    {
        if (obj->occActive())
        {
            obj->sendAmbient(ambient, altitude);
        }
    }
#endif // POWER10
}
1483
1484 // return the current ambient and altitude readings
getAmbientData(bool & ambientValid,uint8_t & ambientTemp,uint16_t & altitudeValue) const1485 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1486 uint16_t& altitudeValue) const
1487 {
1488 ambientValid = true;
1489 ambientTemp = ambient;
1490 altitudeValue = altitude;
1491
1492 if (ambient == 0xFF)
1493 {
1494 ambientValid = false;
1495 }
1496 }
1497
1498 #ifdef POWER10
// Called when waitForAllOccsTimer expires
// After the first OCC goes active, this timer will be started (60 seconds)
// - does nothing (besides a trace) while a reset is in progress
// - traces when the active OCC count differs from the number of OCC objects
// - if a reset is required, requests it and restarts the timer when the
//   timer is not enabled; otherwise validates the master OCC
void Manager::occsNotAllRunning()
{
    if (resetInProgress)
    {
        log<level::WARNING>(
            "occsNotAllRunning: Ignoring waitForAllOccsTimer because reset is in progress");
        return;
    }

    const auto expectedCount = statusObjects.size();
    if (activeCount != expectedCount)
    {
        // Not all OCCs went active
        // (procs may be garded, so this may be expected)
        log<level::WARNING>(
            std::format(
                "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})",
                activeCount, expectedCount)
                .c_str());
    }

    if (!resetRequired)
    {
        validateOccMaster();
        return;
    }

    initiateOccRequest(resetInstance);

    if (waitForAllOccsTimer->isEnabled())
    {
        // Timer is already running, nothing more to do
        return;
    }

    log<level::WARNING>("occsNotAllRunning: Restarting waitForAllOccTimer");
    // restart occ wait timer
    waitForAllOccsTimer->restartOnce(60s);
}
1537
1538 #ifdef PLDM
1539 // Called when throttlePldmTraceTimer expires.
1540 // If this timer expires, that indicates there are no OCC active sensor PDRs
1541 // found which will trigger pldm traces to be throttled.
1542 // The second time this timer expires, a PEL will get created.
throttlePldmTraceExpired()1543 void Manager::throttlePldmTraceExpired()
1544 {
1545 if (utils::isHostRunning())
1546 {
1547 if (!onPldmTimeoutCreatePel)
1548 {
1549 // Throttle traces
1550 pldmHandle->setTraceThrottle(true);
1551 // Restart timer to log a PEL when timer expires
1552 onPldmTimeoutCreatePel = true;
1553 throttlePldmTraceTimer->restartOnce(40min);
1554 }
1555 else
1556 {
1557 log<level::ERR>(
1558 "throttlePldmTraceExpired(): OCC active sensors still not available!");
1559 // Create PEL
1560 createPldmSensorPEL();
1561 }
1562 }
1563 else
1564 {
1565 // Make sure traces are not throttled
1566 pldmHandle->setTraceThrottle(false);
1567 log<level::INFO>(
1568 "throttlePldmTraceExpired(): host it not running ignoring sensor timer");
1569 }
1570 }
1571
// createPldmSensorPEL():
// Creates a PEL (via CreatePELWithFFDCFiles on the PEL logging interface) to
// report that the PLDM sensors for the OCCs could not be found.
// D-Bus failures while creating the PEL are traced and otherwise ignored.
void Manager::createPldmSensorPEL()
{
    static constexpr auto loggingObjectPath = "/xyz/openbmc_project/logging";
    static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL";

    Error::Descriptor d = Error::Descriptor(MISSING_OCC_SENSORS_PATH);

    std::map<std::string, std::string> additionalData{
        {"_PID", std::to_string(getpid())}};

    log<level::INFO>(
        "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs");

    auto& bus = utils::getBus();

    try
    {
        FFDCFiles ffdc;
        // Add occ-control journal traces to PEL FFDC
        // (the returned object is kept in a local until the end of this
        // scope - presumably it owns the journal file, TODO confirm)
        auto occJournalFile =
            FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40);

        std::string service =
            utils::getService(loggingObjectPath, opLoggingInterface);
        auto method =
            bus.new_method_call(service.c_str(), loggingObjectPath,
                                opLoggingInterface, "CreatePELWithFFDCFiles");

        // Set level to Warning (Predictive).
        using sdbusplus::xyz::openbmc_project::Logging::server::Entry;
        auto severity =
            sdbusplus::xyz::openbmc_project::Logging::server::convertForMessage(
                Entry::Level::Warning);

        method.append(d.path, severity, additionalData, ffdc);
        bus.call(method);
    }
    catch (const sdbusplus::exception_t& e)
    {
        log<level::ERR>(
            std::format("Failed to create MISSING_OCC_SENSORS PEL: {}",
                        e.what())
                .c_str());
    }
}
1619 #endif // PLDM
1620 #endif // POWER10
1621
// Verify single master OCC and start presence monitor
// - (POWER10) an OCC that is not active yet is given a chance to become
//   active, either from a queued active state or by checking its active
//   sensor; validation is abandoned if the host is not running
// - every master OCC gets a presence watch; any master found after the
//   first one is reported as an error against that OCC
// - if no master was found at all, the error is reported against the first
//   OCC object (when there is one)
void Manager::validateOccMaster()
{
    int masterInstance = -1;
    for (auto& obj : statusObjects)
    {
        auto instance = obj->getOccInstanceID();
#ifdef POWER10
        if (!obj->occActive())
        {
            if (!utils::isHostRunning())
            {
                log<level::WARNING>(
                    std::format(
                        "validateOccMaster: HOST is not running (OCC{})",
                        instance)
                        .c_str());
                return;
            }

            // Check if sensor was queued while waiting for discovery
            auto match = queuedActiveState.find(instance);
            if (match != queuedActiveState.end())
            {
                queuedActiveState.erase(match);
                log<level::INFO>(
                    std::format("validateOccMaster: OCC{} is ACTIVE (queued)",
                                instance)
                        .c_str());
                obj->occActive(true);
            }
            else
            {
                // OCC does not appear to be active yet, check active sensor
#ifdef PLDM
                pldmHandle->checkActiveSensor(instance);
#endif
                if (obj->occActive())
                {
                    log<level::INFO>(
                        std::format(
                            "validateOccMaster: OCC{} is ACTIVE after reading sensor",
                            instance)
                            .c_str());
                }
            }
        }
#endif // POWER10

        if (obj->isMasterOcc())
        {
            obj->addPresenceWatchMaster();

            if (masterInstance == -1)
            {
                masterInstance = instance;
            }
            else
            {
                log<level::ERR>(
                    std::format(
                        "validateOccMaster: Multiple OCC masters! ({} and {})",
                        masterInstance, instance)
                        .c_str());
                // request reset
                obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
            }
        }
    }

    if (masterInstance < 0)
    {
        log<level::ERR>(
            std::format("validateOccMaster: Master OCC not found! (of {} OCCs)",
                        statusObjects.size())
                .c_str());
        // request reset
        // front() must not be used on an empty container (undefined
        // behavior), so the request is only made when an OCC object exists
        if (!statusObjects.empty())
        {
            statusObjects.front()->deviceError(
                Error::Descriptor(PRESENCE_ERROR_PATH));
        }
    }
    else
    {
        log<level::INFO>(
            std::format("validateOccMaster: OCC{} is master of {} OCCs",
                        masterInstance, activeCount)
                .c_str());
#ifdef POWER10
        pmode->updateDbusSafeMode(false);
#endif
    }
}
1716
updatePcapBounds() const1717 void Manager::updatePcapBounds() const
1718 {
1719 if (pcap)
1720 {
1721 pcap->updatePcapBounds();
1722 }
1723 }
1724
1725 } // namespace occ
1726 } // namespace open_power
1727