1 #include "config.h"
2
3 #include "occ_manager.hpp"
4
5 #include "i2c_occ.hpp"
6 #include "occ_dbus.hpp"
7 #include "occ_errors.hpp"
8 #include "utils.hpp"
9
10 #include <phosphor-logging/elog-errors.hpp>
11 #include <phosphor-logging/lg2.hpp>
12 #include <xyz/openbmc_project/Common/error.hpp>
13
14 #include <chrono>
15 #include <cmath>
16 #include <filesystem>
17 #include <fstream>
18 #include <regex>
19
20 namespace open_power
21 {
22 namespace occ
23 {
24
// fru_type value that readTempSensors() treats as "not available": DIMM
// temperature sensors reporting it are skipped.
constexpr uint32_t fruTypeNotAvailable = 0xFF;
// File-name suffixes appended to a sensor's file-name prefix (the path of its
// "*_label" file with the trailing "label" removed) to locate its other files.
constexpr auto fruTypeSuffix = "fru_type";
constexpr auto faultSuffix = "fault";
constexpr auto inputSuffix = "input";
constexpr auto maxSuffix = "max";

// While this file exists, findAndCreateObjects() postpones OCC discovery and
// re-arms the discover timer.
const auto HOST_ON_FILE = "/run/openbmc/host@0-on";
32
33 using namespace phosphor::logging;
34 using namespace std::literals::chrono_literals;
35
/**
 * @brief Read one formatted value of type T from the start of a file.
 *
 * The value is extracted with operator>>, so for strings only the first
 * whitespace-delimited token is returned.
 *
 * @tparam T    Type of the value to extract
 * @param[in] path  File to read
 *
 * @return The extracted value
 *
 * @throws std::system_error (generic category) when the file cannot be opened
 *         or the value cannot be extracted. The error code is the errno left
 *         by the failing operation, or EIO when the stream failed without
 *         setting errno (e.g. unparsable contents), so it is never 0.
 */
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit |
                   std::ifstream::eofbit);
    T data;

    // Drop any stale errno left by earlier, unrelated calls so that the catch
    // block below only reports an error produced by this read.
    errno = 0;

    try
    {
        ifs.open(path);
        ifs >> data;
        ifs.close();
    }
    catch (const std::exception&)
    {
        // A stream failure that is not backed by a failing system call leaves
        // errno at 0; report that as EIO rather than a "success" code.
        const auto err = (errno != 0) ? errno : EIO;
        throw std::system_error(err, std::generic_category());
    }

    return data;
}
58
createPldmHandle()59 void Manager::createPldmHandle()
60 {
61 #ifdef PLDM
62 pldmHandle = std::make_unique<pldm::Interface>(
63 std::bind(std::mem_fn(&Manager::updateOCCActive), this,
64 std::placeholders::_1, std::placeholders::_2),
65 std::bind(std::mem_fn(&Manager::sbeHRESETResult), this,
66 std::placeholders::_1, std::placeholders::_2),
67 std::bind(std::mem_fn(&Manager::updateOccSafeMode), this,
68 std::placeholders::_1),
69 event);
70 #endif
71 }
72
73 // findAndCreateObjects():
74 // Takes care of getting the required objects created and
75 // finds the available devices/processors.
76 // (function is called everytime the discoverTimer expires)
77 // - create the PowerMode object to control OCC modes
78 // - create statusObjects for each OCC device found
79 // - waits for OCC Active sensors PDRs to become available
80 // - restart discoverTimer if all data is not available yet
findAndCreateObjects()81 void Manager::findAndCreateObjects()
82 {
83 #ifndef POWER10
84 for (auto id = 0; id < MAX_CPUS; ++id)
85 {
86 // Create one occ per cpu
87 auto occ = std::string(OCC_NAME) + std::to_string(id);
88 createObjects(occ);
89 }
90 #else
91 if (!pmode)
92 {
93 // Create the power mode object
94 pmode = std::make_unique<powermode::PowerMode>(
95 *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
96 }
97
98 if (!fs::exists(HOST_ON_FILE))
99 {
100 static bool statusObjCreated = false;
101 if (!statusObjCreated)
102 {
103 // Create the OCCs based on on the /dev/occX devices
104 auto occs = findOCCsInDev();
105
106 if (occs.empty() || (prevOCCSearch.size() != occs.size()))
107 {
108 // Something changed or no OCCs yet, try again in 10s.
109 // Note on the first pass prevOCCSearch will be empty,
110 // so there will be at least one delay to give things
111 // a chance to settle.
112 prevOCCSearch = occs;
113
114 lg2::info(
115 "Manager::findAndCreateObjects(): Waiting for OCCs (currently {QTY})",
116 "QTY", occs.size());
117
118 discoverTimer->restartOnce(10s);
119 }
120 else
121 {
122 // All OCCs appear to be available, create status objects
123
124 // createObjects requires OCC0 first.
125 std::sort(occs.begin(), occs.end());
126
127 lg2::info(
128 "Manager::findAndCreateObjects(): Creating {QTY} OCC Status Objects",
129 "QTY", occs.size());
130 for (auto id : occs)
131 {
132 createObjects(std::string(OCC_NAME) + std::to_string(id));
133 }
134 statusObjCreated = true;
135 waitingForAllOccActiveSensors = true;
136
137 // Find/update the processor path associated with each OCC
138 for (auto& obj : statusObjects)
139 {
140 obj->updateProcAssociation();
141 }
142 }
143 }
144
145 if (statusObjCreated && waitingForAllOccActiveSensors)
146 {
147 static bool tracedHostWait = false;
148 if (utils::isHostRunning())
149 {
150 if (tracedHostWait)
151 {
152 lg2::info(
153 "Manager::findAndCreateObjects(): Host is running");
154 tracedHostWait = false;
155 }
156 checkAllActiveSensors();
157 }
158 else
159 {
160 if (!tracedHostWait)
161 {
162 lg2::info(
163 "Manager::findAndCreateObjects(): Waiting for host to start");
164 tracedHostWait = true;
165 }
166 discoverTimer->restartOnce(30s);
167 #ifdef PLDM
168 if (throttlePldmTraceTimer->isEnabled())
169 {
170 // Host is no longer running, disable throttle timer and
171 // make sure traces are not throttled
172 lg2::info("findAndCreateObjects(): disabling sensor timer");
173 throttlePldmTraceTimer->setEnabled(false);
174 pldmHandle->setTraceThrottle(false);
175 }
176 #endif
177 }
178 }
179 }
180 else
181 {
182 lg2::info(
183 "Manager::findAndCreateObjects(): Waiting for {FILE} to complete...",
184 "FILE", HOST_ON_FILE);
185 discoverTimer->restartOnce(10s);
186 }
187 #endif
188 }
189
190 #ifdef POWER10
191 // Check if all occActive sensors are available
checkAllActiveSensors()192 void Manager::checkAllActiveSensors()
193 {
194 static bool allActiveSensorAvailable = false;
195 static bool tracedSensorWait = false;
196 static bool waitingForHost = false;
197
198 if (open_power::occ::utils::isHostRunning())
199 {
200 if (waitingForHost)
201 {
202 waitingForHost = false;
203 lg2::info("checkAllActiveSensors(): Host is now running");
204 }
205
206 // Start with the assumption that all are available
207 allActiveSensorAvailable = true;
208 for (auto& obj : statusObjects)
209 {
210 if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
211 {
212 auto instance = obj->getOccInstanceID();
213 // Check if sensor was queued while waiting for discovery
214 auto match = queuedActiveState.find(instance);
215 if (match != queuedActiveState.end())
216 {
217 queuedActiveState.erase(match);
218 lg2::info(
219 "checkAllActiveSensors(): OCC{INST} is ACTIVE (queued)",
220 "INST", instance);
221 obj->occActive(true);
222 }
223 else
224 {
225 allActiveSensorAvailable = false;
226 if (!tracedSensorWait)
227 {
228 lg2::info(
229 "checkAllActiveSensors(): Waiting on OCC{INST} Active sensor",
230 "INST", instance);
231 tracedSensorWait = true;
232 #ifdef PLDM
233 // Make sure PLDM traces are not throttled
234 pldmHandle->setTraceThrottle(false);
235 // Start timer to throttle PLDM traces when timer
236 // expires
237 onPldmTimeoutCreatePel = false;
238 throttlePldmTraceTimer->restartOnce(5min);
239 #endif
240 }
241 #ifdef PLDM
242 // Ignore active sensor check if the OCCs are being reset
243 if (!resetInProgress)
244 {
245 pldmHandle->checkActiveSensor(obj->getOccInstanceID());
246 }
247 #endif
248 break;
249 }
250 }
251 }
252 }
253 else
254 {
255 if (!waitingForHost)
256 {
257 waitingForHost = true;
258 lg2::info("checkAllActiveSensors(): Waiting for host to start");
259 #ifdef PLDM
260 if (throttlePldmTraceTimer->isEnabled())
261 {
262 // Host is no longer running, disable throttle timer and
263 // make sure traces are not throttled
264 lg2::info("checkAllActiveSensors(): disabling sensor timer");
265 throttlePldmTraceTimer->setEnabled(false);
266 pldmHandle->setTraceThrottle(false);
267 }
268 #endif
269 }
270 }
271
272 if (allActiveSensorAvailable)
273 {
274 // All sensors were found, disable the discovery timer
275 if (discoverTimer->isEnabled())
276 {
277 discoverTimer->setEnabled(false);
278 }
279 #ifdef PLDM
280 if (throttlePldmTraceTimer->isEnabled())
281 {
282 // Disable throttle timer and make sure traces are not throttled
283 throttlePldmTraceTimer->setEnabled(false);
284 pldmHandle->setTraceThrottle(false);
285 }
286 #endif
287 if (waitingForAllOccActiveSensors)
288 {
289 lg2::info(
290 "checkAllActiveSensors(): OCC Active sensors are available");
291 waitingForAllOccActiveSensors = false;
292
293 if (resetRequired)
294 {
295 initiateOccRequest(resetInstance);
296
297 if (!waitForAllOccsTimer->isEnabled())
298 {
299 lg2::warning(
300 "occsNotAllRunning: Restarting waitForAllOccTimer");
301 // restart occ wait timer to check status after reset
302 // completes
303 waitForAllOccsTimer->restartOnce(60s);
304 }
305 }
306 }
307 queuedActiveState.clear();
308 tracedSensorWait = false;
309 }
310 else
311 {
312 // Not all sensors were available, so keep waiting
313 if (!tracedSensorWait)
314 {
315 lg2::info(
316 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
317 tracedSensorWait = true;
318 }
319 discoverTimer->restartOnce(10s);
320 }
321 }
322 #endif
323
findOCCsInDev()324 std::vector<int> Manager::findOCCsInDev()
325 {
326 std::vector<int> occs;
327 std::regex expr{R"(occ(\d+)$)"};
328
329 for (auto& file : fs::directory_iterator("/dev"))
330 {
331 std::smatch match;
332 std::string path{file.path().string()};
333 if (std::regex_search(path, match, expr))
334 {
335 auto num = std::stoi(match[1].str());
336
337 // /dev numbering starts at 1, ours starts at 0.
338 occs.push_back(num - 1);
339 }
340 }
341
342 return occs;
343 }
344
cpuCreated(sdbusplus::message_t & msg)345 int Manager::cpuCreated(sdbusplus::message_t& msg)
346 {
347 namespace fs = std::filesystem;
348
349 sdbusplus::message::object_path o;
350 msg.read(o);
351 fs::path cpuPath(std::string(std::move(o)));
352
353 auto name = cpuPath.filename().string();
354 auto index = name.find(CPU_NAME);
355 name.replace(index, std::strlen(CPU_NAME), OCC_NAME);
356
357 createObjects(name);
358
359 return 0;
360 }
361
createObjects(const std::string & occ)362 void Manager::createObjects(const std::string& occ)
363 {
364 auto path = fs::path(OCC_CONTROL_ROOT) / occ;
365
366 statusObjects.emplace_back(std::make_unique<Status>(
367 event, path.c_str(), *this,
368 #ifdef POWER10
369 pmode,
370 #endif
371 std::bind(std::mem_fn(&Manager::statusCallBack), this,
372 std::placeholders::_1, std::placeholders::_2)
373 #ifdef PLDM
374 ,
375 // Callback will set flag indicating reset needs to be done
376 // instead of immediately issuing a reset via PLDM.
377 std::bind(std::mem_fn(&Manager::resetOccRequest), this,
378 std::placeholders::_1)
379 #endif
380 ));
381
382 // Create the power cap monitor object
383 if (!pcap)
384 {
385 pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
386 *statusObjects.back());
387 }
388
389 if (statusObjects.back()->isMasterOcc())
390 {
391 lg2::info("Manager::createObjects(): OCC{INST} is the master", "INST",
392 statusObjects.back()->getOccInstanceID());
393 _pollTimer->setEnabled(false);
394
395 #ifdef POWER10
396 // Set the master OCC on the PowerMode object
397 pmode->setMasterOcc(path);
398 #endif
399 }
400
401 passThroughObjects.emplace_back(std::make_unique<PassThrough>(
402 path.c_str()
403 #ifdef POWER10
404 ,
405 pmode
406 #endif
407 ));
408 }
409
410 // If a reset is not already outstanding, set a flag to indicate that a reset is
411 // needed.
resetOccRequest(instanceID instance)412 void Manager::resetOccRequest(instanceID instance)
413 {
414 if (!resetRequired)
415 {
416 resetRequired = true;
417 resetInstance = instance;
418 lg2::error(
419 "resetOccRequest: PM Complex reset was requested due to OCC{INST}",
420 "INST", instance);
421 }
422 else if (instance != resetInstance)
423 {
424 lg2::warning(
425 "resetOccRequest: Ignoring PM Complex reset request for OCC{INST}, because reset already outstanding for OCC{RINST}",
426 "INST", instance, "RINST", resetInstance);
427 }
428 }
429
430 // If a reset has not been started, initiate an OCC reset via PLDM
initiateOccRequest(instanceID instance)431 void Manager::initiateOccRequest(instanceID instance)
432 {
433 if (!resetInProgress)
434 {
435 resetInProgress = true;
436 resetInstance = instance;
437 lg2::error(
438 "initiateOccRequest: Initiating PM Complex reset due to OCC{INST}",
439 "INST", instance);
440 #ifdef PLDM
441 pldmHandle->resetOCC(instance);
442 #endif
443 resetRequired = false;
444 }
445 else
446 {
447 lg2::warning(
448 "initiateOccRequest: Ignoring PM Complex reset request for OCC{INST}, because reset already in process for OCC{RINST}",
449 "INST", instance, "RINST", resetInstance);
450 }
451 }
452
statusCallBack(instanceID instance,bool status)453 void Manager::statusCallBack(instanceID instance, bool status)
454 {
455 if (status == true)
456 {
457 if (resetInProgress)
458 {
459 lg2::info(
460 "statusCallBack: Ignoring OCC{INST} activate because a reset has been initiated due to OCC{RINST}",
461 "INST", instance, "RINST", resetInstance);
462 return;
463 }
464
465 // OCC went active
466 ++activeCount;
467
468 #ifdef POWER10
469 if (activeCount == 1)
470 {
471 // First OCC went active (allow some time for all OCCs to go active)
472 waitForAllOccsTimer->restartOnce(60s);
473 }
474 #endif
475
476 if (activeCount == statusObjects.size())
477 {
478 #ifdef POWER10
479 // All OCCs are now running
480 if (waitForAllOccsTimer->isEnabled())
481 {
482 // stop occ wait timer
483 waitForAllOccsTimer->setEnabled(false);
484 }
485
486 // All OCCs have been found, check if we need a reset
487 if (resetRequired)
488 {
489 initiateOccRequest(resetInstance);
490
491 if (!waitForAllOccsTimer->isEnabled())
492 {
493 lg2::warning(
494 "occsNotAllRunning: Restarting waitForAllOccTimer");
495 // restart occ wait timer
496 waitForAllOccsTimer->restartOnce(60s);
497 }
498 }
499 else
500 {
501 // Verify master OCC and start presence monitor
502 validateOccMaster();
503 }
504 #else
505 // Verify master OCC and start presence monitor
506 validateOccMaster();
507 #endif
508 }
509
510 // Start poll timer if not already started
511 if (!_pollTimer->isEnabled())
512 {
513 lg2::info("Manager: OCCs will be polled every {TIME} seconds",
514 "TIME", pollInterval);
515
516 // Send poll and start OCC poll timer
517 pollerTimerExpired();
518 }
519 }
520 else
521 {
522 // OCC went away
523 if (activeCount > 0)
524 {
525 --activeCount;
526 }
527 else
528 {
529 lg2::info("OCC{INST} disabled, and no other OCCs are active",
530 "INST", instance);
531 }
532
533 if (activeCount == 0)
534 {
535 // No OCCs are running
536
537 if (resetInProgress)
538 {
539 // All OCC active sensors are clear (reset should be in
540 // progress)
541 lg2::info(
542 "statusCallBack: Clearing resetInProgress (activeCount={COUNT}, OCC{INST}, status={STATUS})",
543 "COUNT", activeCount, "INST", instance, "STATUS", status);
544 resetInProgress = false;
545 resetInstance = 255;
546 }
547
548 // Stop OCC poll timer
549 if (_pollTimer->isEnabled())
550 {
551 lg2::info(
552 "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
553 _pollTimer->setEnabled(false);
554 }
555
556 #ifdef POWER10
557 // stop wait timer
558 if (waitForAllOccsTimer->isEnabled())
559 {
560 waitForAllOccsTimer->setEnabled(false);
561 }
562 #endif
563 }
564 else if (resetInProgress)
565 {
566 lg2::info(
567 "statusCallBack: Skipping clear of resetInProgress (activeCount={COUNT}, OCC{INST}, status={STATUS})",
568 "COUNT", activeCount, "INST", instance, "STATUS", status);
569 }
570 #ifdef READ_OCC_SENSORS
571 // Clear OCC sensors
572 setSensorValueToNaN(instance);
573 #endif
574 }
575
576 #ifdef POWER10
577 if (waitingForAllOccActiveSensors)
578 {
579 if (utils::isHostRunning())
580 {
581 checkAllActiveSensors();
582 }
583 }
584 #endif
585 }
586
587 #ifdef I2C_OCC
initStatusObjects()588 void Manager::initStatusObjects()
589 {
590 // Make sure we have a valid path string
591 static_assert(sizeof(DEV_PATH) != 0);
592
593 auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
594 for (auto& name : deviceNames)
595 {
596 i2c_occ::i2cToDbus(name);
597 name = std::string(OCC_NAME) + '_' + name;
598 auto path = fs::path(OCC_CONTROL_ROOT) / name;
599 statusObjects.emplace_back(
600 std::make_unique<Status>(event, path.c_str(), *this));
601 }
602 // The first device is master occ
603 pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
604 *statusObjects.front());
605 #ifdef POWER10
606 pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
607 powermode::PIPS_PATH);
608 // Set the master OCC on the PowerMode object
609 pmode->setMasterOcc(path);
610 #endif
611 }
612 #endif
613
614 #ifdef PLDM
sbeTimeout(unsigned int instance)615 void Manager::sbeTimeout(unsigned int instance)
616 {
617 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
618 [instance](const auto& obj) {
619 return instance == obj->getOccInstanceID();
620 });
621
622 if (obj != statusObjects.end() && (*obj)->occActive())
623 {
624 lg2::info("SBE timeout, requesting HRESET (OCC{INST})", "INST",
625 instance);
626
627 #ifdef PHAL_SUPPORT
628 setSBEState(instance, SBE_STATE_NOT_USABLE);
629 #endif
630
631 // Stop communication with this OCC
632 (*obj)->occActive(false);
633
634 pldmHandle->sendHRESET(instance);
635 }
636 }
637
updateOCCActive(instanceID instance,bool status)638 bool Manager::updateOCCActive(instanceID instance, bool status)
639 {
640 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
641 [instance](const auto& obj) {
642 return instance == obj->getOccInstanceID();
643 });
644
645 const bool hostRunning = open_power::occ::utils::isHostRunning();
646 if (obj != statusObjects.end())
647 {
648 if (!hostRunning && (status == true))
649 {
650 lg2::warning(
651 "updateOCCActive: Host is not running yet (OCC{INST} active={STAT}), clearing sensor received",
652 "INST", instance, "STAT", status);
653 (*obj)->setPldmSensorReceived(false);
654 if (!waitingForAllOccActiveSensors)
655 {
656 lg2::info(
657 "updateOCCActive: Waiting for Host and all OCC Active Sensors");
658 waitingForAllOccActiveSensors = true;
659 }
660 #ifdef POWER10
661 discoverTimer->restartOnce(30s);
662 #endif
663 return false;
664 }
665 else
666 {
667 (*obj)->setPldmSensorReceived(true);
668 return (*obj)->occActive(status);
669 }
670 }
671 else
672 {
673 if (hostRunning)
674 {
675 lg2::warning(
676 "updateOCCActive: No status object to update for OCC{INST} (active={STAT})",
677 "INST", instance, "STAT", status);
678 }
679 else
680 {
681 if (status == true)
682 {
683 lg2::warning(
684 "updateOCCActive: No status objects and Host is not running yet (OCC{INST} active={STAT})",
685 "INST", instance, "STAT", status);
686 }
687 }
688 if (status == true)
689 {
690 // OCC went active
691 queuedActiveState.insert(instance);
692 }
693 else
694 {
695 auto match = queuedActiveState.find(instance);
696 if (match != queuedActiveState.end())
697 {
698 // OCC was disabled
699 queuedActiveState.erase(match);
700 }
701 }
702 return false;
703 }
704 }
705
706 // Called upon pldm event To set powermode Safe Mode State for system.
updateOccSafeMode(bool safeMode)707 void Manager::updateOccSafeMode(bool safeMode)
708 {
709 #ifdef POWER10
710 pmode->updateDbusSafeMode(safeMode);
711 #endif
712 // Update the processor throttle status on dbus
713 for (auto& obj : statusObjects)
714 {
715 obj->updateThrottle(safeMode, THROTTLED_SAFE);
716 }
717 }
718
sbeHRESETResult(instanceID instance,bool success)719 void Manager::sbeHRESETResult(instanceID instance, bool success)
720 {
721 if (success)
722 {
723 lg2::info("HRESET succeeded (OCC{INST})", "INST", instance);
724
725 #ifdef PHAL_SUPPORT
726 setSBEState(instance, SBE_STATE_BOOTED);
727 #endif
728
729 // Re-enable communication with this OCC
730 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
731 [instance](const auto& obj) {
732 return instance == obj->getOccInstanceID();
733 });
734 if (obj != statusObjects.end() && (!(*obj)->occActive()))
735 {
736 (*obj)->occActive(true);
737 }
738
739 return;
740 }
741
742 #ifdef PHAL_SUPPORT
743 setSBEState(instance, SBE_STATE_FAILED);
744
745 if (sbeCanDump(instance))
746 {
747 lg2::info("HRESET failed (OCC{INST}), triggering SBE dump", "INST",
748 instance);
749
750 auto& bus = utils::getBus();
751 uint32_t src6 = instance << 16;
752 uint32_t logId =
753 FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
754 src6, "SBE command timeout");
755
756 try
757 {
758 constexpr auto interface = "xyz.openbmc_project.Dump.Create";
759 constexpr auto function = "CreateDump";
760
761 std::string service =
762 utils::getService(OP_DUMP_OBJ_PATH, interface);
763 auto method = bus.new_method_call(service.c_str(), OP_DUMP_OBJ_PATH,
764 interface, function);
765
766 std::map<std::string, std::variant<std::string, uint64_t>>
767 createParams{
768 {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
769 uint64_t(logId)},
770 {"com.ibm.Dump.Create.CreateParameters.DumpType",
771 "com.ibm.Dump.Create.DumpType.SBE"},
772 {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
773 uint64_t(instance)},
774 };
775
776 method.append(createParams);
777
778 auto response = bus.call(method);
779 }
780 catch (const sdbusplus::exception_t& e)
781 {
782 constexpr auto ERROR_DUMP_DISABLED =
783 "xyz.openbmc_project.Dump.Create.Error.Disabled";
784 if (e.name() == ERROR_DUMP_DISABLED)
785 {
786 lg2::info("Dump is disabled, skipping");
787 }
788 else
789 {
790 lg2::error("Dump failed");
791 }
792 }
793 }
794 #endif
795
796 // SBE Reset failed, try PM Complex reset
797 lg2::error("sbeHRESETResult: Forcing PM Complex reset");
798 resetOccRequest(instance);
799 }
800
801 #ifdef PHAL_SUPPORT
sbeCanDump(unsigned int instance)802 bool Manager::sbeCanDump(unsigned int instance)
803 {
804 struct pdbg_target* proc = getPdbgTarget(instance);
805
806 if (!proc)
807 {
808 // allow the dump in the error case
809 return true;
810 }
811
812 try
813 {
814 if (!openpower::phal::sbe::isDumpAllowed(proc))
815 {
816 return false;
817 }
818
819 if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
820 {
821 return false;
822 }
823 }
824 catch (openpower::phal::exception::SbeError& e)
825 {
826 lg2::info("Failed to query SBE state");
827 }
828
829 // allow the dump in the error case
830 return true;
831 }
832
setSBEState(unsigned int instance,enum sbe_state state)833 void Manager::setSBEState(unsigned int instance, enum sbe_state state)
834 {
835 struct pdbg_target* proc = getPdbgTarget(instance);
836
837 if (!proc)
838 {
839 return;
840 }
841
842 try
843 {
844 openpower::phal::sbe::setState(proc, state);
845 }
846 catch (const openpower::phal::exception::SbeError& e)
847 {
848 lg2::error("Failed to set SBE state: {ERROR}", "ERROR", e.what());
849 }
850 }
851
getPdbgTarget(unsigned int instance)852 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
853 {
854 if (!pdbgInitialized)
855 {
856 try
857 {
858 openpower::phal::pdbg::init();
859 pdbgInitialized = true;
860 }
861 catch (const openpower::phal::exception::PdbgError& e)
862 {
863 lg2::error("pdbg initialization failed");
864 return nullptr;
865 }
866 }
867
868 struct pdbg_target* proc = nullptr;
869 pdbg_for_each_class_target("proc", proc)
870 {
871 if (pdbg_target_index(proc) == instance)
872 {
873 return proc;
874 }
875 }
876
877 lg2::error("Failed to get pdbg target");
878 return nullptr;
879 }
880 #endif
881 #endif
882
pollerTimerExpired()883 void Manager::pollerTimerExpired()
884 {
885 if (!_pollTimer)
886 {
887 lg2::error("pollerTimerExpired() ERROR: Timer not defined");
888 return;
889 }
890
891 #ifdef POWER10
892 if (resetRequired)
893 {
894 lg2::error("pollerTimerExpired() - Initiating PM Complex reset");
895 initiateOccRequest(resetInstance);
896
897 if (!waitForAllOccsTimer->isEnabled())
898 {
899 lg2::warning("pollerTimerExpired: Restarting waitForAllOccTimer");
900 // restart occ wait timer
901 waitForAllOccsTimer->restartOnce(60s);
902 }
903 return;
904 }
905 #endif
906
907 for (auto& obj : statusObjects)
908 {
909 if (!obj->occActive())
910 {
911 // OCC is not running yet
912 #ifdef READ_OCC_SENSORS
913 auto id = obj->getOccInstanceID();
914 setSensorValueToNaN(id);
915 #endif
916 continue;
917 }
918
919 // Read sysfs to force kernel to poll OCC
920 obj->readOccState();
921
922 #ifdef READ_OCC_SENSORS
923 // Read occ sensor values
924 getSensorValues(obj);
925 #endif
926 }
927
928 if (activeCount > 0)
929 {
930 // Restart OCC poll timer
931 _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
932 }
933 else
934 {
935 // No OCCs running, so poll timer will not be restarted
936 lg2::info(
937 "Manager::pollerTimerExpired: poll timer will not be restarted");
938 }
939 }
940
941 #ifdef READ_OCC_SENSORS
readTempSensors(const fs::path & path,uint32_t occInstance)942 void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
943 {
944 // There may be more than one sensor with the same FRU type
945 // and label so make two passes: the first to read the temps
946 // from sysfs, and the second to put them on D-Bus after
947 // resolving any conflicts.
948 std::map<std::string, double> sensorData;
949
950 std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
951 for (auto& file : fs::directory_iterator(path))
952 {
953 if (!std::regex_search(file.path().string(), expr))
954 {
955 continue;
956 }
957
958 uint32_t labelValue{0};
959
960 try
961 {
962 labelValue = readFile<uint32_t>(file.path());
963 }
964 catch (const std::system_error& e)
965 {
966 lg2::debug(
967 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
968 "PATH", file.path().string(), "ERROR", e.code().value());
969 continue;
970 }
971
972 const std::string& tempLabel = "label";
973 const std::string filePathString = file.path().string().substr(
974 0, file.path().string().length() - tempLabel.length());
975
976 uint32_t fruTypeValue{0};
977 try
978 {
979 fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
980 }
981 catch (const std::system_error& e)
982 {
983 lg2::debug(
984 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
985 "PATH", filePathString + fruTypeSuffix, "ERROR",
986 e.code().value());
987 continue;
988 }
989
990 std::string sensorPath =
991 OCC_SENSORS_ROOT + std::string("/temperature/");
992
993 std::string dvfsTempPath;
994
995 if (fruTypeValue == VRMVdd)
996 {
997 sensorPath.append(
998 "vrm_vdd" + std::to_string(occInstance) + "_temp");
999 }
1000 else if (fruTypeValue == processorIoRing)
1001 {
1002 sensorPath.append(
1003 "proc" + std::to_string(occInstance) + "_ioring_temp");
1004 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
1005 std::to_string(occInstance) + "_ioring_dvfs_temp";
1006 }
1007 else
1008 {
1009 uint16_t type = (labelValue & 0xFF000000) >> 24;
1010 uint16_t instanceID = labelValue & 0x0000FFFF;
1011
1012 if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
1013 {
1014 if (fruTypeValue == fruTypeNotAvailable)
1015 {
1016 // Not all DIMM related temps are available to read
1017 // (no _input file in this case)
1018 continue;
1019 }
1020 auto iter = dimmTempSensorName.find(fruTypeValue);
1021 if (iter == dimmTempSensorName.end())
1022 {
1023 lg2::error(
1024 "readTempSensors: Fru type error! fruTypeValue = {FRU}) ",
1025 "FRU", fruTypeValue);
1026 continue;
1027 }
1028
1029 sensorPath.append(
1030 "dimm" + std::to_string(instanceID) + iter->second);
1031
1032 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
1033 dimmDVFSSensorName.at(fruTypeValue);
1034 }
1035 else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
1036 {
1037 if (fruTypeValue == processorCore)
1038 {
1039 // The OCC reports small core temps, of which there are
1040 // two per big core. All current P10 systems are in big
1041 // core mode, so use a big core name.
1042 uint16_t coreNum = instanceID / 2;
1043 uint16_t tempNum = instanceID % 2;
1044 sensorPath.append("proc" + std::to_string(occInstance) +
1045 "_core" + std::to_string(coreNum) + "_" +
1046 std::to_string(tempNum) + "_temp");
1047
1048 dvfsTempPath =
1049 std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
1050 std::to_string(occInstance) + "_core_dvfs_temp";
1051 }
1052 else
1053 {
1054 continue;
1055 }
1056 }
1057 else
1058 {
1059 continue;
1060 }
1061 }
1062
1063 // The dvfs temp file only needs to be read once per chip per type.
1064 if (!dvfsTempPath.empty() &&
1065 !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
1066 {
1067 try
1068 {
1069 auto dvfsValue = readFile<double>(filePathString + maxSuffix);
1070
1071 dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
1072 dvfsTempPath, dvfsValue * std::pow(10, -3));
1073 }
1074 catch (const std::system_error& e)
1075 {
1076 lg2::debug(
1077 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1078 "PATH", filePathString + maxSuffix, "ERROR",
1079 e.code().value());
1080 }
1081 }
1082
1083 uint32_t faultValue{0};
1084 try
1085 {
1086 faultValue = readFile<uint32_t>(filePathString + faultSuffix);
1087 }
1088 catch (const std::system_error& e)
1089 {
1090 lg2::debug(
1091 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1092 "PATH", filePathString + faultSuffix, "ERROR",
1093 e.code().value());
1094 continue;
1095 }
1096
1097 double tempValue{0};
1098 // NOTE: if OCC sends back 0xFF, kernal sets this fault value to 1.
1099 if (faultValue != 0)
1100 {
1101 tempValue = std::numeric_limits<double>::quiet_NaN();
1102 }
1103 else
1104 {
1105 // Read the temperature
1106 try
1107 {
1108 tempValue = readFile<double>(filePathString + inputSuffix);
1109 }
1110 catch (const std::system_error& e)
1111 {
1112 lg2::debug(
1113 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1114 "PATH", filePathString + inputSuffix, "ERROR",
1115 e.code().value());
1116
1117 // if errno == EAGAIN(Resource temporarily unavailable) then set
1118 // temp to 0, to avoid using old temp, and affecting FAN
1119 // Control.
1120 if (e.code().value() == EAGAIN)
1121 {
1122 tempValue = 0;
1123 }
1124 // else the errno would be something like
1125 // EBADF(Bad file descriptor)
1126 // or ENOENT(No such file or directory)
1127 else
1128 {
1129 continue;
1130 }
1131 }
1132 }
1133
1134 // If this object path already has a value, only overwite
1135 // it if the previous one was an NaN or a smaller value.
1136 auto existing = sensorData.find(sensorPath);
1137 if (existing != sensorData.end())
1138 {
1139 // Multiple sensors found for this FRU type
1140 if ((std::isnan(existing->second) && (tempValue == 0)) ||
1141 ((existing->second == 0) && std::isnan(tempValue)))
1142 {
1143 // One of the redundant sensors has failed (0xFF/nan), and the
1144 // other sensor has no reading (0), so set the FRU to NaN to
1145 // force fan increase
1146 tempValue = std::numeric_limits<double>::quiet_NaN();
1147 existing->second = tempValue;
1148 }
1149 if (std::isnan(existing->second) || (tempValue > existing->second))
1150 {
1151 existing->second = tempValue;
1152 }
1153 }
1154 else
1155 {
1156 // First sensor for this FRU type
1157 sensorData[sensorPath] = tempValue;
1158 }
1159 }
1160
1161 // Now publish the values on D-Bus.
1162 for (const auto& [objectPath, value] : sensorData)
1163 {
1164 dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
1165 value * std::pow(10, -3));
1166
1167 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1168 objectPath, !std::isnan(value));
1169
1170 if (existingSensors.find(objectPath) == existingSensors.end())
1171 {
1172 dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1173 objectPath, {"all_sensors"});
1174 }
1175 existingSensors[objectPath] = occInstance;
1176 }
1177 }
1178
getPowerLabelFunctionID(const std::string & value)1179 std::optional<std::string> Manager::getPowerLabelFunctionID(
1180 const std::string& value)
1181 {
1182 // If the value is "system", then the FunctionID is "system".
1183 if (value == "system")
1184 {
1185 return value;
1186 }
1187
1188 // If the value is not "system", then the label value have 3 numbers, of
1189 // which we only care about the middle one:
1190 // <sensor id>_<function id>_<apss channel>
1191 // eg: The value is "0_10_5" , then the FunctionID is "10".
1192 if (value.find("_") == std::string::npos)
1193 {
1194 return std::nullopt;
1195 }
1196
1197 auto powerLabelValue = value.substr((value.find("_") + 1));
1198
1199 if (powerLabelValue.find("_") == std::string::npos)
1200 {
1201 return std::nullopt;
1202 }
1203
1204 return powerLabelValue.substr(0, powerLabelValue.find("_"));
1205 }
1206
readPowerSensors(const fs::path & path,uint32_t id)1207 void Manager::readPowerSensors(const fs::path& path, uint32_t id)
1208 {
1209 std::regex expr{"power\\d+_label$"}; // Example: power5_label
1210 for (auto& file : fs::directory_iterator(path))
1211 {
1212 if (!std::regex_search(file.path().string(), expr))
1213 {
1214 continue;
1215 }
1216
1217 std::string labelValue;
1218 try
1219 {
1220 labelValue = readFile<std::string>(file.path());
1221 }
1222 catch (const std::system_error& e)
1223 {
1224 lg2::debug(
1225 "readPowerSensors: Failed reading {PATH}, errno = {ERROR}",
1226 "PATH", file.path().string(), "ERROR", e.code().value());
1227 continue;
1228 }
1229
1230 auto functionID = getPowerLabelFunctionID(labelValue);
1231 if (functionID == std::nullopt)
1232 {
1233 continue;
1234 }
1235
1236 const std::string& tempLabel = "label";
1237 const std::string filePathString = file.path().string().substr(
1238 0, file.path().string().length() - tempLabel.length());
1239
1240 std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
1241
1242 auto iter = powerSensorName.find(*functionID);
1243 if (iter == powerSensorName.end())
1244 {
1245 continue;
1246 }
1247 sensorPath.append(iter->second);
1248
1249 double tempValue{0};
1250
1251 try
1252 {
1253 tempValue = readFile<double>(filePathString + inputSuffix);
1254 }
1255 catch (const std::system_error& e)
1256 {
1257 lg2::debug(
1258 "readPowerSensors: Failed reading {PATH}, errno = {ERROR}",
1259 "PATH", filePathString + inputSuffix, "ERROR",
1260 e.code().value());
1261 continue;
1262 }
1263
1264 dbus::OccDBusSensors::getOccDBus().setUnit(
1265 sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");
1266
1267 dbus::OccDBusSensors::getOccDBus().setValue(
1268 sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3));
1269
1270 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1271 sensorPath, true);
1272
1273 if (existingSensors.find(sensorPath) == existingSensors.end())
1274 {
1275 std::vector<std::string> fTypeList = {"all_sensors"};
1276 if (iter->second == "total_power")
1277 {
1278 // Set sensor purpose as TotalPower
1279 dbus::OccDBusSensors::getOccDBus().setPurpose(
1280 sensorPath,
1281 "xyz.openbmc_project.Sensor.Purpose.SensorPurpose.TotalPower");
1282 }
1283 dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1284 sensorPath, fTypeList);
1285 }
1286 existingSensors[sensorPath] = id;
1287 }
1288 return;
1289 }
1290
readExtnSensors(const fs::path & path,uint32_t id)1291 void Manager::readExtnSensors(const fs::path& path, uint32_t id)
1292 {
1293 std::regex expr{"extn\\d+_label$"}; // Example: extn5_label
1294 for (auto& file : fs::directory_iterator(path))
1295 {
1296 if (!std::regex_search(file.path().string(), expr))
1297 {
1298 continue;
1299 }
1300
1301 // Read in Label value of the sensor from file.
1302 std::string labelValue;
1303 try
1304 {
1305 labelValue = readFile<std::string>(file.path());
1306 }
1307 catch (const std::system_error& e)
1308 {
1309 lg2::debug(
1310 "readExtnSensors:label Failed reading {PATH}, errno = {ERROR}",
1311 "PATH", file.path().string(), "ERROR", e.code().value());
1312 continue;
1313 }
1314 const std::string& tempLabel = "label";
1315 const std::string filePathString = file.path().string().substr(
1316 0, file.path().string().length() - tempLabel.length());
1317
1318 std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
1319
1320 // Labels of EXTN sections from OCC interface Document
1321 // have different formats.
1322 // 0x464d494e : FMIN 0x46444953 : FDIS
1323 // 0x46424153 : FBAS 0x46555400 : FUT
1324 // 0x464d4158 : FMAX 0x434c4950 : CLIP
1325 // 0x4d4f4445 : MODE 0x574f4643 : WOFC
1326 // 0x574f4649 : WOFI 0x5057524d : PWRM
1327 // 0x50575250 : PWRP 0x45525248 : ERRH
1328 // Label indicating byte 5 and 6 is the current (mem,proc) power in
1329 // Watts.
1330 if ((labelValue == EXTN_LABEL_PWRM_MEMORY_POWER) ||
1331 (labelValue == EXTN_LABEL_PWRP_PROCESSOR_POWER))
1332 {
1333 // Build the dbus String for this chiplet power asset.
1334 if (labelValue == EXTN_LABEL_PWRP_PROCESSOR_POWER)
1335 {
1336 labelValue = "_power";
1337 }
1338 else // else EXTN_LABEL_PWRM_MEMORY_POWER
1339 {
1340 labelValue = "_mem_power";
1341 }
1342 sensorPath.append("chiplet" + std::to_string(id) + labelValue);
1343
1344 // Read in data value of the sensor from file.
1345 // Read in as string due to different format of data in sensors.
1346 std::string extnValue;
1347 try
1348 {
1349 extnValue = readFile<std::string>(filePathString + inputSuffix);
1350 }
1351 catch (const std::system_error& e)
1352 {
1353 lg2::debug(
1354 "readExtnSensors:value Failed reading {PATH}, errno = {ERROR}",
1355 "PATH", filePathString + inputSuffix, "ERROR",
1356 e.code().value());
1357 continue;
1358 }
1359
1360 // For Power field, Convert last 4 bytes of hex string into number
1361 // value.
1362 std::stringstream ssData;
1363 ssData << std::hex << extnValue.substr(extnValue.length() - 4);
1364 uint16_t MyHexNumber;
1365 ssData >> MyHexNumber;
1366
1367 // Convert output/DC power to input/AC power in Watts (round up)
1368 MyHexNumber =
1369 std::round(((MyHexNumber / (PS_DERATING_FACTOR / 100.0))));
1370
1371 dbus::OccDBusSensors::getOccDBus().setUnit(
1372 sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");
1373
1374 dbus::OccDBusSensors::getOccDBus().setValue(sensorPath,
1375 MyHexNumber);
1376
1377 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1378 sensorPath, true);
1379
1380 if (existingSensors.find(sensorPath) == existingSensors.end())
1381 {
1382 dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1383 sensorPath, {"all_sensors"});
1384 }
1385
1386 existingSensors[sensorPath] = id;
1387 } // End Extended Power Sensors.
1388 } // End For loop on files for Extended Sensors.
1389 return;
1390 }
1391
setSensorValueToNaN(uint32_t id) const1392 void Manager::setSensorValueToNaN(uint32_t id) const
1393 {
1394 for (const auto& [sensorPath, occId] : existingSensors)
1395 {
1396 if (occId == id)
1397 {
1398 dbus::OccDBusSensors::getOccDBus().setValue(
1399 sensorPath, std::numeric_limits<double>::quiet_NaN());
1400
1401 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1402 sensorPath, true);
1403 }
1404 }
1405 return;
1406 }
1407
setSensorValueToNonFunctional(uint32_t id) const1408 void Manager::setSensorValueToNonFunctional(uint32_t id) const
1409 {
1410 for (const auto& [sensorPath, occId] : existingSensors)
1411 {
1412 if (occId == id)
1413 {
1414 dbus::OccDBusSensors::getOccDBus().setValue(
1415 sensorPath, std::numeric_limits<double>::quiet_NaN());
1416
1417 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1418 sensorPath, false);
1419 }
1420 }
1421 return;
1422 }
1423
getSensorValues(std::unique_ptr<Status> & occ)1424 void Manager::getSensorValues(std::unique_ptr<Status>& occ)
1425 {
1426 static bool tracedError[8] = {0};
1427 const fs::path sensorPath = occ->getHwmonPath();
1428 const uint32_t id = occ->getOccInstanceID();
1429
1430 if (fs::exists(sensorPath))
1431 {
1432 // Read temperature sensors
1433 readTempSensors(sensorPath, id);
1434 // Read Extended sensors
1435 readExtnSensors(sensorPath, id);
1436
1437 if (occ->isMasterOcc())
1438 {
1439 // Read power sensors
1440 readPowerSensors(sensorPath, id);
1441 }
1442 tracedError[id] = false;
1443 }
1444 else
1445 {
1446 if (!tracedError[id])
1447 {
1448 lg2::error(
1449 "Manager::getSensorValues: OCC{INST} sensor path missing: {PATH}",
1450 "INST", id, "PATH", sensorPath);
1451 tracedError[id] = true;
1452 }
1453 }
1454
1455 return;
1456 }
1457 #endif
1458
1459 // Read the altitude from DBus
readAltitude()1460 void Manager::readAltitude()
1461 {
1462 static bool traceAltitudeErr = true;
1463
1464 utils::PropertyValue altitudeProperty{};
1465 try
1466 {
1467 altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE,
1468 ALTITUDE_PROP);
1469 auto sensorVal = std::get<double>(altitudeProperty);
1470 if (sensorVal < 0xFFFF)
1471 {
1472 if (sensorVal < 0)
1473 {
1474 altitude = 0;
1475 }
1476 else
1477 {
1478 // Round to nearest meter
1479 altitude = uint16_t(sensorVal + 0.5);
1480 }
1481 lg2::debug("readAltitude: sensor={VALUE} ({ALT}m)", "VALUE",
1482 sensorVal, "ALT", altitude);
1483 traceAltitudeErr = true;
1484 }
1485 else
1486 {
1487 if (traceAltitudeErr)
1488 {
1489 traceAltitudeErr = false;
1490 lg2::debug("Invalid altitude value: {ALT}", "ALT", sensorVal);
1491 }
1492 }
1493 }
1494 catch (const sdbusplus::exception_t& e)
1495 {
1496 if (traceAltitudeErr)
1497 {
1498 traceAltitudeErr = false;
1499 lg2::info("Unable to read Altitude: {ERROR}", "ERROR", e.what());
1500 }
1501 altitude = 0xFFFF; // not available
1502 }
1503 }
1504
1505 // Callback function when ambient temperature changes
ambientCallback(sdbusplus::message_t & msg)1506 void Manager::ambientCallback(sdbusplus::message_t& msg)
1507 {
1508 double currentTemp = 0;
1509 uint8_t truncatedTemp = 0xFF;
1510 std::string msgSensor;
1511 std::map<std::string, std::variant<double>> msgData;
1512 msg.read(msgSensor, msgData);
1513
1514 auto valPropMap = msgData.find(AMBIENT_PROP);
1515 if (valPropMap == msgData.end())
1516 {
1517 lg2::debug("ambientCallback: Unknown ambient property changed");
1518 return;
1519 }
1520 currentTemp = std::get<double>(valPropMap->second);
1521 if (std::isnan(currentTemp))
1522 {
1523 truncatedTemp = 0xFF;
1524 }
1525 else
1526 {
1527 if (currentTemp < 0)
1528 {
1529 truncatedTemp = 0;
1530 }
1531 else
1532 {
1533 // Round to nearest degree C
1534 truncatedTemp = uint8_t(currentTemp + 0.5);
1535 }
1536 }
1537
1538 // If ambient changes, notify OCCs
1539 if (truncatedTemp != ambient)
1540 {
1541 lg2::debug("ambientCallback: Ambient change from {OLD} to {NEW}C",
1542 "OLD", ambient, "NEW", currentTemp);
1543
1544 ambient = truncatedTemp;
1545 if (altitude == 0xFFFF)
1546 {
1547 // No altitude yet, try reading again
1548 readAltitude();
1549 }
1550
1551 lg2::debug("ambientCallback: Ambient: {TEMP}C, altitude: {ALT}m",
1552 "TEMP", ambient, "ALT", altitude);
1553 #ifdef POWER10
1554 // Send ambient and altitude to all OCCs
1555 for (auto& obj : statusObjects)
1556 {
1557 if (obj->occActive())
1558 {
1559 obj->sendAmbient(ambient, altitude);
1560 }
1561 }
1562 #endif // POWER10
1563 }
1564 }
1565
1566 // return the current ambient and altitude readings
getAmbientData(bool & ambientValid,uint8_t & ambientTemp,uint16_t & altitudeValue) const1567 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1568 uint16_t& altitudeValue) const
1569 {
1570 ambientValid = true;
1571 ambientTemp = ambient;
1572 altitudeValue = altitude;
1573
1574 if (ambient == 0xFF)
1575 {
1576 ambientValid = false;
1577 }
1578 }
1579
1580 #ifdef POWER10
1581 // Called when waitForAllOccsTimer expires
1582 // After the first OCC goes active, this timer will be started (60 seconds)
occsNotAllRunning()1583 void Manager::occsNotAllRunning()
1584 {
1585 if (resetInProgress)
1586 {
1587 lg2::warning(
1588 "occsNotAllRunning: Ignoring waitForAllOccsTimer because reset is in progress");
1589 return;
1590 }
1591 if (activeCount != statusObjects.size())
1592 {
1593 // Not all OCCs went active
1594 lg2::warning(
1595 "occsNotAllRunning: Active OCC count ({COUNT}) does not match expected count ({EXP})",
1596 "COUNT", activeCount, "EXP", statusObjects.size());
1597 // Procs may be garded, so may be expected
1598 }
1599
1600 if (resetRequired)
1601 {
1602 initiateOccRequest(resetInstance);
1603
1604 if (!waitForAllOccsTimer->isEnabled())
1605 {
1606 lg2::warning("occsNotAllRunning: Restarting waitForAllOccTimer");
1607 // restart occ wait timer
1608 waitForAllOccsTimer->restartOnce(60s);
1609 }
1610 }
1611 else
1612 {
1613 validateOccMaster();
1614 }
1615 }
1616
1617 #ifdef PLDM
1618 // Called when throttlePldmTraceTimer expires.
1619 // If this timer expires, that indicates there are no OCC active sensor PDRs
1620 // found which will trigger pldm traces to be throttled.
1621 // The second time this timer expires, a PEL will get created.
throttlePldmTraceExpired()1622 void Manager::throttlePldmTraceExpired()
1623 {
1624 if (utils::isHostRunning())
1625 {
1626 if (!onPldmTimeoutCreatePel)
1627 {
1628 // Throttle traces
1629 pldmHandle->setTraceThrottle(true);
1630 // Restart timer to log a PEL when timer expires
1631 onPldmTimeoutCreatePel = true;
1632 throttlePldmTraceTimer->restartOnce(40min);
1633 }
1634 else
1635 {
1636 lg2::error(
1637 "throttlePldmTraceExpired(): OCC active sensors still not available!");
1638 // Create PEL
1639 createPldmSensorPEL();
1640 }
1641 }
1642 else
1643 {
1644 // Make sure traces are not throttled
1645 pldmHandle->setTraceThrottle(false);
1646 lg2::info(
1647 "throttlePldmTraceExpired(): host it not running ignoring sensor timer");
1648 }
1649 }
1650
createPldmSensorPEL()1651 void Manager::createPldmSensorPEL()
1652 {
1653 Error::Descriptor d = Error::Descriptor(MISSING_OCC_SENSORS_PATH);
1654 std::map<std::string, std::string> additionalData;
1655
1656 additionalData.emplace("_PID", std::to_string(getpid()));
1657
1658 lg2::info(
1659 "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs");
1660
1661 auto& bus = utils::getBus();
1662
1663 try
1664 {
1665 FFDCFiles ffdc;
1666 // Add occ-control journal traces to PEL FFDC
1667 auto occJournalFile =
1668 FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40);
1669
1670 static constexpr auto loggingObjectPath =
1671 "/xyz/openbmc_project/logging";
1672 static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL";
1673 std::string service =
1674 utils::getService(loggingObjectPath, opLoggingInterface);
1675 auto method =
1676 bus.new_method_call(service.c_str(), loggingObjectPath,
1677 opLoggingInterface, "CreatePELWithFFDCFiles");
1678
1679 // Set level to Warning (Predictive).
1680 auto level =
1681 sdbusplus::xyz::openbmc_project::Logging::server::convertForMessage(
1682 sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level::
1683 Warning);
1684
1685 method.append(d.path, level, additionalData, ffdc);
1686 bus.call(method);
1687 }
1688 catch (const sdbusplus::exception_t& e)
1689 {
1690 lg2::error("Failed to create MISSING_OCC_SENSORS PEL: {ERROR}", "ERROR",
1691 e.what());
1692 }
1693 }
1694 #endif // PLDM
1695 #endif // POWER10
1696
1697 // Verify single master OCC and start presence monitor
validateOccMaster()1698 void Manager::validateOccMaster()
1699 {
1700 int masterInstance = -1;
1701 for (auto& obj : statusObjects)
1702 {
1703 auto instance = obj->getOccInstanceID();
1704 #ifdef POWER10
1705 if (!obj->occActive())
1706 {
1707 if (utils::isHostRunning())
1708 {
1709 // Check if sensor was queued while waiting for discovery
1710 auto match = queuedActiveState.find(instance);
1711 if (match != queuedActiveState.end())
1712 {
1713 queuedActiveState.erase(match);
1714 lg2::info("validateOccMaster: OCC{INST} is ACTIVE (queued)",
1715 "INST", instance);
1716 obj->occActive(true);
1717 }
1718 else
1719 {
1720 // OCC does not appear to be active yet, check active sensor
1721 #ifdef PLDM
1722 pldmHandle->checkActiveSensor(instance);
1723 #endif
1724 if (obj->occActive())
1725 {
1726 lg2::info(
1727 "validateOccMaster: OCC{INST} is ACTIVE after reading sensor",
1728 "INST", instance);
1729 }
1730 }
1731 }
1732 else
1733 {
1734 lg2::warning(
1735 "validateOccMaster: HOST is not running (OCC{INST})",
1736 "INST", instance);
1737 return;
1738 }
1739 }
1740 #endif // POWER10
1741
1742 if (obj->isMasterOcc())
1743 {
1744 obj->addPresenceWatchMaster();
1745
1746 if (masterInstance == -1)
1747 {
1748 masterInstance = instance;
1749 }
1750 else
1751 {
1752 lg2::error(
1753 "validateOccMaster: Multiple OCC masters! ({MAST1} and {MAST2})",
1754 "MAST1", masterInstance, "MAST2", instance);
1755 // request reset
1756 obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
1757 }
1758 }
1759 }
1760
1761 if (masterInstance < 0)
1762 {
1763 lg2::error("validateOccMaster: Master OCC not found! (of {NUM} OCCs)",
1764 "NUM", statusObjects.size());
1765 // request reset
1766 statusObjects.front()->deviceError(
1767 Error::Descriptor(PRESENCE_ERROR_PATH));
1768 }
1769 else
1770 {
1771 lg2::info("validateOccMaster: OCC{INST} is master of {COUNT} OCCs",
1772 "INST", masterInstance, "COUNT", activeCount);
1773 #ifdef POWER10
1774 pmode->updateDbusSafeMode(false);
1775 #endif
1776 }
1777 }
1778
updatePcapBounds() const1779 void Manager::updatePcapBounds() const
1780 {
1781 if (pcap)
1782 {
1783 pcap->updatePcapBounds();
1784 }
1785 }
1786
1787 } // namespace occ
1788 } // namespace open_power
1789