1 #include "config.h"
2
3 #include "occ_manager.hpp"
4
5 #include "occ_dbus.hpp"
6 #include "occ_errors.hpp"
7 #include "utils.hpp"
8
9 #include <nlohmann/json.hpp>
10 #include <phosphor-logging/elog-errors.hpp>
11 #include <phosphor-logging/lg2.hpp>
12 #include <xyz/openbmc_project/Common/error.hpp>
13
#include <cerrno>
#include <chrono>
#include <cmath>
#include <filesystem>
#include <fstream>
#include <regex>
#include <string_view>
19
20 namespace open_power
21 {
22 namespace occ
23 {
24
25 constexpr uint32_t fruTypeNotAvailable = 0xFF;
26 constexpr auto fruTypeSuffix = "fru_type";
27 constexpr auto faultSuffix = "fault";
28 constexpr auto inputSuffix = "input";
29 constexpr auto maxSuffix = "max";
30
31 const auto HOST_ON_FILE = "/run/openbmc/host@0-on";
32 const std::string Manager::dumpFile = "/tmp/occ_control_dump.json";
33
34 using namespace phosphor::logging;
35 using namespace std::literals::chrono_literals;
36 using json = nlohmann::json;
37
// Read one value of type T from the file at 'path' using operator>>.
//
// The stream is configured to throw on failbit, badbit and eofbit, so a
// missing file, a read error, unparsable content, or hitting end-of-file
// while extracting the value are all reported as failures.
//
// @param[in] path - file to read
// @return the extracted value
// @throws std::system_error (generic category) carrying the errno recorded
//         for the failure, or EIO when the failure did not set errno.
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit |
                   std::ifstream::eofbit);
    T data;

    // Start from a clean errno: a stale value left behind by an earlier,
    // unrelated call must not be reported as the cause of a failure here
    // (callers act on specific codes such as EAGAIN).
    errno = 0;

    try
    {
        ifs.open(path);
        ifs >> data;
        ifs.close();
    }
    catch (const std::exception&)
    {
        // Failures that are not backed by a system call error (for example
        // content that cannot be parsed as T) leave errno at 0; report those
        // as EIO so the thrown error never carries a "success" code.
        const int err = (errno != 0) ? errno : EIO;
        throw std::system_error(err, std::generic_category());
    }

    return data;
}
60
createPldmHandle()61 void Manager::createPldmHandle()
62 {
63 pldmHandle = std::make_unique<pldm::Interface>(
64 std::bind(std::mem_fn(&Manager::updateOCCActive), this,
65 std::placeholders::_1, std::placeholders::_2),
66 std::bind(std::mem_fn(&Manager::sbeHRESETResult), this,
67 std::placeholders::_1, std::placeholders::_2),
68 std::bind(std::mem_fn(&Manager::updateOccSafeMode), this,
69 std::placeholders::_1),
70 std::bind(std::mem_fn(&Manager::hostPoweredOff), this), event);
71 }
72
73 // findAndCreateObjects():
74 // Takes care of getting the required objects created and
75 // finds the available devices/processors.
76 // (function is called everytime the discoverTimer expires)
77 // - create the PowerMode object to control OCC modes
78 // - create statusObjects for each OCC device found
79 // - waits for OCC Active sensors PDRs to become available
80 // - restart discoverTimer if all data is not available yet
findAndCreateObjects()81 void Manager::findAndCreateObjects()
82 {
83 if (!pmode)
84 {
85 // Create the power mode object
86 pmode = std::make_unique<powermode::PowerMode>(
87 *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
88 }
89
90 if (!fs::exists(HOST_ON_FILE))
91 {
92 static bool statusObjCreated = false;
93 if (!statusObjCreated)
94 {
95 // Create the OCCs based on on the /dev/occX devices
96 auto occs = findOCCsInDev();
97
98 if (occs.empty() || (prevOCCSearch.size() != occs.size()))
99 {
100 // Something changed or no OCCs yet, try again in 10s.
101 // Note on the first pass prevOCCSearch will be empty,
102 // so there will be at least one delay to give things
103 // a chance to settle.
104 prevOCCSearch = occs;
105
106 lg2::info(
107 "Manager::findAndCreateObjects(): Waiting for OCCs (currently {QTY})",
108 "QTY", occs.size());
109
110 discoverTimer->restartOnce(10s);
111 }
112 else
113 {
114 // All OCCs appear to be available, create status objects
115
116 // createObjects requires OCC0 first.
117 std::sort(occs.begin(), occs.end());
118
119 lg2::info(
120 "Manager::findAndCreateObjects(): Creating {QTY} OCC Status Objects",
121 "QTY", occs.size());
122 for (auto id : occs)
123 {
124 createObjects(std::string(OCC_NAME) + std::to_string(id));
125 }
126 statusObjCreated = true;
127 waitingForAllOccActiveSensors = true;
128
129 // Find/update the processor path associated with each OCC
130 for (auto& obj : statusObjects)
131 {
132 obj->updateProcAssociation();
133 }
134 }
135 }
136
137 if (statusObjCreated && waitingForAllOccActiveSensors)
138 {
139 static bool tracedHostWait = false;
140 if (utils::isHostRunning())
141 {
142 if (tracedHostWait)
143 {
144 lg2::info(
145 "Manager::findAndCreateObjects(): Host is running");
146 tracedHostWait = false;
147 }
148 checkAllActiveSensors();
149 }
150 else
151 {
152 if (!tracedHostWait)
153 {
154 lg2::info(
155 "Manager::findAndCreateObjects(): Waiting for host to start");
156 tracedHostWait = true;
157 }
158 discoverTimer->restartOnce(30s);
159
160 if (throttlePldmTraceTimer->isEnabled())
161 {
162 // Host is no longer running, disable throttle timer and
163 // make sure traces are not throttled
164 lg2::info("findAndCreateObjects(): disabling sensor timer");
165 throttlePldmTraceTimer->setEnabled(false);
166 pldmHandle->setTraceThrottle(false);
167 }
168 }
169 }
170 }
171 else
172 {
173 lg2::info(
174 "Manager::findAndCreateObjects(): Waiting for {FILE} to complete...",
175 "FILE", HOST_ON_FILE);
176 discoverTimer->restartOnce(10s);
177 }
178 }
179
180 // Check if all occActive sensors are available
checkAllActiveSensors()181 void Manager::checkAllActiveSensors()
182 {
183 static bool allActiveSensorAvailable = false;
184 static bool tracedSensorWait = false;
185 static bool waitingForHost = false;
186
187 if (open_power::occ::utils::isHostRunning())
188 {
189 if (waitingForHost)
190 {
191 waitingForHost = false;
192 lg2::info("checkAllActiveSensors(): Host is now running");
193 }
194
195 // Start with the assumption that all are available
196 allActiveSensorAvailable = true;
197 for (auto& obj : statusObjects)
198 {
199 if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
200 {
201 auto instance = obj->getOccInstanceID();
202 // Check if sensor was queued while waiting for discovery
203 auto match = queuedActiveState.find(instance);
204 if (match != queuedActiveState.end())
205 {
206 queuedActiveState.erase(match);
207 lg2::info(
208 "checkAllActiveSensors(): OCC{INST} is ACTIVE (queued)",
209 "INST", instance);
210 obj->occActive(true);
211 }
212 else
213 {
214 allActiveSensorAvailable = false;
215 if (!tracedSensorWait)
216 {
217 lg2::info(
218 "checkAllActiveSensors(): Waiting on OCC{INST} Active sensor",
219 "INST", instance);
220 tracedSensorWait = true;
221 // Make sure PLDM traces are not throttled
222 pldmHandle->setTraceThrottle(false);
223 // Start timer to throttle PLDM traces when timer
224 // expires
225 onPldmTimeoutCreatePel = false;
226 throttlePldmTraceTimer->restartOnce(5min);
227 }
228 // Ignore active sensor check if the OCCs are being reset
229 if (!resetInProgress)
230 {
231 pldmHandle->checkActiveSensor(obj->getOccInstanceID());
232 }
233 break;
234 }
235 }
236 }
237 }
238 else
239 {
240 if (!waitingForHost)
241 {
242 waitingForHost = true;
243 lg2::info("checkAllActiveSensors(): Waiting for host to start");
244 if (throttlePldmTraceTimer->isEnabled())
245 {
246 // Host is no longer running, disable throttle timer and
247 // make sure traces are not throttled
248 lg2::info("checkAllActiveSensors(): disabling sensor timer");
249 throttlePldmTraceTimer->setEnabled(false);
250 pldmHandle->setTraceThrottle(false);
251 }
252 }
253 }
254
255 if (allActiveSensorAvailable)
256 {
257 // All sensors were found, disable the discovery timer
258 if (discoverTimer->isEnabled())
259 {
260 discoverTimer->setEnabled(false);
261 }
262 if (throttlePldmTraceTimer->isEnabled())
263 {
264 // Disable throttle timer and make sure traces are not throttled
265 throttlePldmTraceTimer->setEnabled(false);
266 pldmHandle->setTraceThrottle(false);
267 }
268 if (waitingForAllOccActiveSensors)
269 {
270 lg2::info(
271 "checkAllActiveSensors(): OCC Active sensors are available");
272 waitingForAllOccActiveSensors = false;
273
274 if (resetRequired)
275 {
276 initiateOccRequest(resetInstance);
277
278 if (!waitForAllOccsTimer->isEnabled())
279 {
280 lg2::warning(
281 "occsNotAllRunning: Restarting waitForAllOccTimer");
282 // restart occ wait timer to check status after reset
283 // completes
284 waitForAllOccsTimer->restartOnce(60s);
285 }
286 }
287 }
288 queuedActiveState.clear();
289 tracedSensorWait = false;
290 }
291 else
292 {
293 // Not all sensors were available, so keep waiting
294 if (!tracedSensorWait)
295 {
296 lg2::info(
297 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
298 tracedSensorWait = true;
299 }
300 discoverTimer->restartOnce(10s);
301 }
302 }
303
findOCCsInDev()304 std::vector<int> Manager::findOCCsInDev()
305 {
306 std::vector<int> occs;
307 std::regex expr{R"(occ(\d+)$)"};
308
309 for (auto& file : fs::directory_iterator("/dev"))
310 {
311 std::smatch match;
312 std::string path{file.path().string()};
313 if (std::regex_search(path, match, expr))
314 {
315 auto num = std::stoi(match[1].str());
316
317 // /dev numbering starts at 1, ours starts at 0.
318 occs.push_back(num - 1);
319 }
320 }
321
322 return occs;
323 }
324
cpuCreated(sdbusplus::message_t & msg)325 int Manager::cpuCreated(sdbusplus::message_t& msg)
326 {
327 namespace fs = std::filesystem;
328
329 sdbusplus::message::object_path o;
330 msg.read(o);
331 fs::path cpuPath(std::string(std::move(o)));
332
333 auto name = cpuPath.filename().string();
334 auto index = name.find(CPU_NAME);
335 name.replace(index, std::strlen(CPU_NAME), OCC_NAME);
336
337 createObjects(name);
338
339 return 0;
340 }
341
createObjects(const std::string & occ)342 void Manager::createObjects(const std::string& occ)
343 {
344 auto path = fs::path(OCC_CONTROL_ROOT) / occ;
345
346 statusObjects.emplace_back(std::make_unique<Status>(
347 event, path.c_str(), *this, pmode,
348 std::bind(std::mem_fn(&Manager::statusCallBack), this,
349 std::placeholders::_1, std::placeholders::_2),
350 // Callback will set flag indicating reset needs to be done
351 // instead of immediately issuing a reset via PLDM.
352 std::bind(std::mem_fn(&Manager::resetOccRequest), this,
353 std::placeholders::_1)));
354
355 // Create the power cap monitor object
356 if (!pcap)
357 {
358 pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
359 *statusObjects.back());
360 }
361
362 if (statusObjects.back()->isMasterOcc())
363 {
364 lg2::info("Manager::createObjects(): OCC{INST} is the master", "INST",
365 statusObjects.back()->getOccInstanceID());
366 _pollTimer->setEnabled(false);
367
368 // Set the master OCC on the PowerMode object
369 pmode->setMasterOcc(path);
370 }
371
372 passThroughObjects.emplace_back(
373 std::make_unique<PassThrough>(path.c_str(), pmode));
374 }
375
376 // If a reset is not already outstanding, set a flag to indicate that a reset is
377 // needed.
resetOccRequest(instanceID instance)378 void Manager::resetOccRequest(instanceID instance)
379 {
380 if (!resetRequired)
381 {
382 resetRequired = true;
383 resetInstance = instance;
384 lg2::error(
385 "resetOccRequest: PM Complex reset was requested due to OCC{INST}",
386 "INST", instance);
387 }
388 else if (instance != resetInstance)
389 {
390 lg2::warning(
391 "resetOccRequest: Ignoring PM Complex reset request for OCC{INST}, because reset already outstanding for OCC{RINST}",
392 "INST", instance, "RINST", resetInstance);
393 }
394 }
395
396 // If a reset has not been started, initiate an OCC reset via PLDM
initiateOccRequest(instanceID instance)397 void Manager::initiateOccRequest(instanceID instance)
398 {
399 if (!resetInProgress)
400 {
401 resetInProgress = true;
402 resetInstance = instance;
403 lg2::error(
404 "initiateOccRequest: Initiating PM Complex reset due to OCC{INST}",
405 "INST", instance);
406
407 // Make sure ALL OCC comm stops to all OCCs before the reset
408 for (auto& obj : statusObjects)
409 {
410 if (obj->occActive())
411 {
412 obj->occActive(false);
413 }
414 }
415
416 pldmHandle->resetOCC(instance);
417 resetRequired = false;
418 }
419 else
420 {
421 lg2::warning(
422 "initiateOccRequest: Ignoring PM Complex reset request for OCC{INST}, because reset already in process for OCC{RINST}",
423 "INST", instance, "RINST", resetInstance);
424 }
425 }
426
statusCallBack(instanceID instance,bool status)427 void Manager::statusCallBack(instanceID instance, bool status)
428 {
429 if (status == true)
430 {
431 if (resetInProgress)
432 {
433 lg2::info(
434 "statusCallBack: Ignoring OCC{INST} activate because a reset has been initiated due to OCC{RINST}",
435 "INST", instance, "RINST", resetInstance);
436 return;
437 }
438
439 // OCC went active
440 ++activeCount;
441
442 if (activeCount == 1)
443 {
444 // First OCC went active (allow some time for all OCCs to go active)
445 waitForAllOccsTimer->restartOnce(60s);
446 }
447
448 if (activeCount == statusObjects.size())
449 {
450 // All OCCs are now running
451 if (waitForAllOccsTimer->isEnabled())
452 {
453 // stop occ wait timer
454 waitForAllOccsTimer->setEnabled(false);
455 }
456
457 // All OCCs have been found, check if we need a reset
458 if (resetRequired)
459 {
460 initiateOccRequest(resetInstance);
461
462 if (!waitForAllOccsTimer->isEnabled())
463 {
464 lg2::warning(
465 "occsNotAllRunning: Restarting waitForAllOccTimer");
466 // restart occ wait timer
467 waitForAllOccsTimer->restartOnce(60s);
468 }
469 }
470 else
471 {
472 // Verify master OCC and start presence monitor
473 validateOccMaster();
474 }
475 }
476
477 // Start poll timer if not already started (since at least one OCC is
478 // running)
479 if (!_pollTimer->isEnabled())
480 {
481 // An OCC just went active, PM Complex is just coming online so
482 // clear any outstanding reset requests
483 if (resetRequired)
484 {
485 resetRequired = false;
486 lg2::error(
487 "statusCallBack: clearing resetRequired (since OCC{INST} went active, resetInProgress={RIP})",
488 "INST", instance, "RIP", resetInProgress);
489 }
490
491 lg2::info("Manager: OCCs will be polled every {TIME} seconds",
492 "TIME", pollInterval);
493
494 // Send poll and start OCC poll timer
495 pollerTimerExpired();
496 }
497 }
498 else
499 {
500 // OCC went away
501 if (activeCount > 0)
502 {
503 --activeCount;
504 }
505 else
506 {
507 lg2::info("OCC{INST} disabled, and no other OCCs are active",
508 "INST", instance);
509 }
510
511 if (activeCount == 0)
512 {
513 // No OCCs are running
514
515 if (resetInProgress)
516 {
517 // All OCC active sensors are clear (reset should be in
518 // progress)
519 lg2::info(
520 "statusCallBack: Clearing resetInProgress (activeCount={COUNT}, OCC{INST}, status={STATUS})",
521 "COUNT", activeCount, "INST", instance, "STATUS", status);
522 resetInProgress = false;
523 resetInstance = 255;
524 }
525
526 // Stop OCC poll timer
527 if (_pollTimer->isEnabled())
528 {
529 lg2::info(
530 "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
531 _pollTimer->setEnabled(false);
532 }
533
534 // stop wait timer
535 if (waitForAllOccsTimer->isEnabled())
536 {
537 waitForAllOccsTimer->setEnabled(false);
538 }
539 }
540 else if (resetInProgress)
541 {
542 lg2::info(
543 "statusCallBack: Skipping clear of resetInProgress (activeCount={COUNT}, OCC{INST}, status={STATUS})",
544 "COUNT", activeCount, "INST", instance, "STATUS", status);
545 }
546 // Clear OCC sensors
547 setSensorValueToNaN(instance);
548 }
549
550 if (waitingForAllOccActiveSensors)
551 {
552 if (utils::isHostRunning())
553 {
554 checkAllActiveSensors();
555 }
556 }
557 }
558
sbeTimeout(unsigned int instance)559 void Manager::sbeTimeout(unsigned int instance)
560 {
561 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
562 [instance](const auto& obj) {
563 return instance == obj->getOccInstanceID();
564 });
565
566 if (obj != statusObjects.end() && (*obj)->occActive())
567 {
568 lg2::info("SBE timeout, requesting HRESET (OCC{INST})", "INST",
569 instance);
570
571 #ifdef PHAL_SUPPORT
572 setSBEState(instance, SBE_STATE_NOT_USABLE);
573 #endif
574
575 // Stop communication with this OCC
576 (*obj)->occActive(false);
577
578 pldmHandle->sendHRESET(instance);
579 }
580 }
581
updateOCCActive(instanceID instance,bool status)582 bool Manager::updateOCCActive(instanceID instance, bool status)
583 {
584 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
585 [instance](const auto& obj) {
586 return instance == obj->getOccInstanceID();
587 });
588
589 const bool hostRunning = open_power::occ::utils::isHostRunning();
590 if (obj != statusObjects.end())
591 {
592 if (!hostRunning && (status == true))
593 {
594 lg2::warning(
595 "updateOCCActive: Host is not running yet (OCC{INST} active={STAT}), clearing sensor received",
596 "INST", instance, "STAT", status);
597 (*obj)->setPldmSensorReceived(false);
598 if (!waitingForAllOccActiveSensors)
599 {
600 lg2::info(
601 "updateOCCActive: Waiting for Host and all OCC Active Sensors");
602 waitingForAllOccActiveSensors = true;
603 }
604 discoverTimer->restartOnce(30s);
605 return false;
606 }
607 else
608 {
609 (*obj)->setPldmSensorReceived(true);
610 return (*obj)->occActive(status);
611 }
612 }
613 else
614 {
615 if (hostRunning)
616 {
617 lg2::warning(
618 "updateOCCActive: No status object to update for OCC{INST} (active={STAT})",
619 "INST", instance, "STAT", status);
620 }
621 else
622 {
623 if (status == true)
624 {
625 lg2::warning(
626 "updateOCCActive: No status objects and Host is not running yet (OCC{INST} active={STAT})",
627 "INST", instance, "STAT", status);
628 }
629 }
630 if (status == true)
631 {
632 // OCC went active
633 queuedActiveState.insert(instance);
634 }
635 else
636 {
637 auto match = queuedActiveState.find(instance);
638 if (match != queuedActiveState.end())
639 {
640 // OCC was disabled
641 queuedActiveState.erase(match);
642 }
643 }
644 return false;
645 }
646 }
647
648 // Called upon pldm event To set powermode Safe Mode State for system.
updateOccSafeMode(bool safeMode)649 void Manager::updateOccSafeMode(bool safeMode)
650 {
651 pmode->updateDbusSafeMode(safeMode);
652 // Update the processor throttle status on dbus
653 for (auto& obj : statusObjects)
654 {
655 obj->updateThrottle(safeMode, THROTTLED_SAFE);
656 }
657 }
658
sbeHRESETResult(instanceID instance,bool success)659 void Manager::sbeHRESETResult(instanceID instance, bool success)
660 {
661 if (success)
662 {
663 lg2::info("HRESET succeeded (OCC{INST})", "INST", instance);
664
665 #ifdef PHAL_SUPPORT
666 setSBEState(instance, SBE_STATE_BOOTED);
667 #endif
668
669 // Re-enable communication with this OCC
670 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
671 [instance](const auto& obj) {
672 return instance == obj->getOccInstanceID();
673 });
674 if (obj != statusObjects.end() && (!(*obj)->occActive()))
675 {
676 (*obj)->occActive(true);
677 }
678
679 return;
680 }
681
682 #ifdef PHAL_SUPPORT
683 setSBEState(instance, SBE_STATE_FAILED);
684
685 if (sbeCanDump(instance))
686 {
687 lg2::info("HRESET failed (OCC{INST}), triggering SBE dump", "INST",
688 instance);
689
690 auto& bus = utils::getBus();
691 uint32_t src6 = instance << 16;
692 uint32_t logId =
693 FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
694 src6, "SBE command timeout");
695
696 try
697 {
698 constexpr auto interface = "xyz.openbmc_project.Dump.Create";
699 constexpr auto function = "CreateDump";
700
701 std::string service =
702 utils::getService(OP_DUMP_OBJ_PATH, interface);
703 auto method = bus.new_method_call(service.c_str(), OP_DUMP_OBJ_PATH,
704 interface, function);
705
706 std::map<std::string, std::variant<std::string, uint64_t>>
707 createParams{
708 {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
709 uint64_t(logId)},
710 {"com.ibm.Dump.Create.CreateParameters.DumpType",
711 "com.ibm.Dump.Create.DumpType.SBE"},
712 {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
713 uint64_t(instance)},
714 };
715
716 method.append(createParams);
717
718 auto response = bus.call(method);
719 }
720 catch (const sdbusplus::exception_t& e)
721 {
722 constexpr auto ERROR_DUMP_DISABLED =
723 "xyz.openbmc_project.Dump.Create.Error.Disabled";
724 if (e.name() == ERROR_DUMP_DISABLED)
725 {
726 lg2::info("Dump is disabled, skipping");
727 }
728 else
729 {
730 lg2::error("Dump failed");
731 }
732 }
733 }
734 #endif
735
736 // SBE Reset failed, try PM Complex reset
737 lg2::error("sbeHRESETResult: Forcing PM Complex reset");
738 resetOccRequest(instance);
739 }
740
741 #ifdef PHAL_SUPPORT
sbeCanDump(unsigned int instance)742 bool Manager::sbeCanDump(unsigned int instance)
743 {
744 struct pdbg_target* proc = getPdbgTarget(instance);
745
746 if (!proc)
747 {
748 // allow the dump in the error case
749 return true;
750 }
751
752 try
753 {
754 if (!openpower::phal::sbe::isDumpAllowed(proc))
755 {
756 return false;
757 }
758
759 if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
760 {
761 return false;
762 }
763 }
764 catch (openpower::phal::exception::SbeError& e)
765 {
766 lg2::info("Failed to query SBE state");
767 }
768
769 // allow the dump in the error case
770 return true;
771 }
772
setSBEState(unsigned int instance,enum sbe_state state)773 void Manager::setSBEState(unsigned int instance, enum sbe_state state)
774 {
775 struct pdbg_target* proc = getPdbgTarget(instance);
776
777 if (!proc)
778 {
779 return;
780 }
781
782 try
783 {
784 openpower::phal::sbe::setState(proc, state);
785 }
786 catch (const openpower::phal::exception::SbeError& e)
787 {
788 lg2::error("Failed to set SBE state: {ERROR}", "ERROR", e.what());
789 }
790 }
791
getPdbgTarget(unsigned int instance)792 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
793 {
794 if (!pdbgInitialized)
795 {
796 try
797 {
798 openpower::phal::pdbg::init();
799 pdbgInitialized = true;
800 }
801 catch (const openpower::phal::exception::PdbgError& e)
802 {
803 lg2::error("pdbg initialization failed");
804 return nullptr;
805 }
806 }
807
808 struct pdbg_target* proc = nullptr;
809 pdbg_for_each_class_target("proc", proc)
810 {
811 if (pdbg_target_index(proc) == instance)
812 {
813 return proc;
814 }
815 }
816
817 lg2::error("Failed to get pdbg target");
818 return nullptr;
819 }
820 #endif
821
pollerTimerExpired()822 void Manager::pollerTimerExpired()
823 {
824 if (!_pollTimer)
825 {
826 lg2::error("pollerTimerExpired() ERROR: Timer not defined");
827 return;
828 }
829
830 if (resetRequired)
831 {
832 lg2::error("pollerTimerExpired() - Initiating PM Complex reset");
833 initiateOccRequest(resetInstance);
834
835 if (!waitForAllOccsTimer->isEnabled())
836 {
837 lg2::warning("pollerTimerExpired: Restarting waitForAllOccTimer");
838 // restart occ wait timer
839 waitForAllOccsTimer->restartOnce(60s);
840 }
841 return;
842 }
843
844 for (auto& obj : statusObjects)
845 {
846 if (!obj->occActive())
847 {
848 // OCC is not running yet
849 auto id = obj->getOccInstanceID();
850 setSensorValueToNaN(id);
851 continue;
852 }
853
854 // Read sysfs to force kernel to poll OCC
855 obj->readOccState();
856
857 // Read occ sensor values
858 getSensorValues(obj);
859 }
860
861 if (activeCount > 0)
862 {
863 // Restart OCC poll timer
864 _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
865 }
866 else
867 {
868 // No OCCs running, so poll timer will not be restarted
869 lg2::info(
870 "Manager::pollerTimerExpired: poll timer will not be restarted");
871 }
872 }
873
readTempSensors(const fs::path & path,uint32_t occInstance)874 void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
875 {
876 // There may be more than one sensor with the same FRU type
877 // and label so make two passes: the first to read the temps
878 // from sysfs, and the second to put them on D-Bus after
879 // resolving any conflicts.
880 std::map<std::string, double> sensorData;
881
882 std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
883 for (auto& file : fs::directory_iterator(path))
884 {
885 if (!std::regex_search(file.path().string(), expr))
886 {
887 continue;
888 }
889
890 uint32_t labelValue{0};
891
892 try
893 {
894 labelValue = readFile<uint32_t>(file.path());
895 }
896 catch (const std::system_error& e)
897 {
898 lg2::debug(
899 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
900 "PATH", file.path().string(), "ERROR", e.code().value());
901 continue;
902 }
903
904 const std::string& tempLabel = "label";
905 const std::string filePathString = file.path().string().substr(
906 0, file.path().string().length() - tempLabel.length());
907
908 uint32_t fruTypeValue{0};
909 try
910 {
911 fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
912 }
913 catch (const std::system_error& e)
914 {
915 lg2::debug(
916 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
917 "PATH", filePathString + fruTypeSuffix, "ERROR",
918 e.code().value());
919 continue;
920 }
921
922 std::string sensorPath =
923 OCC_SENSORS_ROOT + std::string("/temperature/");
924
925 std::string dvfsTempPath;
926
927 if (fruTypeValue == VRMVdd)
928 {
929 sensorPath.append(
930 "vrm_vdd" + std::to_string(occInstance) + "_temp");
931 }
932 else if (fruTypeValue == processorIoRing)
933 {
934 sensorPath.append(
935 "proc" + std::to_string(occInstance) + "_ioring_temp");
936 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
937 std::to_string(occInstance) + "_ioring_dvfs_temp";
938 }
939 else
940 {
941 uint16_t type = (labelValue & 0xFF000000) >> 24;
942 uint16_t instanceID = labelValue & 0x0000FFFF;
943
944 if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
945 {
946 if (fruTypeValue == fruTypeNotAvailable)
947 {
948 // Not all DIMM related temps are available to read
949 // (no _input file in this case)
950 continue;
951 }
952 auto iter = dimmTempSensorName.find(fruTypeValue);
953 if (iter == dimmTempSensorName.end())
954 {
955 lg2::error(
956 "readTempSensors: Fru type error! fruTypeValue = {FRU}) ",
957 "FRU", fruTypeValue);
958 continue;
959 }
960
961 sensorPath.append(
962 "dimm" + std::to_string(instanceID) + iter->second);
963
964 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
965 dimmDVFSSensorName.at(fruTypeValue);
966 }
967 else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
968 {
969 if (fruTypeValue == processorCore)
970 {
971 // The OCC reports small core temps, of which there are
972 // two per big core. All current P10 systems are in big
973 // core mode, so use a big core name.
974 uint16_t coreNum = instanceID / 2;
975 uint16_t tempNum = instanceID % 2;
976 sensorPath.append("proc" + std::to_string(occInstance) +
977 "_core" + std::to_string(coreNum) + "_" +
978 std::to_string(tempNum) + "_temp");
979
980 dvfsTempPath =
981 std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
982 std::to_string(occInstance) + "_core_dvfs_temp";
983 }
984 else
985 {
986 continue;
987 }
988 }
989 else
990 {
991 continue;
992 }
993 }
994
995 // The dvfs temp file only needs to be read once per chip per type.
996 if (!dvfsTempPath.empty() &&
997 !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
998 {
999 try
1000 {
1001 auto dvfsValue = readFile<double>(filePathString + maxSuffix);
1002
1003 dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
1004 dvfsTempPath, dvfsValue * std::pow(10, -3));
1005 }
1006 catch (const std::system_error& e)
1007 {
1008 lg2::debug(
1009 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1010 "PATH", filePathString + maxSuffix, "ERROR",
1011 e.code().value());
1012 }
1013 }
1014
1015 uint32_t faultValue{0};
1016 try
1017 {
1018 faultValue = readFile<uint32_t>(filePathString + faultSuffix);
1019 }
1020 catch (const std::system_error& e)
1021 {
1022 lg2::debug(
1023 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1024 "PATH", filePathString + faultSuffix, "ERROR",
1025 e.code().value());
1026 continue;
1027 }
1028
1029 double tempValue{0};
1030 // NOTE: if OCC sends back 0xFF, kernal sets this fault value to 1.
1031 if (faultValue != 0)
1032 {
1033 tempValue = std::numeric_limits<double>::quiet_NaN();
1034 }
1035 else
1036 {
1037 // Read the temperature
1038 try
1039 {
1040 tempValue = readFile<double>(filePathString + inputSuffix);
1041 }
1042 catch (const std::system_error& e)
1043 {
1044 lg2::debug(
1045 "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
1046 "PATH", filePathString + inputSuffix, "ERROR",
1047 e.code().value());
1048
1049 // if errno == EAGAIN(Resource temporarily unavailable) then set
1050 // temp to 0, to avoid using old temp, and affecting FAN
1051 // Control.
1052 if (e.code().value() == EAGAIN)
1053 {
1054 tempValue = 0;
1055 }
1056 // else the errno would be something like
1057 // EBADF(Bad file descriptor)
1058 // or ENOENT(No such file or directory)
1059 else
1060 {
1061 continue;
1062 }
1063 }
1064 }
1065
1066 // If this object path already has a value, only overwite
1067 // it if the previous one was an NaN or a smaller value.
1068 auto existing = sensorData.find(sensorPath);
1069 if (existing != sensorData.end())
1070 {
1071 // Multiple sensors found for this FRU type
1072 if ((std::isnan(existing->second) && (tempValue == 0)) ||
1073 ((existing->second == 0) && std::isnan(tempValue)))
1074 {
1075 // One of the redundant sensors has failed (0xFF/nan), and the
1076 // other sensor has no reading (0), so set the FRU to NaN to
1077 // force fan increase
1078 tempValue = std::numeric_limits<double>::quiet_NaN();
1079 existing->second = tempValue;
1080 }
1081 if (std::isnan(existing->second) || (tempValue > existing->second))
1082 {
1083 existing->second = tempValue;
1084 }
1085 }
1086 else
1087 {
1088 // First sensor for this FRU type
1089 sensorData[sensorPath] = tempValue;
1090 }
1091 }
1092
1093 // Now publish the values on D-Bus.
1094 for (const auto& [objectPath, value] : sensorData)
1095 {
1096 dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
1097 value * std::pow(10, -3));
1098
1099 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1100 objectPath, !std::isnan(value));
1101
1102 if (existingSensors.find(objectPath) == existingSensors.end())
1103 {
1104 dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1105 objectPath, {"all_sensors"});
1106 }
1107 existingSensors[objectPath] = occInstance;
1108 }
1109 }
1110
getPowerLabelFunctionID(const std::string & value)1111 std::optional<std::string> Manager::getPowerLabelFunctionID(
1112 const std::string& value)
1113 {
1114 // If the value is "system", then the FunctionID is "system".
1115 if (value == "system")
1116 {
1117 return value;
1118 }
1119
1120 // If the value is not "system", then the label value have 3 numbers, of
1121 // which we only care about the middle one:
1122 // <sensor id>_<function id>_<apss channel>
1123 // eg: The value is "0_10_5" , then the FunctionID is "10".
1124 if (value.find("_") == std::string::npos)
1125 {
1126 return std::nullopt;
1127 }
1128
1129 auto powerLabelValue = value.substr((value.find("_") + 1));
1130
1131 if (powerLabelValue.find("_") == std::string::npos)
1132 {
1133 return std::nullopt;
1134 }
1135
1136 return powerLabelValue.substr(0, powerLabelValue.find("_"));
1137 }
1138
readPowerSensors(const fs::path & path,uint32_t id)1139 void Manager::readPowerSensors(const fs::path& path, uint32_t id)
1140 {
1141 std::regex expr{"power\\d+_label$"}; // Example: power5_label
1142 for (auto& file : fs::directory_iterator(path))
1143 {
1144 if (!std::regex_search(file.path().string(), expr))
1145 {
1146 continue;
1147 }
1148
1149 std::string labelValue;
1150 try
1151 {
1152 labelValue = readFile<std::string>(file.path());
1153 }
1154 catch (const std::system_error& e)
1155 {
1156 lg2::debug(
1157 "readPowerSensors: Failed reading {PATH}, errno = {ERROR}",
1158 "PATH", file.path().string(), "ERROR", e.code().value());
1159 continue;
1160 }
1161
1162 auto functionID = getPowerLabelFunctionID(labelValue);
1163 if (functionID == std::nullopt)
1164 {
1165 continue;
1166 }
1167
1168 const std::string& tempLabel = "label";
1169 const std::string filePathString = file.path().string().substr(
1170 0, file.path().string().length() - tempLabel.length());
1171
1172 std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
1173
1174 auto iter = powerSensorName.find(*functionID);
1175 if (iter == powerSensorName.end())
1176 {
1177 continue;
1178 }
1179 sensorPath.append(iter->second);
1180
1181 double tempValue{0};
1182
1183 try
1184 {
1185 tempValue = readFile<double>(filePathString + inputSuffix);
1186 }
1187 catch (const std::system_error& e)
1188 {
1189 lg2::debug(
1190 "readPowerSensors: Failed reading {PATH}, errno = {ERROR}",
1191 "PATH", filePathString + inputSuffix, "ERROR",
1192 e.code().value());
1193 continue;
1194 }
1195
1196 dbus::OccDBusSensors::getOccDBus().setUnit(
1197 sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");
1198
1199 dbus::OccDBusSensors::getOccDBus().setValue(
1200 sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3));
1201
1202 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1203 sensorPath, true);
1204
1205 if (existingSensors.find(sensorPath) == existingSensors.end())
1206 {
1207 std::vector<std::string> fTypeList = {"all_sensors"};
1208 if (iter->second == "total_power")
1209 {
1210 // Set sensor purpose as TotalPower
1211 dbus::OccDBusSensors::getOccDBus().setPurpose(
1212 sensorPath,
1213 "xyz.openbmc_project.Sensor.Purpose.SensorPurpose.TotalPower");
1214 }
1215 dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1216 sensorPath, fTypeList);
1217 }
1218 existingSensors[sensorPath] = id;
1219 }
1220 return;
1221 }
1222
readExtnSensors(const fs::path & path,uint32_t id)1223 void Manager::readExtnSensors(const fs::path& path, uint32_t id)
1224 {
1225 std::regex expr{"extn\\d+_label$"}; // Example: extn5_label
1226 for (auto& file : fs::directory_iterator(path))
1227 {
1228 if (!std::regex_search(file.path().string(), expr))
1229 {
1230 continue;
1231 }
1232
1233 // Read in Label value of the sensor from file.
1234 std::string labelValue;
1235 try
1236 {
1237 labelValue = readFile<std::string>(file.path());
1238 }
1239 catch (const std::system_error& e)
1240 {
1241 lg2::debug(
1242 "readExtnSensors:label Failed reading {PATH}, errno = {ERROR}",
1243 "PATH", file.path().string(), "ERROR", e.code().value());
1244 continue;
1245 }
1246 const std::string& tempLabel = "label";
1247 const std::string filePathString = file.path().string().substr(
1248 0, file.path().string().length() - tempLabel.length());
1249
1250 std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
1251
1252 // Labels of EXTN sections from OCC interface Document
1253 // have different formats.
1254 // 0x464d494e : FMIN 0x46444953 : FDIS
1255 // 0x46424153 : FBAS 0x46555400 : FUT
1256 // 0x464d4158 : FMAX 0x434c4950 : CLIP
1257 // 0x4d4f4445 : MODE 0x574f4643 : WOFC
1258 // 0x574f4649 : WOFI 0x5057524d : PWRM
1259 // 0x50575250 : PWRP 0x45525248 : ERRH
1260 // Label indicating byte 5 and 6 is the current (mem,proc) power in
1261 // Watts.
1262 if ((labelValue == EXTN_LABEL_PWRM_MEMORY_POWER) ||
1263 (labelValue == EXTN_LABEL_PWRP_PROCESSOR_POWER))
1264 {
1265 // Build the dbus String for this chiplet power asset.
1266 if (labelValue == EXTN_LABEL_PWRP_PROCESSOR_POWER)
1267 {
1268 labelValue = "_power";
1269 }
1270 else // else EXTN_LABEL_PWRM_MEMORY_POWER
1271 {
1272 labelValue = "_mem_power";
1273 }
1274 sensorPath.append("chiplet" + std::to_string(id) + labelValue);
1275
1276 // Read in data value of the sensor from file.
1277 // Read in as string due to different format of data in sensors.
1278 std::string extnValue;
1279 try
1280 {
1281 extnValue = readFile<std::string>(filePathString + inputSuffix);
1282 }
1283 catch (const std::system_error& e)
1284 {
1285 lg2::debug(
1286 "readExtnSensors:value Failed reading {PATH}, errno = {ERROR}",
1287 "PATH", filePathString + inputSuffix, "ERROR",
1288 e.code().value());
1289 continue;
1290 }
1291
1292 // For Power field, Convert last 4 bytes of hex string into number
1293 // value.
1294 std::stringstream ssData;
1295 ssData << std::hex << extnValue.substr(extnValue.length() - 4);
1296 uint16_t MyHexNumber;
1297 ssData >> MyHexNumber;
1298
1299 // Convert output/DC power to input/AC power in Watts (round up)
1300 MyHexNumber =
1301 std::round(((MyHexNumber / (PS_DERATING_FACTOR / 100.0))));
1302
1303 dbus::OccDBusSensors::getOccDBus().setUnit(
1304 sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");
1305
1306 dbus::OccDBusSensors::getOccDBus().setValue(sensorPath,
1307 MyHexNumber);
1308
1309 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1310 sensorPath, true);
1311
1312 if (existingSensors.find(sensorPath) == existingSensors.end())
1313 {
1314 dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1315 sensorPath, {"all_sensors"});
1316 }
1317
1318 existingSensors[sensorPath] = id;
1319 } // End Extended Power Sensors.
1320 } // End For loop on files for Extended Sensors.
1321 return;
1322 }
1323
setSensorValueToNaN(uint32_t id) const1324 void Manager::setSensorValueToNaN(uint32_t id) const
1325 {
1326 for (const auto& [sensorPath, occId] : existingSensors)
1327 {
1328 if (occId == id)
1329 {
1330 dbus::OccDBusSensors::getOccDBus().setValue(
1331 sensorPath, std::numeric_limits<double>::quiet_NaN());
1332
1333 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1334 sensorPath, true);
1335 }
1336 }
1337 return;
1338 }
1339
setSensorValueToNonFunctional(uint32_t id) const1340 void Manager::setSensorValueToNonFunctional(uint32_t id) const
1341 {
1342 for (const auto& [sensorPath, occId] : existingSensors)
1343 {
1344 if (occId == id)
1345 {
1346 dbus::OccDBusSensors::getOccDBus().setValue(
1347 sensorPath, std::numeric_limits<double>::quiet_NaN());
1348
1349 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1350 sensorPath, false);
1351 }
1352 }
1353 return;
1354 }
1355
getSensorValues(std::unique_ptr<Status> & occ)1356 void Manager::getSensorValues(std::unique_ptr<Status>& occ)
1357 {
1358 static bool tracedError[8] = {0};
1359 const fs::path sensorPath = occ->getHwmonPath();
1360 const uint32_t id = occ->getOccInstanceID();
1361
1362 if (fs::exists(sensorPath))
1363 {
1364 // Read temperature sensors
1365 readTempSensors(sensorPath, id);
1366 // Read Extended sensors
1367 readExtnSensors(sensorPath, id);
1368
1369 if (occ->isMasterOcc())
1370 {
1371 // Read power sensors
1372 readPowerSensors(sensorPath, id);
1373 }
1374 tracedError[id] = false;
1375 }
1376 else
1377 {
1378 if (!tracedError[id])
1379 {
1380 lg2::error(
1381 "Manager::getSensorValues: OCC{INST} sensor path missing: {PATH}",
1382 "INST", id, "PATH", sensorPath);
1383 tracedError[id] = true;
1384 }
1385 }
1386
1387 return;
1388 }
1389
1390 // Read the altitude from DBus
readAltitude()1391 void Manager::readAltitude()
1392 {
1393 static bool traceAltitudeErr = true;
1394
1395 utils::PropertyValue altitudeProperty{};
1396 try
1397 {
1398 altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE,
1399 ALTITUDE_PROP);
1400 auto sensorVal = std::get<double>(altitudeProperty);
1401 if (sensorVal < 0xFFFF)
1402 {
1403 if (sensorVal < 0)
1404 {
1405 altitude = 0;
1406 }
1407 else
1408 {
1409 // Round to nearest meter
1410 altitude = uint16_t(sensorVal + 0.5);
1411 }
1412 lg2::debug("readAltitude: sensor={VALUE} ({ALT}m)", "VALUE",
1413 sensorVal, "ALT", altitude);
1414 traceAltitudeErr = true;
1415 }
1416 else
1417 {
1418 if (traceAltitudeErr)
1419 {
1420 traceAltitudeErr = false;
1421 lg2::debug("Invalid altitude value: {ALT}", "ALT", sensorVal);
1422 }
1423 }
1424 }
1425 catch (const sdbusplus::exception_t& e)
1426 {
1427 if (traceAltitudeErr)
1428 {
1429 traceAltitudeErr = false;
1430 lg2::info("Unable to read Altitude: {ERROR}", "ERROR", e.what());
1431 }
1432 altitude = 0xFFFF; // not available
1433 }
1434 }
1435
1436 // Callback function when ambient temperature changes
ambientCallback(sdbusplus::message_t & msg)1437 void Manager::ambientCallback(sdbusplus::message_t& msg)
1438 {
1439 double currentTemp = 0;
1440 uint8_t truncatedTemp = 0xFF;
1441 std::string msgSensor;
1442 std::map<std::string, std::variant<double>> msgData;
1443 msg.read(msgSensor, msgData);
1444
1445 auto valPropMap = msgData.find(AMBIENT_PROP);
1446 if (valPropMap == msgData.end())
1447 {
1448 lg2::debug("ambientCallback: Unknown ambient property changed");
1449 return;
1450 }
1451 currentTemp = std::get<double>(valPropMap->second);
1452 if (std::isnan(currentTemp))
1453 {
1454 truncatedTemp = 0xFF;
1455 }
1456 else
1457 {
1458 if (currentTemp < 0)
1459 {
1460 truncatedTemp = 0;
1461 }
1462 else
1463 {
1464 // Round to nearest degree C
1465 truncatedTemp = uint8_t(currentTemp + 0.5);
1466 }
1467 }
1468
1469 // If ambient changes, notify OCCs
1470 if (truncatedTemp != ambient)
1471 {
1472 lg2::debug("ambientCallback: Ambient change from {OLD} to {NEW}C",
1473 "OLD", ambient, "NEW", currentTemp);
1474
1475 ambient = truncatedTemp;
1476 if (altitude == 0xFFFF)
1477 {
1478 // No altitude yet, try reading again
1479 readAltitude();
1480 }
1481
1482 lg2::debug("ambientCallback: Ambient: {TEMP}C, altitude: {ALT}m",
1483 "TEMP", ambient, "ALT", altitude);
1484
1485 // Send ambient and altitude to all OCCs
1486 for (auto& obj : statusObjects)
1487 {
1488 if (obj->occActive())
1489 {
1490 obj->sendAmbient(ambient, altitude);
1491 }
1492 }
1493 }
1494 }
1495
1496 // return the current ambient and altitude readings
getAmbientData(bool & ambientValid,uint8_t & ambientTemp,uint16_t & altitudeValue) const1497 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1498 uint16_t& altitudeValue) const
1499 {
1500 ambientValid = true;
1501 ambientTemp = ambient;
1502 altitudeValue = altitude;
1503
1504 if (ambient == 0xFF)
1505 {
1506 ambientValid = false;
1507 }
1508 }
1509
1510 // Called when waitForAllOccsTimer expires
1511 // After the first OCC goes active, this timer will be started (60 seconds)
occsNotAllRunning()1512 void Manager::occsNotAllRunning()
1513 {
1514 if (resetInProgress)
1515 {
1516 lg2::warning(
1517 "occsNotAllRunning: Ignoring waitForAllOccsTimer because reset is in progress");
1518 return;
1519 }
1520 if (activeCount != statusObjects.size())
1521 {
1522 // Not all OCCs went active
1523 lg2::warning(
1524 "occsNotAllRunning: Active OCC count ({COUNT}) does not match expected count ({EXP})",
1525 "COUNT", activeCount, "EXP", statusObjects.size());
1526 // Procs may be garded, so may be expected
1527 }
1528
1529 if (resetRequired)
1530 {
1531 initiateOccRequest(resetInstance);
1532
1533 if (!waitForAllOccsTimer->isEnabled())
1534 {
1535 lg2::warning("occsNotAllRunning: Restarting waitForAllOccTimer");
1536 // restart occ wait timer
1537 waitForAllOccsTimer->restartOnce(60s);
1538 }
1539 }
1540 else
1541 {
1542 validateOccMaster();
1543 }
1544 }
1545
1546 // Called when throttlePldmTraceTimer expires.
1547 // If this timer expires, that indicates there are no OCC active sensor PDRs
1548 // found which will trigger pldm traces to be throttled.
1549 // The second time this timer expires, a PEL will get created.
throttlePldmTraceExpired()1550 void Manager::throttlePldmTraceExpired()
1551 {
1552 if (utils::isHostRunning())
1553 {
1554 if (!onPldmTimeoutCreatePel)
1555 {
1556 // Throttle traces
1557 pldmHandle->setTraceThrottle(true);
1558 // Restart timer to log a PEL when timer expires
1559 onPldmTimeoutCreatePel = true;
1560 throttlePldmTraceTimer->restartOnce(40min);
1561 }
1562 else
1563 {
1564 lg2::error(
1565 "throttlePldmTraceExpired(): OCC active sensors still not available!");
1566 // Create PEL
1567 createPldmSensorPEL();
1568 }
1569 }
1570 else
1571 {
1572 // Make sure traces are not throttled
1573 pldmHandle->setTraceThrottle(false);
1574 lg2::info(
1575 "throttlePldmTraceExpired(): host it not running ignoring sensor timer");
1576 }
1577 }
1578
createPldmSensorPEL()1579 void Manager::createPldmSensorPEL()
1580 {
1581 Error::Descriptor d = Error::Descriptor(MISSING_OCC_SENSORS_PATH);
1582 std::map<std::string, std::string> additionalData;
1583
1584 additionalData.emplace("_PID", std::to_string(getpid()));
1585
1586 lg2::info(
1587 "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs");
1588
1589 auto& bus = utils::getBus();
1590
1591 try
1592 {
1593 FFDCFiles ffdc;
1594 // Add occ-control journal traces to PEL FFDC
1595 auto occJournalFile =
1596 FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40);
1597
1598 static constexpr auto loggingObjectPath =
1599 "/xyz/openbmc_project/logging";
1600 static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL";
1601 std::string service =
1602 utils::getService(loggingObjectPath, opLoggingInterface);
1603 auto method =
1604 bus.new_method_call(service.c_str(), loggingObjectPath,
1605 opLoggingInterface, "CreatePELWithFFDCFiles");
1606
1607 // Set level to Warning (Predictive).
1608 auto level =
1609 sdbusplus::xyz::openbmc_project::Logging::server::convertForMessage(
1610 sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level::
1611 Warning);
1612
1613 method.append(d.path, level, additionalData, ffdc);
1614 bus.call(method);
1615 }
1616 catch (const sdbusplus::exception_t& e)
1617 {
1618 lg2::error("Failed to create MISSING_OCC_SENSORS PEL: {ERROR}", "ERROR",
1619 e.what());
1620 }
1621 }
1622
1623 // Verify single master OCC and start presence monitor
validateOccMaster()1624 void Manager::validateOccMaster()
1625 {
1626 int masterInstance = -1;
1627 for (auto& obj : statusObjects)
1628 {
1629 auto instance = obj->getOccInstanceID();
1630
1631 if (!obj->occActive())
1632 {
1633 if (utils::isHostRunning())
1634 {
1635 // Check if sensor was queued while waiting for discovery
1636 auto match = queuedActiveState.find(instance);
1637 if (match != queuedActiveState.end())
1638 {
1639 queuedActiveState.erase(match);
1640 lg2::info("validateOccMaster: OCC{INST} is ACTIVE (queued)",
1641 "INST", instance);
1642 obj->occActive(true);
1643 }
1644 else
1645 {
1646 // OCC does not appear to be active yet, check active sensor
1647 pldmHandle->checkActiveSensor(instance);
1648 if (obj->occActive())
1649 {
1650 lg2::info(
1651 "validateOccMaster: OCC{INST} is ACTIVE after reading sensor",
1652 "INST", instance);
1653 }
1654 }
1655 }
1656 else
1657 {
1658 lg2::warning(
1659 "validateOccMaster: HOST is not running (OCC{INST})",
1660 "INST", instance);
1661 return;
1662 }
1663 }
1664
1665 if (obj->isMasterOcc())
1666 {
1667 obj->addPresenceWatchMaster();
1668
1669 if (masterInstance == -1)
1670 {
1671 masterInstance = instance;
1672 }
1673 else
1674 {
1675 lg2::error(
1676 "validateOccMaster: Multiple OCC masters! ({MAST1} and {MAST2})",
1677 "MAST1", masterInstance, "MAST2", instance);
1678 // request reset
1679 obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
1680 }
1681 }
1682 }
1683
1684 if (masterInstance < 0)
1685 {
1686 lg2::error("validateOccMaster: Master OCC not found! (of {NUM} OCCs)",
1687 "NUM", statusObjects.size());
1688 // request reset
1689 statusObjects.front()->deviceError(
1690 Error::Descriptor(PRESENCE_ERROR_PATH));
1691 }
1692 else
1693 {
1694 lg2::info("validateOccMaster: OCC{INST} is master of {COUNT} OCCs",
1695 "INST", masterInstance, "COUNT", activeCount);
1696
1697 pmode->updateDbusSafeMode(false);
1698 }
1699 }
1700
updatePcapBounds() const1701 void Manager::updatePcapBounds() const
1702 {
1703 if (pcap)
1704 {
1705 pcap->updatePcapBounds();
1706 }
1707 }
1708
1709 // Clean up any variables since the OCC is no longer running.
1710 // Called when pldm receives an event indicating host is powered off.
hostPoweredOff()1711 void Manager::hostPoweredOff()
1712 {
1713 if (resetRequired)
1714 {
1715 lg2::info("hostPoweredOff: Clearing resetRequired for OCC{INST}",
1716 "INST", resetInstance);
1717 resetRequired = false;
1718 }
1719 if (resetInProgress)
1720 {
1721 lg2::info("hostPoweredOff: Clearing resetInProgress for OCC{INST}",
1722 "INST", resetInstance);
1723 resetInProgress = false;
1724 }
1725 resetInstance = 255;
1726 }
1727
collectDumpData(sdeventplus::source::Signal &,const struct signalfd_siginfo *)1728 void Manager::collectDumpData(sdeventplus::source::Signal&,
1729 const struct signalfd_siginfo*)
1730 {
1731 json data;
1732 lg2::info("collectDumpData()");
1733 data["objectCount"] = std::to_string(statusObjects.size()) + " OCC objects";
1734 if (statusObjects.size() > 0)
1735 {
1736 try
1737 {
1738 for (auto& occ : statusObjects)
1739 {
1740 json occData;
1741 auto instance = occ->getOccInstanceID();
1742 std::string occName = "occ" + std::to_string(instance);
1743
1744 if (occ->occActive())
1745 {
1746 // OCC General Info
1747 occData["occState"] = "ACTIVE";
1748 occData["occRole"] =
1749 occ->isMasterOcc() ? "MASTER" : "SECONDARY";
1750 occData["occHwmonPath"] =
1751 occ->getHwmonPath().generic_string();
1752
1753 // OCC Poll Response
1754 std::vector<std::uint8_t> cmd = {0x00, 0x00, 0x01, 0x20};
1755 std::vector<std::uint8_t> rsp;
1756 std::vector<std::string> rspHex;
1757 rsp = passThroughObjects[instance]->send(cmd);
1758 if (rsp.size() > 5)
1759 {
1760 rsp.erase(rsp.begin(),
1761 rsp.begin() + 5); // Strip rsp header
1762 rspHex = utils::hex_dump(rsp);
1763 occData["pollResponse"] = rspHex;
1764 }
1765
1766 // Debug Data: WOF Dynamic Data
1767 cmd = {0x40, 0x00, 0x01, 0x01};
1768 rsp = passThroughObjects[instance]->send(cmd);
1769 if (rsp.size() > 5)
1770 {
1771 rsp.erase(rsp.begin(),
1772 rsp.begin() + 5); // Strip rsp header
1773 rspHex = utils::hex_dump(rsp);
1774 occData["wofDataDynamic"] = rspHex;
1775 }
1776
1777 // Debug Data: WOF Dynamic Data
1778 cmd = {0x40, 0x00, 0x01, 0x0A};
1779 rsp = passThroughObjects[instance]->send(cmd);
1780 if (rsp.size() > 5)
1781 {
1782 rsp.erase(rsp.begin(),
1783 rsp.begin() + 5); // Strip rsp header
1784 rspHex = utils::hex_dump(rsp);
1785 occData["wofDataStatic"] = rspHex;
1786 }
1787 }
1788 else
1789 {
1790 occData["occState"] = "NOT ACTIVE";
1791 }
1792
1793 data[occName] = occData;
1794 }
1795 }
1796 catch (const std::exception& e)
1797 {
1798 lg2::error("Failed to collect OCC dump data: {ERR}", "ERR",
1799 e.what());
1800 }
1801 }
1802
1803 std::ofstream file{Manager::dumpFile};
1804 if (!file)
1805 {
1806 lg2::error("Failed to open {FILE} for occ-control data", "FILE",
1807 Manager::dumpFile);
1808 return;
1809 }
1810
1811 file << std::setw(4) << data;
1812 }
1813
1814 } // namespace occ
1815 } // namespace open_power
1816