1 #include "config.h"
2
3 #include "occ_manager.hpp"
4
5 #include "i2c_occ.hpp"
6 #include "occ_dbus.hpp"
7 #include "occ_errors.hpp"
8 #include "utils.hpp"
9
10 #include <phosphor-logging/elog-errors.hpp>
11 #include <phosphor-logging/lg2.hpp>
12 #include <xyz/openbmc_project/Common/error.hpp>
13
#include <cerrno>
#include <chrono>
#include <cmath>
#include <filesystem>
#include <fstream>
#include <regex>
#include <string_view>
19
20 namespace open_power
21 {
22 namespace occ
23 {
24
// fru_type value for which readTempSensors() skips a DIMM temperature sensor
// (no _input file exists in that case).
constexpr uint32_t fruTypeNotAvailable = 0xFF;
// Suffixes appended to a sensor's sysfs file prefix (the *_label path with the
// trailing "label" removed) to locate its companion files.
constexpr auto fruTypeSuffix = "fru_type";
constexpr auto faultSuffix = "fault";
constexpr auto inputSuffix = "input";
constexpr auto maxSuffix = "max";

// findAndCreateObjects() postpones OCC discovery for as long as this file
// exists.
const auto HOST_ON_FILE = "/run/openbmc/host@0-on";
32
33 using namespace phosphor::logging;
34 using namespace std::literals::chrono_literals;
35
// readFile():
// Reads a single whitespace-delimited value of type T from the given file.
//
// @param[in] path - file to read
// @return the value extracted with operator>>
// @throws std::system_error (generic category) when the file cannot be
//         opened or the value cannot be extracted. The error code is the
//         errno raised by the failing operation, or EIO when the failure did
//         not set errno (for example, the content could not be parsed as T).
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit |
                   std::ifstream::eofbit);
    T data;

    // Clear errno first so that a stale value left behind by an unrelated
    // call is never reported as the reason for this failure.
    errno = 0;

    try
    {
        ifs.open(path);
        ifs >> data;
        ifs.close();
    }
    catch (const std::exception&)
    {
        // Stream failures that are not caused by a system call leave errno
        // at 0; never throw a system_error that claims "success".
        const int err = (errno != 0) ? errno : EIO;
        throw std::system_error(err, std::generic_category());
    }

    return data;
}
58
createPldmHandle()59 void Manager::createPldmHandle()
60 {
61 #ifdef PLDM
62 pldmHandle = std::make_unique<pldm::Interface>(
63 std::bind(std::mem_fn(&Manager::updateOCCActive), this,
64 std::placeholders::_1, std::placeholders::_2),
65 std::bind(std::mem_fn(&Manager::sbeHRESETResult), this,
66 std::placeholders::_1, std::placeholders::_2),
67 std::bind(std::mem_fn(&Manager::updateOccSafeMode), this,
68 std::placeholders::_1),
69 std::bind(std::mem_fn(&Manager::hostPoweredOff), this), event);
70 #endif
71 }
72
73 // findAndCreateObjects():
74 // Takes care of getting the required objects created and
75 // finds the available devices/processors.
76 // (function is called everytime the discoverTimer expires)
77 // - create the PowerMode object to control OCC modes
78 // - create statusObjects for each OCC device found
79 // - waits for OCC Active sensors PDRs to become available
80 // - restart discoverTimer if all data is not available yet
findAndCreateObjects()81 void Manager::findAndCreateObjects()
82 {
83 #ifndef POWER10
84 for (auto id = 0; id < MAX_CPUS; ++id)
85 {
86 // Create one occ per cpu
87 auto occ = std::string(OCC_NAME) + std::to_string(id);
88 createObjects(occ);
89 }
90 #else
91 if (!pmode)
92 {
93 // Create the power mode object
94 pmode = std::make_unique<powermode::PowerMode>(
95 *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
96 }
97
98 if (!fs::exists(HOST_ON_FILE))
99 {
100 static bool statusObjCreated = false;
101 if (!statusObjCreated)
102 {
103 // Create the OCCs based on on the /dev/occX devices
104 auto occs = findOCCsInDev();
105
106 if (occs.empty() || (prevOCCSearch.size() != occs.size()))
107 {
108 // Something changed or no OCCs yet, try again in 10s.
109 // Note on the first pass prevOCCSearch will be empty,
110 // so there will be at least one delay to give things
111 // a chance to settle.
112 prevOCCSearch = occs;
113
114 lg2::info(
115 "Manager::findAndCreateObjects(): Waiting for OCCs (currently {QTY})",
116 "QTY", occs.size());
117
118 discoverTimer->restartOnce(10s);
119 }
120 else
121 {
122 // All OCCs appear to be available, create status objects
123
124 // createObjects requires OCC0 first.
125 std::sort(occs.begin(), occs.end());
126
127 lg2::info(
128 "Manager::findAndCreateObjects(): Creating {QTY} OCC Status Objects",
129 "QTY", occs.size());
130 for (auto id : occs)
131 {
132 createObjects(std::string(OCC_NAME) + std::to_string(id));
133 }
134 statusObjCreated = true;
135 waitingForAllOccActiveSensors = true;
136
137 // Find/update the processor path associated with each OCC
138 for (auto& obj : statusObjects)
139 {
140 obj->updateProcAssociation();
141 }
142 }
143 }
144
145 if (statusObjCreated && waitingForAllOccActiveSensors)
146 {
147 static bool tracedHostWait = false;
148 if (utils::isHostRunning())
149 {
150 if (tracedHostWait)
151 {
152 lg2::info(
153 "Manager::findAndCreateObjects(): Host is running");
154 tracedHostWait = false;
155 }
156 checkAllActiveSensors();
157 }
158 else
159 {
160 if (!tracedHostWait)
161 {
162 lg2::info(
163 "Manager::findAndCreateObjects(): Waiting for host to start");
164 tracedHostWait = true;
165 }
166 discoverTimer->restartOnce(30s);
167 #ifdef PLDM
168 if (throttlePldmTraceTimer->isEnabled())
169 {
170 // Host is no longer running, disable throttle timer and
171 // make sure traces are not throttled
172 lg2::info("findAndCreateObjects(): disabling sensor timer");
173 throttlePldmTraceTimer->setEnabled(false);
174 pldmHandle->setTraceThrottle(false);
175 }
176 #endif
177 }
178 }
179 }
180 else
181 {
182 lg2::info(
183 "Manager::findAndCreateObjects(): Waiting for {FILE} to complete...",
184 "FILE", HOST_ON_FILE);
185 discoverTimer->restartOnce(10s);
186 }
187 #endif
188 }
189
190 #ifdef POWER10
// Check if all occActive sensors are available.
//
// While the host is running, each status object that is neither active nor
// has received its PLDM sensor is examined. An instance found in
// queuedActiveState is activated; otherwise the scan stops at the first
// missing sensor and the discoverTimer is re-armed to retry in 10s.
// Once every sensor is available the discovery/throttle timers are disabled
// and, if a reset was requested meanwhile, the reset is initiated.
void Manager::checkAllActiveSensors()
{
    // Function-local state that persists between calls
    static bool allActiveSensorAvailable = false;
    static bool tracedSensorWait = false;
    static bool waitingForHost = false;

    if (open_power::occ::utils::isHostRunning())
    {
        if (waitingForHost)
        {
            waitingForHost = false;
            lg2::info("checkAllActiveSensors(): Host is now running");
        }

        // Start with the assumption that all are available
        allActiveSensorAvailable = true;
        for (auto& obj : statusObjects)
        {
            if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
            {
                auto instance = obj->getOccInstanceID();
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    lg2::info(
                        "checkAllActiveSensors(): OCC{INST} is ACTIVE (queued)",
                        "INST", instance);
                    obj->occActive(true);
                }
                else
                {
                    allActiveSensorAvailable = false;
                    if (!tracedSensorWait)
                    {
                        lg2::info(
                            "checkAllActiveSensors(): Waiting on OCC{INST} Active sensor",
                            "INST", instance);
                        tracedSensorWait = true;
#ifdef PLDM
                        // Make sure PLDM traces are not throttled
                        pldmHandle->setTraceThrottle(false);
                        // Start timer to throttle PLDM traces when timer
                        // expires
                        onPldmTimeoutCreatePel = false;
                        throttlePldmTraceTimer->restartOnce(5min);
#endif
                    }
#ifdef PLDM
                    // Ignore active sensor check if the OCCs are being reset
                    if (!resetInProgress)
                    {
                        pldmHandle->checkActiveSensor(obj->getOccInstanceID());
                    }
#endif
                    // Stop at the first OCC whose sensor is still missing
                    break;
                }
            }
        }
    }
    else
    {
        // NOTE: allActiveSensorAvailable is not updated on this path, so it
        // keeps whatever value the previous call left in it.
        if (!waitingForHost)
        {
            waitingForHost = true;
            lg2::info("checkAllActiveSensors(): Waiting for host to start");
#ifdef PLDM
            if (throttlePldmTraceTimer->isEnabled())
            {
                // Host is no longer running, disable throttle timer and
                // make sure traces are not throttled
                lg2::info("checkAllActiveSensors(): disabling sensor timer");
                throttlePldmTraceTimer->setEnabled(false);
                pldmHandle->setTraceThrottle(false);
            }
#endif
        }
    }

    if (allActiveSensorAvailable)
    {
        // All sensors were found, disable the discovery timer
        if (discoverTimer->isEnabled())
        {
            discoverTimer->setEnabled(false);
        }
#ifdef PLDM
        if (throttlePldmTraceTimer->isEnabled())
        {
            // Disable throttle timer and make sure traces are not throttled
            throttlePldmTraceTimer->setEnabled(false);
            pldmHandle->setTraceThrottle(false);
        }
#endif
        if (waitingForAllOccActiveSensors)
        {
            lg2::info(
                "checkAllActiveSensors(): OCC Active sensors are available");
            waitingForAllOccActiveSensors = false;

            // A reset requested while waiting for the sensors is started now
            if (resetRequired)
            {
                initiateOccRequest(resetInstance);

                if (!waitForAllOccsTimer->isEnabled())
                {
                    lg2::warning(
                        "occsNotAllRunning: Restarting waitForAllOccTimer");
                    // restart occ wait timer to check status after reset
                    // completes
                    waitForAllOccsTimer->restartOnce(60s);
                }
            }
        }
        // Nothing is pending any more; allow the wait trace to be emitted
        // again the next time a sensor goes missing
        queuedActiveState.clear();
        tracedSensorWait = false;
    }
    else
    {
        // Not all sensors were available, so keep waiting
        if (!tracedSensorWait)
        {
            lg2::info(
                "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
            tracedSensorWait = true;
        }
        discoverTimer->restartOnce(10s);
    }
}
322 #endif
323
findOCCsInDev()324 std::vector<int> Manager::findOCCsInDev()
325 {
326 std::vector<int> occs;
327 std::regex expr{R"(occ(\d+)$)"};
328
329 for (auto& file : fs::directory_iterator("/dev"))
330 {
331 std::smatch match;
332 std::string path{file.path().string()};
333 if (std::regex_search(path, match, expr))
334 {
335 auto num = std::stoi(match[1].str());
336
337 // /dev numbering starts at 1, ours starts at 0.
338 occs.push_back(num - 1);
339 }
340 }
341
342 return occs;
343 }
344
cpuCreated(sdbusplus::message_t & msg)345 int Manager::cpuCreated(sdbusplus::message_t& msg)
346 {
347 namespace fs = std::filesystem;
348
349 sdbusplus::message::object_path o;
350 msg.read(o);
351 fs::path cpuPath(std::string(std::move(o)));
352
353 auto name = cpuPath.filename().string();
354 auto index = name.find(CPU_NAME);
355 name.replace(index, std::strlen(CPU_NAME), OCC_NAME);
356
357 createObjects(name);
358
359 return 0;
360 }
361
createObjects(const std::string & occ)362 void Manager::createObjects(const std::string& occ)
363 {
364 auto path = fs::path(OCC_CONTROL_ROOT) / occ;
365
366 statusObjects.emplace_back(std::make_unique<Status>(
367 event, path.c_str(), *this,
368 #ifdef POWER10
369 pmode,
370 #endif
371 std::bind(std::mem_fn(&Manager::statusCallBack), this,
372 std::placeholders::_1, std::placeholders::_2)
373 #ifdef PLDM
374 ,
375 // Callback will set flag indicating reset needs to be done
376 // instead of immediately issuing a reset via PLDM.
377 std::bind(std::mem_fn(&Manager::resetOccRequest), this,
378 std::placeholders::_1)
379 #endif
380 ));
381
382 // Create the power cap monitor object
383 if (!pcap)
384 {
385 pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
386 *statusObjects.back());
387 }
388
389 if (statusObjects.back()->isMasterOcc())
390 {
391 lg2::info("Manager::createObjects(): OCC{INST} is the master", "INST",
392 statusObjects.back()->getOccInstanceID());
393 _pollTimer->setEnabled(false);
394
395 #ifdef POWER10
396 // Set the master OCC on the PowerMode object
397 pmode->setMasterOcc(path);
398 #endif
399 }
400
401 passThroughObjects.emplace_back(std::make_unique<PassThrough>(
402 path.c_str()
403 #ifdef POWER10
404 ,
405 pmode
406 #endif
407 ));
408 }
409
410 // If a reset is not already outstanding, set a flag to indicate that a reset is
411 // needed.
resetOccRequest(instanceID instance)412 void Manager::resetOccRequest(instanceID instance)
413 {
414 if (!resetRequired)
415 {
416 resetRequired = true;
417 resetInstance = instance;
418 lg2::error(
419 "resetOccRequest: PM Complex reset was requested due to OCC{INST}",
420 "INST", instance);
421 }
422 else if (instance != resetInstance)
423 {
424 lg2::warning(
425 "resetOccRequest: Ignoring PM Complex reset request for OCC{INST}, because reset already outstanding for OCC{RINST}",
426 "INST", instance, "RINST", resetInstance);
427 }
428 }
429
430 // If a reset has not been started, initiate an OCC reset via PLDM
initiateOccRequest(instanceID instance)431 void Manager::initiateOccRequest(instanceID instance)
432 {
433 if (!resetInProgress)
434 {
435 resetInProgress = true;
436 resetInstance = instance;
437 lg2::error(
438 "initiateOccRequest: Initiating PM Complex reset due to OCC{INST}",
439 "INST", instance);
440
441 // Make sure ALL OCC comm stops to all OCCs before the reset
442 for (auto& obj : statusObjects)
443 {
444 if (obj->occActive())
445 {
446 obj->occActive(false);
447 }
448 }
449
450 #ifdef PLDM
451 pldmHandle->resetOCC(instance);
452 #endif
453 resetRequired = false;
454 }
455 else
456 {
457 lg2::warning(
458 "initiateOccRequest: Ignoring PM Complex reset request for OCC{INST}, because reset already in process for OCC{RINST}",
459 "INST", instance, "RINST", resetInstance);
460 }
461 }
462
// statusCallBack():
// Callback handed to each Status object (see createObjects()); keeps
// activeCount and the poll/wait timers in step with OCC active state.
//
// @param[in] instance - OCC instance the status change applies to
// @param[in] status   - true when the OCC went active, false when it went
//                       away
void Manager::statusCallBack(instanceID instance, bool status)
{
    if (status == true)
    {
        // Activations are not counted while a reset is in progress
        if (resetInProgress)
        {
            lg2::info(
                "statusCallBack: Ignoring OCC{INST} activate because a reset has been initiated due to OCC{RINST}",
                "INST", instance, "RINST", resetInstance);
            return;
        }

        // OCC went active
        ++activeCount;

#ifdef POWER10
        if (activeCount == 1)
        {
            // First OCC went active (allow some time for all OCCs to go active)
            waitForAllOccsTimer->restartOnce(60s);
        }
#endif

        if (activeCount == statusObjects.size())
        {
#ifdef POWER10
            // All OCCs are now running
            if (waitForAllOccsTimer->isEnabled())
            {
                // stop occ wait timer
                waitForAllOccsTimer->setEnabled(false);
            }

            // All OCCs have been found, check if we need a reset
            if (resetRequired)
            {
                initiateOccRequest(resetInstance);

                if (!waitForAllOccsTimer->isEnabled())
                {
                    lg2::warning(
                        "occsNotAllRunning: Restarting waitForAllOccTimer");
                    // restart occ wait timer
                    waitForAllOccsTimer->restartOnce(60s);
                }
            }
            else
            {
                // Verify master OCC and start presence monitor
                validateOccMaster();
            }
#else
            // Verify master OCC and start presence monitor
            validateOccMaster();
#endif
        }

        // Start poll timer if not already started (since at least one OCC is
        // running)
        if (!_pollTimer->isEnabled())
        {
            // An OCC just went active, PM Complex is just coming online so
            // clear any outstanding reset requests
            if (resetRequired)
            {
                resetRequired = false;
                lg2::error(
                    "statusCallBack: clearing resetRequired (since OCC{INST} went active, resetInProgress={RIP})",
                    "INST", instance, "RIP", resetInProgress);
            }

            lg2::info("Manager: OCCs will be polled every {TIME} seconds",
                      "TIME", pollInterval);

            // Send poll and start OCC poll timer
            pollerTimerExpired();
        }
    }
    else
    {
        // OCC went away
        if (activeCount > 0)
        {
            --activeCount;
        }
        else
        {
            // The count is already zero, so it is left unchanged
            lg2::info("OCC{INST} disabled, and no other OCCs are active",
                      "INST", instance);
        }

        if (activeCount == 0)
        {
            // No OCCs are running

            if (resetInProgress)
            {
                // All OCC active sensors are clear (reset should be in
                // progress)
                lg2::info(
                    "statusCallBack: Clearing resetInProgress (activeCount={COUNT}, OCC{INST}, status={STATUS})",
                    "COUNT", activeCount, "INST", instance, "STATUS", status);
                resetInProgress = false;
                // NOTE(review): 255 presumably means "no instance" - confirm
                resetInstance = 255;
            }

            // Stop OCC poll timer
            if (_pollTimer->isEnabled())
            {
                lg2::info(
                    "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
                _pollTimer->setEnabled(false);
            }

#ifdef POWER10
            // stop wait timer
            if (waitForAllOccsTimer->isEnabled())
            {
                waitForAllOccsTimer->setEnabled(false);
            }
#endif
        }
        else if (resetInProgress)
        {
            // Other OCCs are still active, so the reset is not finished yet
            lg2::info(
                "statusCallBack: Skipping clear of resetInProgress (activeCount={COUNT}, OCC{INST}, status={STATUS})",
                "COUNT", activeCount, "INST", instance, "STATUS", status);
        }
#ifdef READ_OCC_SENSORS
        // Clear OCC sensors
        setSensorValueToNaN(instance);
#endif
    }

#ifdef POWER10
    // Re-evaluate the OCC Active sensors if discovery is still waiting on them
    if (waitingForAllOccActiveSensors)
    {
        if (utils::isHostRunning())
        {
            checkAllActiveSensors();
        }
    }
#endif
}
607
608 #ifdef I2C_OCC
initStatusObjects()609 void Manager::initStatusObjects()
610 {
611 // Make sure we have a valid path string
612 static_assert(sizeof(DEV_PATH) != 0);
613
614 auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
615 for (auto& name : deviceNames)
616 {
617 i2c_occ::i2cToDbus(name);
618 name = std::string(OCC_NAME) + '_' + name;
619 auto path = fs::path(OCC_CONTROL_ROOT) / name;
620 statusObjects.emplace_back(
621 std::make_unique<Status>(event, path.c_str(), *this));
622 }
623 // The first device is master occ
624 pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
625 *statusObjects.front());
626 #ifdef POWER10
627 pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
628 powermode::PIPS_PATH);
629 // Set the master OCC on the PowerMode object
630 pmode->setMasterOcc(path);
631 #endif
632 }
633 #endif
634
635 #ifdef PLDM
sbeTimeout(unsigned int instance)636 void Manager::sbeTimeout(unsigned int instance)
637 {
638 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
639 [instance](const auto& obj) {
640 return instance == obj->getOccInstanceID();
641 });
642
643 if (obj != statusObjects.end() && (*obj)->occActive())
644 {
645 lg2::info("SBE timeout, requesting HRESET (OCC{INST})", "INST",
646 instance);
647
648 #ifdef PHAL_SUPPORT
649 setSBEState(instance, SBE_STATE_NOT_USABLE);
650 #endif
651
652 // Stop communication with this OCC
653 (*obj)->occActive(false);
654
655 pldmHandle->sendHRESET(instance);
656 }
657 }
658
updateOCCActive(instanceID instance,bool status)659 bool Manager::updateOCCActive(instanceID instance, bool status)
660 {
661 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
662 [instance](const auto& obj) {
663 return instance == obj->getOccInstanceID();
664 });
665
666 const bool hostRunning = open_power::occ::utils::isHostRunning();
667 if (obj != statusObjects.end())
668 {
669 if (!hostRunning && (status == true))
670 {
671 lg2::warning(
672 "updateOCCActive: Host is not running yet (OCC{INST} active={STAT}), clearing sensor received",
673 "INST", instance, "STAT", status);
674 (*obj)->setPldmSensorReceived(false);
675 if (!waitingForAllOccActiveSensors)
676 {
677 lg2::info(
678 "updateOCCActive: Waiting for Host and all OCC Active Sensors");
679 waitingForAllOccActiveSensors = true;
680 }
681 #ifdef POWER10
682 discoverTimer->restartOnce(30s);
683 #endif
684 return false;
685 }
686 else
687 {
688 (*obj)->setPldmSensorReceived(true);
689 return (*obj)->occActive(status);
690 }
691 }
692 else
693 {
694 if (hostRunning)
695 {
696 lg2::warning(
697 "updateOCCActive: No status object to update for OCC{INST} (active={STAT})",
698 "INST", instance, "STAT", status);
699 }
700 else
701 {
702 if (status == true)
703 {
704 lg2::warning(
705 "updateOCCActive: No status objects and Host is not running yet (OCC{INST} active={STAT})",
706 "INST", instance, "STAT", status);
707 }
708 }
709 if (status == true)
710 {
711 // OCC went active
712 queuedActiveState.insert(instance);
713 }
714 else
715 {
716 auto match = queuedActiveState.find(instance);
717 if (match != queuedActiveState.end())
718 {
719 // OCC was disabled
720 queuedActiveState.erase(match);
721 }
722 }
723 return false;
724 }
725 }
726
727 // Called upon pldm event To set powermode Safe Mode State for system.
updateOccSafeMode(bool safeMode)728 void Manager::updateOccSafeMode(bool safeMode)
729 {
730 #ifdef POWER10
731 pmode->updateDbusSafeMode(safeMode);
732 #endif
733 // Update the processor throttle status on dbus
734 for (auto& obj : statusObjects)
735 {
736 obj->updateThrottle(safeMode, THROTTLED_SAFE);
737 }
738 }
739
sbeHRESETResult(instanceID instance,bool success)740 void Manager::sbeHRESETResult(instanceID instance, bool success)
741 {
742 if (success)
743 {
744 lg2::info("HRESET succeeded (OCC{INST})", "INST", instance);
745
746 #ifdef PHAL_SUPPORT
747 setSBEState(instance, SBE_STATE_BOOTED);
748 #endif
749
750 // Re-enable communication with this OCC
751 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
752 [instance](const auto& obj) {
753 return instance == obj->getOccInstanceID();
754 });
755 if (obj != statusObjects.end() && (!(*obj)->occActive()))
756 {
757 (*obj)->occActive(true);
758 }
759
760 return;
761 }
762
763 #ifdef PHAL_SUPPORT
764 setSBEState(instance, SBE_STATE_FAILED);
765
766 if (sbeCanDump(instance))
767 {
768 lg2::info("HRESET failed (OCC{INST}), triggering SBE dump", "INST",
769 instance);
770
771 auto& bus = utils::getBus();
772 uint32_t src6 = instance << 16;
773 uint32_t logId =
774 FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
775 src6, "SBE command timeout");
776
777 try
778 {
779 constexpr auto interface = "xyz.openbmc_project.Dump.Create";
780 constexpr auto function = "CreateDump";
781
782 std::string service =
783 utils::getService(OP_DUMP_OBJ_PATH, interface);
784 auto method = bus.new_method_call(service.c_str(), OP_DUMP_OBJ_PATH,
785 interface, function);
786
787 std::map<std::string, std::variant<std::string, uint64_t>>
788 createParams{
789 {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
790 uint64_t(logId)},
791 {"com.ibm.Dump.Create.CreateParameters.DumpType",
792 "com.ibm.Dump.Create.DumpType.SBE"},
793 {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
794 uint64_t(instance)},
795 };
796
797 method.append(createParams);
798
799 auto response = bus.call(method);
800 }
801 catch (const sdbusplus::exception_t& e)
802 {
803 constexpr auto ERROR_DUMP_DISABLED =
804 "xyz.openbmc_project.Dump.Create.Error.Disabled";
805 if (e.name() == ERROR_DUMP_DISABLED)
806 {
807 lg2::info("Dump is disabled, skipping");
808 }
809 else
810 {
811 lg2::error("Dump failed");
812 }
813 }
814 }
815 #endif
816
817 // SBE Reset failed, try PM Complex reset
818 lg2::error("sbeHRESETResult: Forcing PM Complex reset");
819 resetOccRequest(instance);
820 }
821
822 #ifdef PHAL_SUPPORT
sbeCanDump(unsigned int instance)823 bool Manager::sbeCanDump(unsigned int instance)
824 {
825 struct pdbg_target* proc = getPdbgTarget(instance);
826
827 if (!proc)
828 {
829 // allow the dump in the error case
830 return true;
831 }
832
833 try
834 {
835 if (!openpower::phal::sbe::isDumpAllowed(proc))
836 {
837 return false;
838 }
839
840 if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
841 {
842 return false;
843 }
844 }
845 catch (openpower::phal::exception::SbeError& e)
846 {
847 lg2::info("Failed to query SBE state");
848 }
849
850 // allow the dump in the error case
851 return true;
852 }
853
setSBEState(unsigned int instance,enum sbe_state state)854 void Manager::setSBEState(unsigned int instance, enum sbe_state state)
855 {
856 struct pdbg_target* proc = getPdbgTarget(instance);
857
858 if (!proc)
859 {
860 return;
861 }
862
863 try
864 {
865 openpower::phal::sbe::setState(proc, state);
866 }
867 catch (const openpower::phal::exception::SbeError& e)
868 {
869 lg2::error("Failed to set SBE state: {ERROR}", "ERROR", e.what());
870 }
871 }
872
getPdbgTarget(unsigned int instance)873 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
874 {
875 if (!pdbgInitialized)
876 {
877 try
878 {
879 openpower::phal::pdbg::init();
880 pdbgInitialized = true;
881 }
882 catch (const openpower::phal::exception::PdbgError& e)
883 {
884 lg2::error("pdbg initialization failed");
885 return nullptr;
886 }
887 }
888
889 struct pdbg_target* proc = nullptr;
890 pdbg_for_each_class_target("proc", proc)
891 {
892 if (pdbg_target_index(proc) == instance)
893 {
894 return proc;
895 }
896 }
897
898 lg2::error("Failed to get pdbg target");
899 return nullptr;
900 }
901 #endif
902 #endif
903
pollerTimerExpired()904 void Manager::pollerTimerExpired()
905 {
906 if (!_pollTimer)
907 {
908 lg2::error("pollerTimerExpired() ERROR: Timer not defined");
909 return;
910 }
911
912 #ifdef POWER10
913 if (resetRequired)
914 {
915 lg2::error("pollerTimerExpired() - Initiating PM Complex reset");
916 initiateOccRequest(resetInstance);
917
918 if (!waitForAllOccsTimer->isEnabled())
919 {
920 lg2::warning("pollerTimerExpired: Restarting waitForAllOccTimer");
921 // restart occ wait timer
922 waitForAllOccsTimer->restartOnce(60s);
923 }
924 return;
925 }
926 #endif
927
928 for (auto& obj : statusObjects)
929 {
930 if (!obj->occActive())
931 {
932 // OCC is not running yet
933 #ifdef READ_OCC_SENSORS
934 auto id = obj->getOccInstanceID();
935 setSensorValueToNaN(id);
936 #endif
937 continue;
938 }
939
940 // Read sysfs to force kernel to poll OCC
941 obj->readOccState();
942
943 #ifdef READ_OCC_SENSORS
944 // Read occ sensor values
945 getSensorValues(obj);
946 #endif
947 }
948
949 if (activeCount > 0)
950 {
951 // Restart OCC poll timer
952 _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
953 }
954 else
955 {
956 // No OCCs running, so poll timer will not be restarted
957 lg2::info(
958 "Manager::pollerTimerExpired: poll timer will not be restarted");
959 }
960 }
961
962 #ifdef READ_OCC_SENSORS
// readTempSensors():
// Reads the temperature sensors found in the given directory and publishes
// them on D-Bus.
//
// @param[in] path        - directory containing the tempN_* sensor files
// @param[in] occInstance - OCC instance number used in the sensor names
void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
{
    // There may be more than one sensor with the same FRU type
    // and label so make two passes: the first to read the temps
    // from sysfs, and the second to put them on D-Bus after
    // resolving any conflicts.
    std::map<std::string, double> sensorData;

    std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
    for (auto& file : fs::directory_iterator(path))
    {
        if (!std::regex_search(file.path().string(), expr))
        {
            continue;
        }

        uint32_t labelValue{0};

        try
        {
            labelValue = readFile<uint32_t>(file.path());
        }
        catch (const std::system_error& e)
        {
            lg2::debug(
                "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
                "PATH", file.path().string(), "ERROR", e.code().value());
            continue;
        }

        // Strip the trailing "label" to get the common "…/tempN_" prefix
        // shared by this sensor's other files
        const std::string& tempLabel = "label";
        const std::string filePathString = file.path().string().substr(
            0, file.path().string().length() - tempLabel.length());

        uint32_t fruTypeValue{0};
        try
        {
            fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
        }
        catch (const std::system_error& e)
        {
            lg2::debug(
                "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
                "PATH", filePathString + fruTypeSuffix, "ERROR",
                e.code().value());
            continue;
        }

        std::string sensorPath =
            OCC_SENSORS_ROOT + std::string("/temperature/");

        // Stays empty for sensor types that have no DVFS temperature
        std::string dvfsTempPath;

        if (fruTypeValue == VRMVdd)
        {
            sensorPath.append(
                "vrm_vdd" + std::to_string(occInstance) + "_temp");
        }
        else if (fruTypeValue == processorIoRing)
        {
            sensorPath.append(
                "proc" + std::to_string(occInstance) + "_ioring_temp");
            dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
                           std::to_string(occInstance) + "_ioring_dvfs_temp";
        }
        else
        {
            // The label carries the sensor type in its top byte and the
            // instance in its low 16 bits
            uint16_t type = (labelValue & 0xFF000000) >> 24;
            uint16_t instanceID = labelValue & 0x0000FFFF;

            if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
            {
                if (fruTypeValue == fruTypeNotAvailable)
                {
                    // Not all DIMM related temps are available to read
                    // (no _input file in this case)
                    continue;
                }
                auto iter = dimmTempSensorName.find(fruTypeValue);
                if (iter == dimmTempSensorName.end())
                {
                    lg2::error(
                        "readTempSensors: Fru type error! fruTypeValue = {FRU}) ",
                        "FRU", fruTypeValue);
                    continue;
                }

                sensorPath.append(
                    "dimm" + std::to_string(instanceID) + iter->second);

                // NOTE(review): at() throws std::out_of_range if this FRU
                // type is missing from dimmDVFSSensorName - presumably both
                // maps share the same keys; confirm.
                dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
                               dimmDVFSSensorName.at(fruTypeValue);
            }
            else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
            {
                if (fruTypeValue == processorCore)
                {
                    // The OCC reports small core temps, of which there are
                    // two per big core. All current P10 systems are in big
                    // core mode, so use a big core name.
                    uint16_t coreNum = instanceID / 2;
                    uint16_t tempNum = instanceID % 2;
                    sensorPath.append("proc" + std::to_string(occInstance) +
                                      "_core" + std::to_string(coreNum) + "_" +
                                      std::to_string(tempNum) + "_temp");

                    dvfsTempPath =
                        std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
                        std::to_string(occInstance) + "_core_dvfs_temp";
                }
                else
                {
                    continue;
                }
            }
            else
            {
                continue;
            }
        }

        // The dvfs temp file only needs to be read once per chip per type.
        if (!dvfsTempPath.empty() &&
            !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
        {
            try
            {
                auto dvfsValue = readFile<double>(filePathString + maxSuffix);

                // The file value is scaled by 10^-3 before being published
                dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
                    dvfsTempPath, dvfsValue * std::pow(10, -3));
            }
            catch (const std::system_error& e)
            {
                // A failed DVFS read does not skip the sensor itself
                lg2::debug(
                    "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
                    "PATH", filePathString + maxSuffix, "ERROR",
                    e.code().value());
            }
        }

        uint32_t faultValue{0};
        try
        {
            faultValue = readFile<uint32_t>(filePathString + faultSuffix);
        }
        catch (const std::system_error& e)
        {
            lg2::debug(
                "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
                "PATH", filePathString + faultSuffix, "ERROR",
                e.code().value());
            continue;
        }

        double tempValue{0};
        // NOTE: if OCC sends back 0xFF, kernel sets this fault value to 1.
        if (faultValue != 0)
        {
            tempValue = std::numeric_limits<double>::quiet_NaN();
        }
        else
        {
            // Read the temperature
            try
            {
                tempValue = readFile<double>(filePathString + inputSuffix);
            }
            catch (const std::system_error& e)
            {
                lg2::debug(
                    "readTempSensors: Failed reading {PATH}, errno = {ERROR}",
                    "PATH", filePathString + inputSuffix, "ERROR",
                    e.code().value());

                // if errno == EAGAIN(Resource temporarily unavailable) then set
                // temp to 0, to avoid using old temp, and affecting FAN
                // Control.
                if (e.code().value() == EAGAIN)
                {
                    tempValue = 0;
                }
                // else the errno would be something like
                //     EBADF(Bad file descriptor)
                // or ENOENT(No such file or directory)
                else
                {
                    continue;
                }
            }
        }

        // If this object path already has a value, only overwrite
        // it if the previous one was an NaN or a smaller value.
        auto existing = sensorData.find(sensorPath);
        if (existing != sensorData.end())
        {
            // Multiple sensors found for this FRU type
            if ((std::isnan(existing->second) && (tempValue == 0)) ||
                ((existing->second == 0) && std::isnan(tempValue)))
            {
                // One of the redundant sensors has failed (0xFF/nan), and the
                // other sensor has no reading (0), so set the FRU to NaN to
                // force fan increase
                tempValue = std::numeric_limits<double>::quiet_NaN();
                existing->second = tempValue;
            }
            if (std::isnan(existing->second) || (tempValue > existing->second))
            {
                existing->second = tempValue;
            }
        }
        else
        {
            // First sensor for this FRU type
            sensorData[sensorPath] = tempValue;
        }
    }

    // Now publish the values on D-Bus.
    for (const auto& [objectPath, value] : sensorData)
    {
        // Values are scaled by 10^-3 before being published
        dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
                                                    value * std::pow(10, -3));

        // A NaN reading marks the sensor as not operational
        dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
            objectPath, !std::isnan(value));

        // Only add the chassis association the first time a sensor is seen
        if (existingSensors.find(objectPath) == existingSensors.end())
        {
            dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
                objectPath, {"all_sensors"});
        }
        existingSensors[objectPath] = occInstance;
    }
}
1199
getPowerLabelFunctionID(const std::string & value)1200 std::optional<std::string> Manager::getPowerLabelFunctionID(
1201 const std::string& value)
1202 {
1203 // If the value is "system", then the FunctionID is "system".
1204 if (value == "system")
1205 {
1206 return value;
1207 }
1208
1209 // If the value is not "system", then the label value have 3 numbers, of
1210 // which we only care about the middle one:
1211 // <sensor id>_<function id>_<apss channel>
1212 // eg: The value is "0_10_5" , then the FunctionID is "10".
1213 if (value.find("_") == std::string::npos)
1214 {
1215 return std::nullopt;
1216 }
1217
1218 auto powerLabelValue = value.substr((value.find("_") + 1));
1219
1220 if (powerLabelValue.find("_") == std::string::npos)
1221 {
1222 return std::nullopt;
1223 }
1224
1225 return powerLabelValue.substr(0, powerLabelValue.find("_"));
1226 }
1227
readPowerSensors(const fs::path & path,uint32_t id)1228 void Manager::readPowerSensors(const fs::path& path, uint32_t id)
1229 {
1230 std::regex expr{"power\\d+_label$"}; // Example: power5_label
1231 for (auto& file : fs::directory_iterator(path))
1232 {
1233 if (!std::regex_search(file.path().string(), expr))
1234 {
1235 continue;
1236 }
1237
1238 std::string labelValue;
1239 try
1240 {
1241 labelValue = readFile<std::string>(file.path());
1242 }
1243 catch (const std::system_error& e)
1244 {
1245 lg2::debug(
1246 "readPowerSensors: Failed reading {PATH}, errno = {ERROR}",
1247 "PATH", file.path().string(), "ERROR", e.code().value());
1248 continue;
1249 }
1250
1251 auto functionID = getPowerLabelFunctionID(labelValue);
1252 if (functionID == std::nullopt)
1253 {
1254 continue;
1255 }
1256
1257 const std::string& tempLabel = "label";
1258 const std::string filePathString = file.path().string().substr(
1259 0, file.path().string().length() - tempLabel.length());
1260
1261 std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
1262
1263 auto iter = powerSensorName.find(*functionID);
1264 if (iter == powerSensorName.end())
1265 {
1266 continue;
1267 }
1268 sensorPath.append(iter->second);
1269
1270 double tempValue{0};
1271
1272 try
1273 {
1274 tempValue = readFile<double>(filePathString + inputSuffix);
1275 }
1276 catch (const std::system_error& e)
1277 {
1278 lg2::debug(
1279 "readPowerSensors: Failed reading {PATH}, errno = {ERROR}",
1280 "PATH", filePathString + inputSuffix, "ERROR",
1281 e.code().value());
1282 continue;
1283 }
1284
1285 dbus::OccDBusSensors::getOccDBus().setUnit(
1286 sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");
1287
1288 dbus::OccDBusSensors::getOccDBus().setValue(
1289 sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3));
1290
1291 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1292 sensorPath, true);
1293
1294 if (existingSensors.find(sensorPath) == existingSensors.end())
1295 {
1296 std::vector<std::string> fTypeList = {"all_sensors"};
1297 if (iter->second == "total_power")
1298 {
1299 // Set sensor purpose as TotalPower
1300 dbus::OccDBusSensors::getOccDBus().setPurpose(
1301 sensorPath,
1302 "xyz.openbmc_project.Sensor.Purpose.SensorPurpose.TotalPower");
1303 }
1304 dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1305 sensorPath, fTypeList);
1306 }
1307 existingSensors[sensorPath] = id;
1308 }
1309 return;
1310 }
1311
readExtnSensors(const fs::path & path,uint32_t id)1312 void Manager::readExtnSensors(const fs::path& path, uint32_t id)
1313 {
1314 std::regex expr{"extn\\d+_label$"}; // Example: extn5_label
1315 for (auto& file : fs::directory_iterator(path))
1316 {
1317 if (!std::regex_search(file.path().string(), expr))
1318 {
1319 continue;
1320 }
1321
1322 // Read in Label value of the sensor from file.
1323 std::string labelValue;
1324 try
1325 {
1326 labelValue = readFile<std::string>(file.path());
1327 }
1328 catch (const std::system_error& e)
1329 {
1330 lg2::debug(
1331 "readExtnSensors:label Failed reading {PATH}, errno = {ERROR}",
1332 "PATH", file.path().string(), "ERROR", e.code().value());
1333 continue;
1334 }
1335 const std::string& tempLabel = "label";
1336 const std::string filePathString = file.path().string().substr(
1337 0, file.path().string().length() - tempLabel.length());
1338
1339 std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
1340
1341 // Labels of EXTN sections from OCC interface Document
1342 // have different formats.
1343 // 0x464d494e : FMIN 0x46444953 : FDIS
1344 // 0x46424153 : FBAS 0x46555400 : FUT
1345 // 0x464d4158 : FMAX 0x434c4950 : CLIP
1346 // 0x4d4f4445 : MODE 0x574f4643 : WOFC
1347 // 0x574f4649 : WOFI 0x5057524d : PWRM
1348 // 0x50575250 : PWRP 0x45525248 : ERRH
1349 // Label indicating byte 5 and 6 is the current (mem,proc) power in
1350 // Watts.
1351 if ((labelValue == EXTN_LABEL_PWRM_MEMORY_POWER) ||
1352 (labelValue == EXTN_LABEL_PWRP_PROCESSOR_POWER))
1353 {
1354 // Build the dbus String for this chiplet power asset.
1355 if (labelValue == EXTN_LABEL_PWRP_PROCESSOR_POWER)
1356 {
1357 labelValue = "_power";
1358 }
1359 else // else EXTN_LABEL_PWRM_MEMORY_POWER
1360 {
1361 labelValue = "_mem_power";
1362 }
1363 sensorPath.append("chiplet" + std::to_string(id) + labelValue);
1364
1365 // Read in data value of the sensor from file.
1366 // Read in as string due to different format of data in sensors.
1367 std::string extnValue;
1368 try
1369 {
1370 extnValue = readFile<std::string>(filePathString + inputSuffix);
1371 }
1372 catch (const std::system_error& e)
1373 {
1374 lg2::debug(
1375 "readExtnSensors:value Failed reading {PATH}, errno = {ERROR}",
1376 "PATH", filePathString + inputSuffix, "ERROR",
1377 e.code().value());
1378 continue;
1379 }
1380
1381 // For Power field, Convert last 4 bytes of hex string into number
1382 // value.
1383 std::stringstream ssData;
1384 ssData << std::hex << extnValue.substr(extnValue.length() - 4);
1385 uint16_t MyHexNumber;
1386 ssData >> MyHexNumber;
1387
1388 // Convert output/DC power to input/AC power in Watts (round up)
1389 MyHexNumber =
1390 std::round(((MyHexNumber / (PS_DERATING_FACTOR / 100.0))));
1391
1392 dbus::OccDBusSensors::getOccDBus().setUnit(
1393 sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");
1394
1395 dbus::OccDBusSensors::getOccDBus().setValue(sensorPath,
1396 MyHexNumber);
1397
1398 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1399 sensorPath, true);
1400
1401 if (existingSensors.find(sensorPath) == existingSensors.end())
1402 {
1403 dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1404 sensorPath, {"all_sensors"});
1405 }
1406
1407 existingSensors[sensorPath] = id;
1408 } // End Extended Power Sensors.
1409 } // End For loop on files for Extended Sensors.
1410 return;
1411 }
1412
setSensorValueToNaN(uint32_t id) const1413 void Manager::setSensorValueToNaN(uint32_t id) const
1414 {
1415 for (const auto& [sensorPath, occId] : existingSensors)
1416 {
1417 if (occId == id)
1418 {
1419 dbus::OccDBusSensors::getOccDBus().setValue(
1420 sensorPath, std::numeric_limits<double>::quiet_NaN());
1421
1422 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1423 sensorPath, true);
1424 }
1425 }
1426 return;
1427 }
1428
setSensorValueToNonFunctional(uint32_t id) const1429 void Manager::setSensorValueToNonFunctional(uint32_t id) const
1430 {
1431 for (const auto& [sensorPath, occId] : existingSensors)
1432 {
1433 if (occId == id)
1434 {
1435 dbus::OccDBusSensors::getOccDBus().setValue(
1436 sensorPath, std::numeric_limits<double>::quiet_NaN());
1437
1438 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1439 sensorPath, false);
1440 }
1441 }
1442 return;
1443 }
1444
getSensorValues(std::unique_ptr<Status> & occ)1445 void Manager::getSensorValues(std::unique_ptr<Status>& occ)
1446 {
1447 static bool tracedError[8] = {0};
1448 const fs::path sensorPath = occ->getHwmonPath();
1449 const uint32_t id = occ->getOccInstanceID();
1450
1451 if (fs::exists(sensorPath))
1452 {
1453 // Read temperature sensors
1454 readTempSensors(sensorPath, id);
1455 // Read Extended sensors
1456 readExtnSensors(sensorPath, id);
1457
1458 if (occ->isMasterOcc())
1459 {
1460 // Read power sensors
1461 readPowerSensors(sensorPath, id);
1462 }
1463 tracedError[id] = false;
1464 }
1465 else
1466 {
1467 if (!tracedError[id])
1468 {
1469 lg2::error(
1470 "Manager::getSensorValues: OCC{INST} sensor path missing: {PATH}",
1471 "INST", id, "PATH", sensorPath);
1472 tracedError[id] = true;
1473 }
1474 }
1475
1476 return;
1477 }
1478 #endif
1479
1480 // Read the altitude from DBus
readAltitude()1481 void Manager::readAltitude()
1482 {
1483 static bool traceAltitudeErr = true;
1484
1485 utils::PropertyValue altitudeProperty{};
1486 try
1487 {
1488 altitudeProperty = utils::getProperty(ALTITUDE_PATH, ALTITUDE_INTERFACE,
1489 ALTITUDE_PROP);
1490 auto sensorVal = std::get<double>(altitudeProperty);
1491 if (sensorVal < 0xFFFF)
1492 {
1493 if (sensorVal < 0)
1494 {
1495 altitude = 0;
1496 }
1497 else
1498 {
1499 // Round to nearest meter
1500 altitude = uint16_t(sensorVal + 0.5);
1501 }
1502 lg2::debug("readAltitude: sensor={VALUE} ({ALT}m)", "VALUE",
1503 sensorVal, "ALT", altitude);
1504 traceAltitudeErr = true;
1505 }
1506 else
1507 {
1508 if (traceAltitudeErr)
1509 {
1510 traceAltitudeErr = false;
1511 lg2::debug("Invalid altitude value: {ALT}", "ALT", sensorVal);
1512 }
1513 }
1514 }
1515 catch (const sdbusplus::exception_t& e)
1516 {
1517 if (traceAltitudeErr)
1518 {
1519 traceAltitudeErr = false;
1520 lg2::info("Unable to read Altitude: {ERROR}", "ERROR", e.what());
1521 }
1522 altitude = 0xFFFF; // not available
1523 }
1524 }
1525
1526 // Callback function when ambient temperature changes
ambientCallback(sdbusplus::message_t & msg)1527 void Manager::ambientCallback(sdbusplus::message_t& msg)
1528 {
1529 double currentTemp = 0;
1530 uint8_t truncatedTemp = 0xFF;
1531 std::string msgSensor;
1532 std::map<std::string, std::variant<double>> msgData;
1533 msg.read(msgSensor, msgData);
1534
1535 auto valPropMap = msgData.find(AMBIENT_PROP);
1536 if (valPropMap == msgData.end())
1537 {
1538 lg2::debug("ambientCallback: Unknown ambient property changed");
1539 return;
1540 }
1541 currentTemp = std::get<double>(valPropMap->second);
1542 if (std::isnan(currentTemp))
1543 {
1544 truncatedTemp = 0xFF;
1545 }
1546 else
1547 {
1548 if (currentTemp < 0)
1549 {
1550 truncatedTemp = 0;
1551 }
1552 else
1553 {
1554 // Round to nearest degree C
1555 truncatedTemp = uint8_t(currentTemp + 0.5);
1556 }
1557 }
1558
1559 // If ambient changes, notify OCCs
1560 if (truncatedTemp != ambient)
1561 {
1562 lg2::debug("ambientCallback: Ambient change from {OLD} to {NEW}C",
1563 "OLD", ambient, "NEW", currentTemp);
1564
1565 ambient = truncatedTemp;
1566 if (altitude == 0xFFFF)
1567 {
1568 // No altitude yet, try reading again
1569 readAltitude();
1570 }
1571
1572 lg2::debug("ambientCallback: Ambient: {TEMP}C, altitude: {ALT}m",
1573 "TEMP", ambient, "ALT", altitude);
1574 #ifdef POWER10
1575 // Send ambient and altitude to all OCCs
1576 for (auto& obj : statusObjects)
1577 {
1578 if (obj->occActive())
1579 {
1580 obj->sendAmbient(ambient, altitude);
1581 }
1582 }
1583 #endif // POWER10
1584 }
1585 }
1586
1587 // return the current ambient and altitude readings
getAmbientData(bool & ambientValid,uint8_t & ambientTemp,uint16_t & altitudeValue) const1588 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1589 uint16_t& altitudeValue) const
1590 {
1591 ambientValid = true;
1592 ambientTemp = ambient;
1593 altitudeValue = altitude;
1594
1595 if (ambient == 0xFF)
1596 {
1597 ambientValid = false;
1598 }
1599 }
1600
1601 #ifdef POWER10
1602 // Called when waitForAllOccsTimer expires
1603 // After the first OCC goes active, this timer will be started (60 seconds)
occsNotAllRunning()1604 void Manager::occsNotAllRunning()
1605 {
1606 if (resetInProgress)
1607 {
1608 lg2::warning(
1609 "occsNotAllRunning: Ignoring waitForAllOccsTimer because reset is in progress");
1610 return;
1611 }
1612 if (activeCount != statusObjects.size())
1613 {
1614 // Not all OCCs went active
1615 lg2::warning(
1616 "occsNotAllRunning: Active OCC count ({COUNT}) does not match expected count ({EXP})",
1617 "COUNT", activeCount, "EXP", statusObjects.size());
1618 // Procs may be garded, so may be expected
1619 }
1620
1621 if (resetRequired)
1622 {
1623 initiateOccRequest(resetInstance);
1624
1625 if (!waitForAllOccsTimer->isEnabled())
1626 {
1627 lg2::warning("occsNotAllRunning: Restarting waitForAllOccTimer");
1628 // restart occ wait timer
1629 waitForAllOccsTimer->restartOnce(60s);
1630 }
1631 }
1632 else
1633 {
1634 validateOccMaster();
1635 }
1636 }
1637
1638 #ifdef PLDM
1639 // Called when throttlePldmTraceTimer expires.
1640 // If this timer expires, that indicates there are no OCC active sensor PDRs
1641 // found which will trigger pldm traces to be throttled.
1642 // The second time this timer expires, a PEL will get created.
throttlePldmTraceExpired()1643 void Manager::throttlePldmTraceExpired()
1644 {
1645 if (utils::isHostRunning())
1646 {
1647 if (!onPldmTimeoutCreatePel)
1648 {
1649 // Throttle traces
1650 pldmHandle->setTraceThrottle(true);
1651 // Restart timer to log a PEL when timer expires
1652 onPldmTimeoutCreatePel = true;
1653 throttlePldmTraceTimer->restartOnce(40min);
1654 }
1655 else
1656 {
1657 lg2::error(
1658 "throttlePldmTraceExpired(): OCC active sensors still not available!");
1659 // Create PEL
1660 createPldmSensorPEL();
1661 }
1662 }
1663 else
1664 {
1665 // Make sure traces are not throttled
1666 pldmHandle->setTraceThrottle(false);
1667 lg2::info(
1668 "throttlePldmTraceExpired(): host it not running ignoring sensor timer");
1669 }
1670 }
1671
createPldmSensorPEL()1672 void Manager::createPldmSensorPEL()
1673 {
1674 Error::Descriptor d = Error::Descriptor(MISSING_OCC_SENSORS_PATH);
1675 std::map<std::string, std::string> additionalData;
1676
1677 additionalData.emplace("_PID", std::to_string(getpid()));
1678
1679 lg2::info(
1680 "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs");
1681
1682 auto& bus = utils::getBus();
1683
1684 try
1685 {
1686 FFDCFiles ffdc;
1687 // Add occ-control journal traces to PEL FFDC
1688 auto occJournalFile =
1689 FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40);
1690
1691 static constexpr auto loggingObjectPath =
1692 "/xyz/openbmc_project/logging";
1693 static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL";
1694 std::string service =
1695 utils::getService(loggingObjectPath, opLoggingInterface);
1696 auto method =
1697 bus.new_method_call(service.c_str(), loggingObjectPath,
1698 opLoggingInterface, "CreatePELWithFFDCFiles");
1699
1700 // Set level to Warning (Predictive).
1701 auto level =
1702 sdbusplus::xyz::openbmc_project::Logging::server::convertForMessage(
1703 sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level::
1704 Warning);
1705
1706 method.append(d.path, level, additionalData, ffdc);
1707 bus.call(method);
1708 }
1709 catch (const sdbusplus::exception_t& e)
1710 {
1711 lg2::error("Failed to create MISSING_OCC_SENSORS PEL: {ERROR}", "ERROR",
1712 e.what());
1713 }
1714 }
1715 #endif // PLDM
1716 #endif // POWER10
1717
1718 // Verify single master OCC and start presence monitor
validateOccMaster()1719 void Manager::validateOccMaster()
1720 {
1721 int masterInstance = -1;
1722 for (auto& obj : statusObjects)
1723 {
1724 auto instance = obj->getOccInstanceID();
1725 #ifdef POWER10
1726 if (!obj->occActive())
1727 {
1728 if (utils::isHostRunning())
1729 {
1730 // Check if sensor was queued while waiting for discovery
1731 auto match = queuedActiveState.find(instance);
1732 if (match != queuedActiveState.end())
1733 {
1734 queuedActiveState.erase(match);
1735 lg2::info("validateOccMaster: OCC{INST} is ACTIVE (queued)",
1736 "INST", instance);
1737 obj->occActive(true);
1738 }
1739 else
1740 {
1741 // OCC does not appear to be active yet, check active sensor
1742 #ifdef PLDM
1743 pldmHandle->checkActiveSensor(instance);
1744 #endif
1745 if (obj->occActive())
1746 {
1747 lg2::info(
1748 "validateOccMaster: OCC{INST} is ACTIVE after reading sensor",
1749 "INST", instance);
1750 }
1751 }
1752 }
1753 else
1754 {
1755 lg2::warning(
1756 "validateOccMaster: HOST is not running (OCC{INST})",
1757 "INST", instance);
1758 return;
1759 }
1760 }
1761 #endif // POWER10
1762
1763 if (obj->isMasterOcc())
1764 {
1765 obj->addPresenceWatchMaster();
1766
1767 if (masterInstance == -1)
1768 {
1769 masterInstance = instance;
1770 }
1771 else
1772 {
1773 lg2::error(
1774 "validateOccMaster: Multiple OCC masters! ({MAST1} and {MAST2})",
1775 "MAST1", masterInstance, "MAST2", instance);
1776 // request reset
1777 obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
1778 }
1779 }
1780 }
1781
1782 if (masterInstance < 0)
1783 {
1784 lg2::error("validateOccMaster: Master OCC not found! (of {NUM} OCCs)",
1785 "NUM", statusObjects.size());
1786 // request reset
1787 statusObjects.front()->deviceError(
1788 Error::Descriptor(PRESENCE_ERROR_PATH));
1789 }
1790 else
1791 {
1792 lg2::info("validateOccMaster: OCC{INST} is master of {COUNT} OCCs",
1793 "INST", masterInstance, "COUNT", activeCount);
1794 #ifdef POWER10
1795 pmode->updateDbusSafeMode(false);
1796 #endif
1797 }
1798 }
1799
updatePcapBounds() const1800 void Manager::updatePcapBounds() const
1801 {
1802 if (pcap)
1803 {
1804 pcap->updatePcapBounds();
1805 }
1806 }
1807
1808 // Clean up any variables since the OCC is no longer running.
1809 // Called when pldm receives an event indicating host is powered off.
hostPoweredOff()1810 void Manager::hostPoweredOff()
1811 {
1812 if (resetRequired)
1813 {
1814 lg2::info("hostPoweredOff: Clearing resetRequired for OCC{INST}",
1815 "INST", resetInstance);
1816 resetRequired = false;
1817 }
1818 if (resetInProgress)
1819 {
1820 lg2::info("hostPoweredOff: Clearing resetInProgress for OCC{INST}",
1821 "INST", resetInstance);
1822 resetInProgress = false;
1823 }
1824 resetInstance = 255;
1825 }
1826
1827 } // namespace occ
1828 } // namespace open_power
1829