1 #include "config.h"
2
3 #include "occ_manager.hpp"
4
5 #include "i2c_occ.hpp"
6 #include "occ_dbus.hpp"
7 #include "occ_errors.hpp"
8 #include "utils.hpp"
9
10 #include <phosphor-logging/elog-errors.hpp>
11 #include <phosphor-logging/log.hpp>
12 #include <xyz/openbmc_project/Common/error.hpp>
13
14 #include <chrono>
15 #include <cmath>
16 #include <filesystem>
17 #include <fstream>
18 #include <regex>
19
20 namespace open_power
21 {
22 namespace occ
23 {
24
// FRU type value meaning that no FRU type is available for a sensor
constexpr uint32_t fruTypeNotAvailable{0xFF};

// File-name suffixes appended to a "<sensor>_" hwmon prefix to locate the
// individual attribute files of that sensor
constexpr const char* fruTypeSuffix{"fru_type"};
constexpr const char* faultSuffix{"fault"};
constexpr const char* inputSuffix{"input"};
constexpr const char* maxSuffix{"max"};

// File whose existence is checked before OCC discovery is allowed to proceed
const char* const HOST_ON_FILE{"/run/openbmc/host@0-on"};
32
33 using namespace phosphor::logging;
34 using namespace std::literals::chrono_literals;
35
/**
 * @brief Read one value from the start of a file using operator>>.
 *
 * @tparam T        Type of the value to extract
 * @param[in] path  File to read
 *
 * @return The value extracted from the file
 *
 * @throws std::system_error (generic category) when the file cannot be
 *         opened or the value cannot be extracted. The error code is the
 *         errno left behind by the stream operations, or EIO when the
 *         failure did not set errno (for example unparsable content).
 */
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit |
                   std::ifstream::eofbit);
    T data;

    // Drop any stale errno from unrelated earlier calls so the code
    // reported below can only originate from the operations in this
    // function (callers compare it against EAGAIN).
    errno = 0;

    try
    {
        ifs.open(path);
        ifs >> data;
        ifs.close();
    }
    catch (const std::exception&)
    {
        // Capture errno before anything else can modify it. A failure
        // that left errno at 0 is reported as a generic I/O error rather
        // than as a "success" code.
        const auto err = (errno != 0) ? errno : EIO;
        throw std::system_error(err, std::generic_category());
    }

    return data;
}
58
59 // findAndCreateObjects():
60 // Takes care of getting the required objects created and
61 // finds the available devices/processors.
62 // (function is called everytime the discoverTimer expires)
63 // - create the PowerMode object to control OCC modes
64 // - create statusObjects for each OCC device found
65 // - waits for OCC Active sensors PDRs to become available
66 // - restart discoverTimer if all data is not available yet
findAndCreateObjects()67 void Manager::findAndCreateObjects()
68 {
69 #ifndef POWER10
70 for (auto id = 0; id < MAX_CPUS; ++id)
71 {
72 // Create one occ per cpu
73 auto occ = std::string(OCC_NAME) + std::to_string(id);
74 createObjects(occ);
75 }
76 #else
77 if (!pmode)
78 {
79 // Create the power mode object
80 pmode = std::make_unique<powermode::PowerMode>(
81 *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
82 }
83
84 if (!fs::exists(HOST_ON_FILE))
85 {
86 static bool statusObjCreated = false;
87 if (!statusObjCreated)
88 {
89 // Create the OCCs based on on the /dev/occX devices
90 auto occs = findOCCsInDev();
91
92 if (occs.empty() || (prevOCCSearch.size() != occs.size()))
93 {
94 // Something changed or no OCCs yet, try again in 10s.
95 // Note on the first pass prevOCCSearch will be empty,
96 // so there will be at least one delay to give things
97 // a chance to settle.
98 prevOCCSearch = occs;
99
100 log<level::INFO>(
101 std::format(
102 "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})",
103 occs.size())
104 .c_str());
105
106 discoverTimer->restartOnce(10s);
107 }
108 else
109 {
110 // All OCCs appear to be available, create status objects
111
112 // createObjects requires OCC0 first.
113 std::sort(occs.begin(), occs.end());
114
115 log<level::INFO>(
116 std::format(
117 "Manager::findAndCreateObjects(): Creating {} OCC Status Objects",
118 occs.size())
119 .c_str());
120 for (auto id : occs)
121 {
122 createObjects(std::string(OCC_NAME) + std::to_string(id));
123 }
124 statusObjCreated = true;
125 waitingForAllOccActiveSensors = true;
126
127 // Find/update the processor path associated with each OCC
128 for (auto& obj : statusObjects)
129 {
130 obj->updateProcAssociation();
131 }
132 }
133 }
134
135 if (statusObjCreated && waitingForAllOccActiveSensors)
136 {
137 static bool tracedHostWait = false;
138 if (utils::isHostRunning())
139 {
140 if (tracedHostWait)
141 {
142 log<level::INFO>(
143 "Manager::findAndCreateObjects(): Host is running");
144 tracedHostWait = false;
145 }
146 checkAllActiveSensors();
147 }
148 else
149 {
150 if (!tracedHostWait)
151 {
152 log<level::INFO>(
153 "Manager::findAndCreateObjects(): Waiting for host to start");
154 tracedHostWait = true;
155 }
156 discoverTimer->restartOnce(30s);
157 #ifdef PLDM
158 if (throttlePldmTraceTimer->isEnabled())
159 {
160 // Host is no longer running, disable throttle timer and
161 // make sure traces are not throttled
162 log<level::INFO>(
163 "findAndCreateObjects(): disabling sensor timer");
164 throttlePldmTraceTimer->setEnabled(false);
165 pldmHandle->setTraceThrottle(false);
166 }
167 #endif
168 }
169 }
170 }
171 else
172 {
173 log<level::INFO>(
174 std::format(
175 "Manager::findAndCreateObjects(): Waiting for {} to complete...",
176 HOST_ON_FILE)
177 .c_str());
178 discoverTimer->restartOnce(10s);
179 }
180 #endif
181 }
182
183 #ifdef POWER10
184 // Check if all occActive sensors are available
checkAllActiveSensors()185 void Manager::checkAllActiveSensors()
186 {
187 static bool allActiveSensorAvailable = false;
188 static bool tracedSensorWait = false;
189 static bool waitingForHost = false;
190
191 if (open_power::occ::utils::isHostRunning())
192 {
193 if (waitingForHost)
194 {
195 waitingForHost = false;
196 log<level::INFO>("checkAllActiveSensors(): Host is now running");
197 }
198
199 // Start with the assumption that all are available
200 allActiveSensorAvailable = true;
201 for (auto& obj : statusObjects)
202 {
203 if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
204 {
205 auto instance = obj->getOccInstanceID();
206 // Check if sensor was queued while waiting for discovery
207 auto match = queuedActiveState.find(instance);
208 if (match != queuedActiveState.end())
209 {
210 queuedActiveState.erase(match);
211 log<level::INFO>(
212 std::format(
213 "checkAllActiveSensors(): OCC{} is ACTIVE (queued)",
214 instance)
215 .c_str());
216 obj->occActive(true);
217 }
218 else
219 {
220 allActiveSensorAvailable = false;
221 if (!tracedSensorWait)
222 {
223 log<level::INFO>(
224 std::format(
225 "checkAllActiveSensors(): Waiting on OCC{} Active sensor",
226 instance)
227 .c_str());
228 tracedSensorWait = true;
229 #ifdef PLDM
230 // Make sure PLDM traces are not throttled
231 pldmHandle->setTraceThrottle(false);
232 // Start timer to throttle PLDM traces when timer
233 // expires
234 onPldmTimeoutCreatePel = false;
235 throttlePldmTraceTimer->restartOnce(5min);
236 #endif
237 }
238 #ifdef PLDM
239 pldmHandle->checkActiveSensor(obj->getOccInstanceID());
240 #endif
241 break;
242 }
243 }
244 }
245 }
246 else
247 {
248 if (!waitingForHost)
249 {
250 waitingForHost = true;
251 log<level::INFO>(
252 "checkAllActiveSensors(): Waiting for host to start");
253 #ifdef PLDM
254 if (throttlePldmTraceTimer->isEnabled())
255 {
256 // Host is no longer running, disable throttle timer and
257 // make sure traces are not throttled
258 log<level::INFO>(
259 "checkAllActiveSensors(): disabling sensor timer");
260 throttlePldmTraceTimer->setEnabled(false);
261 pldmHandle->setTraceThrottle(false);
262 }
263 #endif
264 }
265 }
266
267 if (allActiveSensorAvailable)
268 {
269 // All sensors were found, disable the discovery timer
270 if (discoverTimer->isEnabled())
271 {
272 discoverTimer->setEnabled(false);
273 }
274 #ifdef PLDM
275 if (throttlePldmTraceTimer->isEnabled())
276 {
277 // Disable throttle timer and make sure traces are not throttled
278 throttlePldmTraceTimer->setEnabled(false);
279 pldmHandle->setTraceThrottle(false);
280 }
281 #endif
282 if (waitingForAllOccActiveSensors)
283 {
284 log<level::INFO>(
285 "checkAllActiveSensors(): OCC Active sensors are available");
286 waitingForAllOccActiveSensors = false;
287 }
288 queuedActiveState.clear();
289 tracedSensorWait = false;
290 }
291 else
292 {
293 // Not all sensors were available, so keep waiting
294 if (!tracedSensorWait)
295 {
296 log<level::INFO>(
297 "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
298 tracedSensorWait = true;
299 }
300 discoverTimer->restartOnce(10s);
301 }
302 }
303 #endif
304
findOCCsInDev()305 std::vector<int> Manager::findOCCsInDev()
306 {
307 std::vector<int> occs;
308 std::regex expr{R"(occ(\d+)$)"};
309
310 for (auto& file : fs::directory_iterator("/dev"))
311 {
312 std::smatch match;
313 std::string path{file.path().string()};
314 if (std::regex_search(path, match, expr))
315 {
316 auto num = std::stoi(match[1].str());
317
318 // /dev numbering starts at 1, ours starts at 0.
319 occs.push_back(num - 1);
320 }
321 }
322
323 return occs;
324 }
325
cpuCreated(sdbusplus::message_t & msg)326 int Manager::cpuCreated(sdbusplus::message_t& msg)
327 {
328 namespace fs = std::filesystem;
329
330 sdbusplus::message::object_path o;
331 msg.read(o);
332 fs::path cpuPath(std::string(std::move(o)));
333
334 auto name = cpuPath.filename().string();
335 auto index = name.find(CPU_NAME);
336 name.replace(index, std::strlen(CPU_NAME), OCC_NAME);
337
338 createObjects(name);
339
340 return 0;
341 }
342
createObjects(const std::string & occ)343 void Manager::createObjects(const std::string& occ)
344 {
345 auto path = fs::path(OCC_CONTROL_ROOT) / occ;
346
347 statusObjects.emplace_back(std::make_unique<Status>(
348 event, path.c_str(), *this,
349 #ifdef POWER10
350 pmode,
351 #endif
352 std::bind(std::mem_fn(&Manager::statusCallBack), this,
353 std::placeholders::_1, std::placeholders::_2)
354 #ifdef PLDM
355 ,
356 std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(),
357 std::placeholders::_1)
358 #endif
359 ));
360
361 // Create the power cap monitor object
362 if (!pcap)
363 {
364 pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
365 *statusObjects.back());
366 }
367
368 if (statusObjects.back()->isMasterOcc())
369 {
370 log<level::INFO>(
371 std::format("Manager::createObjects(): OCC{} is the master",
372 statusObjects.back()->getOccInstanceID())
373 .c_str());
374 _pollTimer->setEnabled(false);
375
376 #ifdef POWER10
377 // Set the master OCC on the PowerMode object
378 pmode->setMasterOcc(path);
379 #endif
380 }
381
382 passThroughObjects.emplace_back(std::make_unique<PassThrough>(
383 path.c_str()
384 #ifdef POWER10
385 ,
386 pmode
387 #endif
388 ));
389 }
390
statusCallBack(instanceID instance,bool status)391 void Manager::statusCallBack(instanceID instance, bool status)
392 {
393 if (status == true)
394 {
395 // OCC went active
396 ++activeCount;
397
398 #ifdef POWER10
399 if (activeCount == 1)
400 {
401 // First OCC went active (allow some time for all OCCs to go active)
402 waitForAllOccsTimer->restartOnce(60s);
403 }
404 #endif
405
406 if (activeCount == statusObjects.size())
407 {
408 #ifdef POWER10
409 // All OCCs are now running
410 if (waitForAllOccsTimer->isEnabled())
411 {
412 // stop occ wait timer
413 waitForAllOccsTimer->setEnabled(false);
414 }
415 #endif
416
417 // Verify master OCC and start presence monitor
418 validateOccMaster();
419 }
420
421 // Start poll timer if not already started
422 if (!_pollTimer->isEnabled())
423 {
424 log<level::INFO>(
425 std::format("Manager: OCCs will be polled every {} seconds",
426 pollInterval)
427 .c_str());
428
429 // Send poll and start OCC poll timer
430 pollerTimerExpired();
431 }
432 }
433 else
434 {
435 // OCC went away
436 if (activeCount > 0)
437 {
438 --activeCount;
439 }
440 else
441 {
442 log<level::ERR>(
443 std::format("OCC{} disabled, but currently no active OCCs",
444 instance)
445 .c_str());
446 }
447
448 if (activeCount == 0)
449 {
450 // No OCCs are running
451
452 // Stop OCC poll timer
453 if (_pollTimer->isEnabled())
454 {
455 log<level::INFO>(
456 "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
457 _pollTimer->setEnabled(false);
458 }
459
460 #ifdef POWER10
461 // stop wait timer
462 if (waitForAllOccsTimer->isEnabled())
463 {
464 waitForAllOccsTimer->setEnabled(false);
465 }
466 #endif
467 }
468 #ifdef READ_OCC_SENSORS
469 // Clear OCC sensors
470 setSensorValueToNaN(instance);
471 #endif
472 }
473
474 #ifdef POWER10
475 if (waitingForAllOccActiveSensors)
476 {
477 if (utils::isHostRunning())
478 {
479 checkAllActiveSensors();
480 }
481 }
482 #endif
483 }
484
485 #ifdef I2C_OCC
initStatusObjects()486 void Manager::initStatusObjects()
487 {
488 // Make sure we have a valid path string
489 static_assert(sizeof(DEV_PATH) != 0);
490
491 auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
492 for (auto& name : deviceNames)
493 {
494 i2c_occ::i2cToDbus(name);
495 name = std::string(OCC_NAME) + '_' + name;
496 auto path = fs::path(OCC_CONTROL_ROOT) / name;
497 statusObjects.emplace_back(
498 std::make_unique<Status>(event, path.c_str(), *this));
499 }
500 // The first device is master occ
501 pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
502 *statusObjects.front());
503 #ifdef POWER10
504 pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
505 powermode::PIPS_PATH);
506 // Set the master OCC on the PowerMode object
507 pmode->setMasterOcc(path);
508 #endif
509 }
510 #endif
511
512 #ifdef PLDM
sbeTimeout(unsigned int instance)513 void Manager::sbeTimeout(unsigned int instance)
514 {
515 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
516 [instance](const auto& obj) {
517 return instance == obj->getOccInstanceID();
518 });
519
520 if (obj != statusObjects.end() && (*obj)->occActive())
521 {
522 log<level::INFO>(
523 std::format("SBE timeout, requesting HRESET (OCC{})", instance)
524 .c_str());
525
526 setSBEState(instance, SBE_STATE_NOT_USABLE);
527
528 pldmHandle->sendHRESET(instance);
529 }
530 }
531
updateOCCActive(instanceID instance,bool status)532 bool Manager::updateOCCActive(instanceID instance, bool status)
533 {
534 auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
535 [instance](const auto& obj) {
536 return instance == obj->getOccInstanceID();
537 });
538
539 const bool hostRunning = open_power::occ::utils::isHostRunning();
540 if (obj != statusObjects.end())
541 {
542 if (!hostRunning && (status == true))
543 {
544 log<level::WARNING>(
545 std::format(
546 "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received",
547 instance, status)
548 .c_str());
549 (*obj)->setPldmSensorReceived(false);
550 if (!waitingForAllOccActiveSensors)
551 {
552 log<level::INFO>(
553 "updateOCCActive: Waiting for Host and all OCC Active Sensors");
554 waitingForAllOccActiveSensors = true;
555 }
556 #ifdef POWER10
557 discoverTimer->restartOnce(30s);
558 #endif
559 return false;
560 }
561 else
562 {
563 (*obj)->setPldmSensorReceived(true);
564 return (*obj)->occActive(status);
565 }
566 }
567 else
568 {
569 if (hostRunning)
570 {
571 log<level::WARNING>(
572 std::format(
573 "updateOCCActive: No status object to update for OCC{} (active={})",
574 instance, status)
575 .c_str());
576 }
577 else
578 {
579 if (status == true)
580 {
581 log<level::WARNING>(
582 std::format(
583 "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})",
584 instance, status)
585 .c_str());
586 }
587 }
588 if (status == true)
589 {
590 // OCC went active
591 queuedActiveState.insert(instance);
592 }
593 else
594 {
595 auto match = queuedActiveState.find(instance);
596 if (match != queuedActiveState.end())
597 {
598 // OCC was disabled
599 queuedActiveState.erase(match);
600 }
601 }
602 return false;
603 }
604 }
605
606 // Called upon pldm event To set powermode Safe Mode State for system.
updateOccSafeMode(bool safeMode)607 void Manager::updateOccSafeMode(bool safeMode)
608 {
609 #ifdef POWER10
610 pmode->updateDbusSafeMode(safeMode);
611 #endif
612 // Update the processor throttle status on dbus
613 for (auto& obj : statusObjects)
614 {
615 obj->updateThrottle(safeMode, THROTTLED_SAFE);
616 }
617 }
618
sbeHRESETResult(instanceID instance,bool success)619 void Manager::sbeHRESETResult(instanceID instance, bool success)
620 {
621 if (success)
622 {
623 log<level::INFO>(
624 std::format("HRESET succeeded (OCC{})", instance).c_str());
625
626 setSBEState(instance, SBE_STATE_BOOTED);
627
628 return;
629 }
630
631 setSBEState(instance, SBE_STATE_FAILED);
632
633 if (sbeCanDump(instance))
634 {
635 log<level::INFO>(
636 std::format("HRESET failed (OCC{}), triggering SBE dump", instance)
637 .c_str());
638
639 auto& bus = utils::getBus();
640 uint32_t src6 = instance << 16;
641 uint32_t logId =
642 FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
643 src6, "SBE command timeout");
644
645 try
646 {
647 constexpr auto interface = "xyz.openbmc_project.Dump.Create";
648 constexpr auto function = "CreateDump";
649
650 std::string service =
651 utils::getService(OP_DUMP_OBJ_PATH, interface);
652 auto method = bus.new_method_call(service.c_str(), OP_DUMP_OBJ_PATH,
653 interface, function);
654
655 std::map<std::string, std::variant<std::string, uint64_t>>
656 createParams{
657 {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
658 uint64_t(logId)},
659 {"com.ibm.Dump.Create.CreateParameters.DumpType",
660 "com.ibm.Dump.Create.DumpType.SBE"},
661 {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
662 uint64_t(instance)},
663 };
664
665 method.append(createParams);
666
667 auto response = bus.call(method);
668 }
669 catch (const sdbusplus::exception_t& e)
670 {
671 constexpr auto ERROR_DUMP_DISABLED =
672 "xyz.openbmc_project.Dump.Create.Error.Disabled";
673 if (e.name() == ERROR_DUMP_DISABLED)
674 {
675 log<level::INFO>("Dump is disabled, skipping");
676 }
677 else
678 {
679 log<level::ERR>("Dump failed");
680 }
681 }
682 }
683 }
684
sbeCanDump(unsigned int instance)685 bool Manager::sbeCanDump(unsigned int instance)
686 {
687 struct pdbg_target* proc = getPdbgTarget(instance);
688
689 if (!proc)
690 {
691 // allow the dump in the error case
692 return true;
693 }
694
695 try
696 {
697 if (!openpower::phal::sbe::isDumpAllowed(proc))
698 {
699 return false;
700 }
701
702 if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
703 {
704 return false;
705 }
706 }
707 catch (openpower::phal::exception::SbeError& e)
708 {
709 log<level::INFO>("Failed to query SBE state");
710 }
711
712 // allow the dump in the error case
713 return true;
714 }
715
setSBEState(unsigned int instance,enum sbe_state state)716 void Manager::setSBEState(unsigned int instance, enum sbe_state state)
717 {
718 struct pdbg_target* proc = getPdbgTarget(instance);
719
720 if (!proc)
721 {
722 return;
723 }
724
725 try
726 {
727 openpower::phal::sbe::setState(proc, state);
728 }
729 catch (const openpower::phal::exception::SbeError& e)
730 {
731 log<level::ERR>(
732 std::format("Failed to set SBE state: {}", e.what()).c_str());
733 }
734 }
735
getPdbgTarget(unsigned int instance)736 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
737 {
738 if (!pdbgInitialized)
739 {
740 try
741 {
742 openpower::phal::pdbg::init();
743 pdbgInitialized = true;
744 }
745 catch (const openpower::phal::exception::PdbgError& e)
746 {
747 log<level::ERR>("pdbg initialization failed");
748 return nullptr;
749 }
750 }
751
752 struct pdbg_target* proc = nullptr;
753 pdbg_for_each_class_target("proc", proc)
754 {
755 if (pdbg_target_index(proc) == instance)
756 {
757 return proc;
758 }
759 }
760
761 log<level::ERR>("Failed to get pdbg target");
762 return nullptr;
763 }
764 #endif
765
pollerTimerExpired()766 void Manager::pollerTimerExpired()
767 {
768 if (!_pollTimer)
769 {
770 log<level::ERR>(
771 "Manager::pollerTimerExpired() ERROR: Timer not defined");
772 return;
773 }
774
775 for (auto& obj : statusObjects)
776 {
777 if (!obj->occActive())
778 {
779 // OCC is not running yet
780 #ifdef READ_OCC_SENSORS
781 auto id = obj->getOccInstanceID();
782 setSensorValueToNaN(id);
783 #endif
784 continue;
785 }
786
787 // Read sysfs to force kernel to poll OCC
788 obj->readOccState();
789
790 #ifdef READ_OCC_SENSORS
791 // Read occ sensor values
792 getSensorValues(obj);
793 #endif
794 }
795
796 if (activeCount > 0)
797 {
798 // Restart OCC poll timer
799 _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
800 }
801 else
802 {
803 // No OCCs running, so poll timer will not be restarted
804 log<level::INFO>(
805 std::format(
806 "Manager::pollerTimerExpired: poll timer will not be restarted")
807 .c_str());
808 }
809 }
810
811 #ifdef READ_OCC_SENSORS
readTempSensors(const fs::path & path,uint32_t occInstance)812 void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
813 {
814 // There may be more than one sensor with the same FRU type
815 // and label so make two passes: the first to read the temps
816 // from sysfs, and the second to put them on D-Bus after
817 // resolving any conflicts.
818 std::map<std::string, double> sensorData;
819
820 std::regex expr{"temp\\d+_label$"}; // Example: temp5_label
821 for (auto& file : fs::directory_iterator(path))
822 {
823 if (!std::regex_search(file.path().string(), expr))
824 {
825 continue;
826 }
827
828 uint32_t labelValue{0};
829
830 try
831 {
832 labelValue = readFile<uint32_t>(file.path());
833 }
834 catch (const std::system_error& e)
835 {
836 log<level::DEBUG>(
837 std::format("readTempSensors: Failed reading {}, errno = {}",
838 file.path().string(), e.code().value())
839 .c_str());
840 continue;
841 }
842
843 const std::string& tempLabel = "label";
844 const std::string filePathString = file.path().string().substr(
845 0, file.path().string().length() - tempLabel.length());
846
847 uint32_t fruTypeValue{0};
848 try
849 {
850 fruTypeValue = readFile<uint32_t>(filePathString + fruTypeSuffix);
851 }
852 catch (const std::system_error& e)
853 {
854 log<level::DEBUG>(
855 std::format("readTempSensors: Failed reading {}, errno = {}",
856 filePathString + fruTypeSuffix, e.code().value())
857 .c_str());
858 continue;
859 }
860
861 std::string sensorPath =
862 OCC_SENSORS_ROOT + std::string("/temperature/");
863
864 std::string dvfsTempPath;
865
866 if (fruTypeValue == VRMVdd)
867 {
868 sensorPath.append(
869 "vrm_vdd" + std::to_string(occInstance) + "_temp");
870 }
871 else if (fruTypeValue == processorIoRing)
872 {
873 sensorPath.append(
874 "proc" + std::to_string(occInstance) + "_ioring_temp");
875 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
876 std::to_string(occInstance) + "_ioring_dvfs_temp";
877 }
878 else
879 {
880 uint16_t type = (labelValue & 0xFF000000) >> 24;
881 uint16_t instanceID = labelValue & 0x0000FFFF;
882
883 if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
884 {
885 if (fruTypeValue == fruTypeNotAvailable)
886 {
887 // Not all DIMM related temps are available to read
888 // (no _input file in this case)
889 continue;
890 }
891 auto iter = dimmTempSensorName.find(fruTypeValue);
892 if (iter == dimmTempSensorName.end())
893 {
894 log<level::ERR>(
895 std::format(
896 "readTempSensors: Fru type error! fruTypeValue = {}) ",
897 fruTypeValue)
898 .c_str());
899 continue;
900 }
901
902 sensorPath.append(
903 "dimm" + std::to_string(instanceID) + iter->second);
904
905 dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/" +
906 dimmDVFSSensorName.at(fruTypeValue);
907 }
908 else if (type == OCC_CPU_TEMP_SENSOR_TYPE)
909 {
910 if (fruTypeValue == processorCore)
911 {
912 // The OCC reports small core temps, of which there are
913 // two per big core. All current P10 systems are in big
914 // core mode, so use a big core name.
915 uint16_t coreNum = instanceID / 2;
916 uint16_t tempNum = instanceID % 2;
917 sensorPath.append("proc" + std::to_string(occInstance) +
918 "_core" + std::to_string(coreNum) + "_" +
919 std::to_string(tempNum) + "_temp");
920
921 dvfsTempPath =
922 std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
923 std::to_string(occInstance) + "_core_dvfs_temp";
924 }
925 else
926 {
927 continue;
928 }
929 }
930 else
931 {
932 continue;
933 }
934 }
935
936 // The dvfs temp file only needs to be read once per chip per type.
937 if (!dvfsTempPath.empty() &&
938 !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
939 {
940 try
941 {
942 auto dvfsValue = readFile<double>(filePathString + maxSuffix);
943
944 dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
945 dvfsTempPath, dvfsValue * std::pow(10, -3));
946 }
947 catch (const std::system_error& e)
948 {
949 log<level::DEBUG>(
950 std::format(
951 "readTempSensors: Failed reading {}, errno = {}",
952 filePathString + maxSuffix, e.code().value())
953 .c_str());
954 }
955 }
956
957 uint32_t faultValue{0};
958 try
959 {
960 faultValue = readFile<uint32_t>(filePathString + faultSuffix);
961 }
962 catch (const std::system_error& e)
963 {
964 log<level::DEBUG>(
965 std::format("readTempSensors: Failed reading {}, errno = {}",
966 filePathString + faultSuffix, e.code().value())
967 .c_str());
968 continue;
969 }
970
971 double tempValue{0};
972 // NOTE: if OCC sends back 0xFF, kernal sets this fault value to 1.
973 if (faultValue != 0)
974 {
975 tempValue = std::numeric_limits<double>::quiet_NaN();
976 }
977 else
978 {
979 // Read the temperature
980 try
981 {
982 tempValue = readFile<double>(filePathString + inputSuffix);
983 }
984 catch (const std::system_error& e)
985 {
986 log<level::DEBUG>(
987 std::format(
988 "readTempSensors: Failed reading {}, errno = {}",
989 filePathString + inputSuffix, e.code().value())
990 .c_str());
991
992 // if errno == EAGAIN(Resource temporarily unavailable) then set
993 // temp to 0, to avoid using old temp, and affecting FAN
994 // Control.
995 if (e.code().value() == EAGAIN)
996 {
997 tempValue = 0;
998 }
999 // else the errno would be something like
1000 // EBADF(Bad file descriptor)
1001 // or ENOENT(No such file or directory)
1002 else
1003 {
1004 continue;
1005 }
1006 }
1007 }
1008
1009 // If this object path already has a value, only overwite
1010 // it if the previous one was an NaN or a smaller value.
1011 auto existing = sensorData.find(sensorPath);
1012 if (existing != sensorData.end())
1013 {
1014 // Multiple sensors found for this FRU type
1015 if ((std::isnan(existing->second) && (tempValue == 0)) ||
1016 ((existing->second == 0) && std::isnan(tempValue)))
1017 {
1018 // One of the redundant sensors has failed (0xFF/nan), and the
1019 // other sensor has no reading (0), so set the FRU to NaN to
1020 // force fan increase
1021 tempValue = std::numeric_limits<double>::quiet_NaN();
1022 existing->second = tempValue;
1023 }
1024 if (std::isnan(existing->second) || (tempValue > existing->second))
1025 {
1026 existing->second = tempValue;
1027 }
1028 }
1029 else
1030 {
1031 // First sensor for this FRU type
1032 sensorData[sensorPath] = tempValue;
1033 }
1034 }
1035
1036 // Now publish the values on D-Bus.
1037 for (const auto& [objectPath, value] : sensorData)
1038 {
1039 dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
1040 value * std::pow(10, -3));
1041
1042 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1043 objectPath, !std::isnan(value));
1044
1045 if (existingSensors.find(objectPath) == existingSensors.end())
1046 {
1047 dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1048 objectPath);
1049 }
1050
1051 existingSensors[objectPath] = occInstance;
1052 }
1053 }
1054
1055 std::optional<std::string>
getPowerLabelFunctionID(const std::string & value)1056 Manager::getPowerLabelFunctionID(const std::string& value)
1057 {
1058 // If the value is "system", then the FunctionID is "system".
1059 if (value == "system")
1060 {
1061 return value;
1062 }
1063
1064 // If the value is not "system", then the label value have 3 numbers, of
1065 // which we only care about the middle one:
1066 // <sensor id>_<function id>_<apss channel>
1067 // eg: The value is "0_10_5" , then the FunctionID is "10".
1068 if (value.find("_") == std::string::npos)
1069 {
1070 return std::nullopt;
1071 }
1072
1073 auto powerLabelValue = value.substr((value.find("_") + 1));
1074
1075 if (powerLabelValue.find("_") == std::string::npos)
1076 {
1077 return std::nullopt;
1078 }
1079
1080 return powerLabelValue.substr(0, powerLabelValue.find("_"));
1081 }
1082
readPowerSensors(const fs::path & path,uint32_t id)1083 void Manager::readPowerSensors(const fs::path& path, uint32_t id)
1084 {
1085 std::regex expr{"power\\d+_label$"}; // Example: power5_label
1086 for (auto& file : fs::directory_iterator(path))
1087 {
1088 if (!std::regex_search(file.path().string(), expr))
1089 {
1090 continue;
1091 }
1092
1093 std::string labelValue;
1094 try
1095 {
1096 labelValue = readFile<std::string>(file.path());
1097 }
1098 catch (const std::system_error& e)
1099 {
1100 log<level::DEBUG>(
1101 std::format("readPowerSensors: Failed reading {}, errno = {}",
1102 file.path().string(), e.code().value())
1103 .c_str());
1104 continue;
1105 }
1106
1107 auto functionID = getPowerLabelFunctionID(labelValue);
1108 if (functionID == std::nullopt)
1109 {
1110 continue;
1111 }
1112
1113 const std::string& tempLabel = "label";
1114 const std::string filePathString = file.path().string().substr(
1115 0, file.path().string().length() - tempLabel.length());
1116
1117 std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
1118
1119 auto iter = powerSensorName.find(*functionID);
1120 if (iter == powerSensorName.end())
1121 {
1122 continue;
1123 }
1124 sensorPath.append(iter->second);
1125
1126 double tempValue{0};
1127
1128 try
1129 {
1130 tempValue = readFile<double>(filePathString + inputSuffix);
1131 }
1132 catch (const std::system_error& e)
1133 {
1134 log<level::DEBUG>(
1135 std::format("readPowerSensors: Failed reading {}, errno = {}",
1136 filePathString + inputSuffix, e.code().value())
1137 .c_str());
1138 continue;
1139 }
1140
1141 dbus::OccDBusSensors::getOccDBus().setUnit(
1142 sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");
1143
1144 dbus::OccDBusSensors::getOccDBus().setValue(
1145 sensorPath, tempValue * std::pow(10, -3) * std::pow(10, -3));
1146
1147 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1148 sensorPath, true);
1149
1150 if (existingSensors.find(sensorPath) == existingSensors.end())
1151 {
1152 dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
1153 sensorPath);
1154 }
1155
1156 existingSensors[sensorPath] = id;
1157 }
1158 return;
1159 }
1160
setSensorValueToNaN(uint32_t id) const1161 void Manager::setSensorValueToNaN(uint32_t id) const
1162 {
1163 for (const auto& [sensorPath, occId] : existingSensors)
1164 {
1165 if (occId == id)
1166 {
1167 dbus::OccDBusSensors::getOccDBus().setValue(
1168 sensorPath, std::numeric_limits<double>::quiet_NaN());
1169
1170 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1171 sensorPath, true);
1172 }
1173 }
1174 return;
1175 }
1176
setSensorValueToNonFunctional(uint32_t id) const1177 void Manager::setSensorValueToNonFunctional(uint32_t id) const
1178 {
1179 for (const auto& [sensorPath, occId] : existingSensors)
1180 {
1181 if (occId == id)
1182 {
1183 dbus::OccDBusSensors::getOccDBus().setValue(
1184 sensorPath, std::numeric_limits<double>::quiet_NaN());
1185
1186 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
1187 sensorPath, false);
1188 }
1189 }
1190 return;
1191 }
1192
getSensorValues(std::unique_ptr<Status> & occ)1193 void Manager::getSensorValues(std::unique_ptr<Status>& occ)
1194 {
1195 static bool tracedError[8] = {0};
1196 const fs::path sensorPath = occ->getHwmonPath();
1197 const uint32_t id = occ->getOccInstanceID();
1198
1199 if (fs::exists(sensorPath))
1200 {
1201 // Read temperature sensors
1202 readTempSensors(sensorPath, id);
1203
1204 if (occ->isMasterOcc())
1205 {
1206 // Read power sensors
1207 readPowerSensors(sensorPath, id);
1208 }
1209 tracedError[id] = false;
1210 }
1211 else
1212 {
1213 if (!tracedError[id])
1214 {
1215 log<level::ERR>(
1216 std::format(
1217 "Manager::getSensorValues: OCC{} sensor path missing: {}",
1218 id, sensorPath.c_str())
1219 .c_str());
1220 tracedError[id] = true;
1221 }
1222 }
1223
1224 return;
1225 }
1226 #endif
1227
// Read the altitude from DBus
// A reading below 0xFFFF is clamped at zero, rounded to the nearest meter and
// stored in altitude. Any other reading is traced once and altitude is left
// untouched. If the property read throws an sdbusplus exception, altitude is
// set to 0xFFFF (not available).
void Manager::readAltitude()
{
    // true  - the next failure will be traced
    // false - a failure was already traced, stay quiet until a good reading
    static bool traceAltitudeErr = true;

    try
    {
        const utils::PropertyValue property = utils::getProperty(
            ALTITUDE_PATH, ALTITUDE_INTERFACE, ALTITUDE_PROP);
        const auto reading = std::get<double>(property);

        // Negated comparison so that NaN also takes the invalid path
        if (!(reading < 0xFFFF))
        {
            if (traceAltitudeErr)
            {
                traceAltitudeErr = false;
                log<level::DEBUG>(
                    std::format("Invalid altitude value: {}", reading)
                        .c_str());
            }
            return;
        }

        // Negative readings become 0, everything else is rounded to the
        // nearest meter
        altitude = (reading < 0) ? static_cast<uint16_t>(0)
                                 : static_cast<uint16_t>(reading + 0.5);
        log<level::DEBUG>(std::format("readAltitude: sensor={} ({}m)",
                                      reading, altitude)
                              .c_str());
        traceAltitudeErr = true;
    }
    catch (const sdbusplus::exception_t& e)
    {
        if (traceAltitudeErr)
        {
            traceAltitudeErr = false;
            log<level::INFO>(
                std::format("Unable to read Altitude: {}", e.what()).c_str());
        }
        altitude = 0xFFFF; // not available
    }
}
1277
// Callback function when ambient temperature changes
// Reads the changed properties from the message, converts the ambient
// temperature to a single byte (rounded to the nearest degree C, negative
// readings clamped to 0, 0xFF when no usable reading is available) and, when
// that byte differs from the stored ambient value, stores it and (POWER10)
// sends ambient and altitude to every active OCC.
//
// @param[in] msg - properties-changed message carrying the sensor interface
//                  name and the changed property map
void Manager::ambientCallback(sdbusplus::message_t& msg)
{
    std::string msgSensor;
    std::map<std::string, std::variant<double>> msgData;
    msg.read(msgSensor, msgData);

    const auto valPropMap = msgData.find(AMBIENT_PROP);
    if (valPropMap == msgData.end())
    {
        log<level::DEBUG>("ambientCallback: Unknown ambient property changed");
        return;
    }
    const double currentTemp = std::get<double>(valPropMap->second);

    // Readings at or above this limit would round to 0xFF or beyond. 0xFF is
    // the "not available" marker and anything larger does not fit in a
    // uint8_t (the conversion would be undefined behaviour), so all of them
    // are reported as not available.
    constexpr double notAvailableLimit = 254.5;

    uint8_t truncatedTemp = 0xFF;
    if (currentTemp < 0)
    {
        truncatedTemp = 0;
    }
    else if (currentTemp < notAvailableLimit)
    {
        // Round to nearest degree C
        truncatedTemp = static_cast<uint8_t>(currentTemp + 0.5);
    }
    // else: NaN or out of range, keep 0xFF

    // If ambient changes, notify OCCs
    if (truncatedTemp != ambient)
    {
        log<level::DEBUG>(
            std::format("ambientCallback: Ambient change from {} to {}C",
                        ambient, currentTemp)
                .c_str());

        ambient = truncatedTemp;
        if (altitude == 0xFFFF)
        {
            // No altitude yet, try reading again
            readAltitude();
        }

        log<level::DEBUG>(
            std::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient,
                        altitude)
                .c_str());
#ifdef POWER10
        // Send ambient and altitude to all OCCs
        for (auto& obj : statusObjects)
        {
            if (obj->occActive())
            {
                obj->sendAmbient(ambient, altitude);
            }
        }
#endif // POWER10
    }
}
1342
1343 // return the current ambient and altitude readings
getAmbientData(bool & ambientValid,uint8_t & ambientTemp,uint16_t & altitudeValue) const1344 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1345 uint16_t& altitudeValue) const
1346 {
1347 ambientValid = true;
1348 ambientTemp = ambient;
1349 altitudeValue = altitude;
1350
1351 if (ambient == 0xFF)
1352 {
1353 ambientValid = false;
1354 }
1355 }
1356
1357 #ifdef POWER10
// Called when waitForAllOccsTimer expires
// After the first OCC goes active, this timer will be started (60 seconds)
// Traces a warning when the number of active OCCs differs from the number of
// status objects, then validates the master OCC in either case.
void Manager::occsNotAllRunning()
{
    const auto expectedCount = statusObjects.size();
    if (activeCount != expectedCount)
    {
        // Not all OCCs went active
        // Procs may be garded, so may be expected
        log<level::WARNING>(
            std::format(
                "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})",
                activeCount, expectedCount)
                .c_str());
    }

    validateOccMaster();
}
1375
1376 #ifdef PLDM
1377 // Called when throttlePldmTraceTimer expires.
1378 // If this timer expires, that indicates there are no OCC active sensor PDRs
1379 // found which will trigger pldm traces to be throttled.
1380 // The second time this timer expires, a PEL will get created.
throttlePldmTraceExpired()1381 void Manager::throttlePldmTraceExpired()
1382 {
1383 if (utils::isHostRunning())
1384 {
1385 if (!onPldmTimeoutCreatePel)
1386 {
1387 // Throttle traces
1388 pldmHandle->setTraceThrottle(true);
1389 // Restart timer to log a PEL when timer expires
1390 onPldmTimeoutCreatePel = true;
1391 throttlePldmTraceTimer->restartOnce(40min);
1392 }
1393 else
1394 {
1395 log<level::ERR>(
1396 "throttlePldmTraceExpired(): OCC active sensors still not available!");
1397 // Create PEL
1398 createPldmSensorPEL();
1399 }
1400 }
1401 else
1402 {
1403 // Make sure traces are not throttled
1404 pldmHandle->setTraceThrottle(false);
1405 log<level::INFO>(
1406 "throttlePldmTraceExpired(): host it not running ignoring sensor timer");
1407 }
1408 }
1409
// Create a PEL reporting that the PLDM sensors for the OCCs were not found.
// The PEL is requested through the CreatePELWithFFDCFiles method of the
// org.open_power.Logging.PEL interface with severity Warning and carries the
// process id plus occ-control journal entries as FFDC. A failing D-Bus call
// is traced and otherwise ignored.
void Manager::createPldmSensorPEL()
{
    static constexpr auto loggingObjectPath = "/xyz/openbmc_project/logging";
    static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL";

    Error::Descriptor descriptor(MISSING_OCC_SENSORS_PATH);
    std::map<std::string, std::string> additionalData{
        {"_PID", std::to_string(getpid())}};

    log<level::INFO>(
        "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs");

    auto& bus = utils::getBus();

    try
    {
        FFDCFiles ffdc;
        // Add occ-control journal traces to PEL FFDC
        // NOTE(review): the returned object is kept in scope until the call
        // below has been made - presumably it owns the FFDC file; confirm.
        auto occJournalFile =
            FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40);

        const std::string service =
            utils::getService(loggingObjectPath, opLoggingInterface);
        auto request =
            bus.new_method_call(service.c_str(), loggingObjectPath,
                                opLoggingInterface, "CreatePELWithFFDCFiles");

        // Set level to Warning (Predictive).
        auto severity =
            sdbusplus::xyz::openbmc_project::Logging::server::convertForMessage(
                sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level::
                    Warning);

        request.append(descriptor.path, severity, additionalData, ffdc);
        bus.call(request);
    }
    catch (const sdbusplus::exception_t& e)
    {
        log<level::ERR>(
            std::format("Failed to create MISSING_OCC_SENSORS PEL: {}",
                        e.what())
                .c_str());
    }
}
1457 #endif // PLDM
1458 #endif // POWER10
1459
// Verify single master OCC and start presence monitor
//
// For every status object:
//  - (POWER10) if the OCC is not active while the host is running, either
//    consume a queued active state for that instance or (PLDM) ask for the
//    active sensor to be checked; if the host is not running the validation
//    is abandoned.
//  - if the OCC reports itself as master, add the master presence watch and
//    remember the first master instance; every further master is traced as an
//    error and a reset is requested through deviceError().
// When no master was found an error is traced and a reset is requested on the
// first status object (if there is one). Otherwise the master is traced and
// (POWER10) the D-Bus safe mode is cleared.
void Manager::validateOccMaster()
{
    int masterInstance = -1;
    for (auto& obj : statusObjects)
    {
        auto instance = obj->getOccInstanceID();
#ifdef POWER10
        if (!obj->occActive())
        {
            if (utils::isHostRunning())
            {
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    log<level::INFO>(
                        std::format(
                            "validateOccMaster: OCC{} is ACTIVE (queued)",
                            instance)
                            .c_str());
                    obj->occActive(true);
                }
                else
                {
                    // OCC does not appear to be active yet, check active sensor
#ifdef PLDM
                    pldmHandle->checkActiveSensor(instance);
#endif
                    if (obj->occActive())
                    {
                        log<level::INFO>(
                            std::format(
                                "validateOccMaster: OCC{} is ACTIVE after reading sensor",
                                instance)
                                .c_str());
                    }
                }
            }
            else
            {
                log<level::WARNING>(
                    std::format(
                        "validateOccMaster: HOST is not running (OCC{})",
                        instance)
                        .c_str());
                return;
            }
        }
#endif // POWER10

        if (obj->isMasterOcc())
        {
            obj->addPresenceWatchMaster();

            if (masterInstance == -1)
            {
                masterInstance = instance;
            }
            else
            {
                log<level::ERR>(
                    std::format(
                        "validateOccMaster: Multiple OCC masters! ({} and {})",
                        masterInstance, instance)
                        .c_str());
                // request reset
                obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
            }
        }
    }

    if (masterInstance < 0)
    {
        log<level::ERR>(
            std::format("validateOccMaster: Master OCC not found! (of {} OCCs)",
                        statusObjects.size())
                .c_str());
        // request reset
        // With no status objects there is nothing to request the reset on;
        // front() on an empty container would be undefined behaviour.
        if (!statusObjects.empty())
        {
            statusObjects.front()->deviceError(
                Error::Descriptor(PRESENCE_ERROR_PATH));
        }
    }
    else
    {
        log<level::INFO>(
            std::format("validateOccMaster: OCC{} is master of {} OCCs",
                        masterInstance, activeCount)
                .c_str());
#ifdef POWER10
        pmode->updateDbusSafeMode(false);
#endif
    }
}
1554
updatePcapBounds() const1555 void Manager::updatePcapBounds() const
1556 {
1557 if (pcap)
1558 {
1559 pcap->updatePcapBounds();
1560 }
1561 }
1562
1563 } // namespace occ
1564 } // namespace open_power
1565