1 #include "config.h"
2
3 #include "occ_manager.hpp"
4
5 #include "i2c_occ.hpp"
6 #include "occ_dbus.hpp"
7 #include "occ_errors.hpp"
8 #include "utils.hpp"
9
10 #include <phosphor-logging/elog-errors.hpp>
11 #include <phosphor-logging/log.hpp>
12 #include <xyz/openbmc_project/Common/error.hpp>
13
14 #include <chrono>
15 #include <cmath>
16 #include <filesystem>
17 #include <fstream>
18 #include <regex>
19
20 namespace open_power
21 {
22 namespace occ
23 {
24
// fru_type value that readTempSensors() treats as "no readable temperature"
// for DIMM related sensors.
constexpr uint32_t fruTypeNotAvailable = 0xFF;

// Suffixes appended to a hwmon sensor file prefix (the label file name with
// its trailing "label" removed) to name that sensor's other attribute files.
constexpr const char* fruTypeSuffix = "fru_type";
constexpr const char* faultSuffix = "fault";
constexpr const char* inputSuffix = "input";
constexpr const char* maxSuffix = "max";

// File whose existence makes findAndCreateObjects() postpone OCC discovery.
const char* const HOST_ON_FILE = "/run/openbmc/host@0-on";
32
33 using namespace phosphor::logging;
34 using namespace std::literals::chrono_literals;
35
// Read one value of type T from the start of a file using operator>>.
//
// path: file to read.
// Returns the extracted value.
// Throws std::system_error (generic category) if the file cannot be opened,
// the value cannot be extracted, or end-of-file is reached while extracting.
// The error code is errno when a system call reported one, otherwise EIO,
// so callers never receive a "success" (0) error code.
template <typename T>
T readFile(const std::string& path)
{
    std::ifstream ifs;
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit |
                   std::ifstream::eofbit);
    T data;

    // Drop any stale errno left behind by an earlier, unrelated call so it
    // cannot be mistaken for the cause of a failure in this read.
    errno = 0;

    try
    {
        ifs.open(path);
        ifs >> data;
        ifs.close();
    }
    catch (const std::exception&)
    {
        // Stream failures that are not backed by a failing system call
        // (for example unparsable contents) leave errno at 0.
        const auto err = (errno != 0) ? errno : EIO;
        throw std::system_error(err, std::generic_category());
    }

    return data;
}
58
// findAndCreateObjects():
// Drives discovery of the OCC devices and creation of their objects.
// (the original comments state it is called every time the discoverTimer
// expires)
// Without POWER10: creates the objects for MAX_CPUS OCCs and is done.
// With POWER10:
// - creates the PowerMode object if it does not exist yet
// - if HOST_ON_FILE exists, logs and retries in 10 seconds
// - otherwise creates the status objects once the OCC devices found in
//   /dev are non-empty and their count matches the previous search
// - while OCC Active sensors are still outstanding, checks them when the
//   host is running, or retries in 30 seconds when it is not
void Manager::findAndCreateObjects()
{
#ifndef POWER10
    // One OCC per cpu
    for (auto cpu = 0; cpu < MAX_CPUS; ++cpu)
    {
        createObjects(std::string(OCC_NAME) + std::to_string(cpu));
    }
#else
    // Both flags persist across calls
    static bool statusObjCreated = false;
    static bool tracedHostWait = false;

    if (!pmode)
    {
        // Power mode object is created only once
        pmode = std::make_unique<powermode::PowerMode>(
            *this, powermode::PMODE_PATH, powermode::PIPS_PATH, event);
    }

    if (fs::exists(HOST_ON_FILE))
    {
        log<level::INFO>(
            std::format(
                "Manager::findAndCreateObjects(): Waiting for {} to complete...",
                HOST_ON_FILE)
                .c_str());
        discoverTimer->restartOnce(10s);
        return;
    }

    if (!statusObjCreated)
    {
        // Base the OCC list on the /dev/occX devices
        auto found = findOCCsInDev();

        // The first pass always compares against an empty prevOCCSearch,
        // so there is at least one delay before anything is created.
        const bool settled = !found.empty() &&
                             (prevOCCSearch.size() == found.size());
        if (settled)
        {
            // createObjects requires OCC0 first.
            std::sort(found.begin(), found.end());

            log<level::INFO>(
                std::format(
                    "Manager::findAndCreateObjects(): Creating {} OCC Status Objects",
                    found.size())
                    .c_str());
            for (const auto instance : found)
            {
                createObjects(std::string(OCC_NAME) +
                              std::to_string(instance));
            }
            statusObjCreated = true;
            waitingForAllOccActiveSensors = true;

            // Find/update the processor path associated with each OCC
            for (auto& statusObj : statusObjects)
            {
                statusObj->updateProcAssociation();
            }
        }
        else
        {
            // Nothing found yet, or the count changed: look again in 10s
            prevOCCSearch = found;

            log<level::INFO>(
                std::format(
                    "Manager::findAndCreateObjects(): Waiting for OCCs (currently {})",
                    found.size())
                    .c_str());

            discoverTimer->restartOnce(10s);
        }
    }

    if (!(statusObjCreated && waitingForAllOccActiveSensors))
    {
        return;
    }

    if (utils::isHostRunning())
    {
        if (tracedHostWait)
        {
            log<level::INFO>(
                "Manager::findAndCreateObjects(): Host is running");
            tracedHostWait = false;
        }
        checkAllActiveSensors();
        return;
    }

    // Host is not running; trace the wait only once
    if (!tracedHostWait)
    {
        log<level::INFO>(
            "Manager::findAndCreateObjects(): Waiting for host to start");
        tracedHostWait = true;
    }
    discoverTimer->restartOnce(30s);
#ifdef PLDM
    if (throttlePldmTraceTimer->isEnabled())
    {
        // Host is no longer running, disable throttle timer and
        // make sure traces are not throttled
        log<level::INFO>("findAndCreateObjects(): disabling sensor timer");
        throttlePldmTraceTimer->setEnabled(false);
        pldmHandle->setTraceThrottle(false);
    }
#endif
#endif
}
182
183 #ifdef POWER10
// Check whether the OCC Active state is known for every status object.
// While the host is running, each object that is neither active nor has
// received its PLDM sensor is either activated from queuedActiveState or
// causes the check to stop at that object. When every object passes, the
// discovery timer is disabled; otherwise it is restarted for 10 seconds.
// The three flags below persist across calls.
void Manager::checkAllActiveSensors()
{
    static bool allActiveSensorAvailable = false;
    static bool tracedSensorWait = false;
    static bool waitingForHost = false;

    if (!utils::isHostRunning())
    {
        // Trace the transition to "waiting for host" only once
        if (!waitingForHost)
        {
            waitingForHost = true;
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for host to start");
#ifdef PLDM
            if (throttlePldmTraceTimer->isEnabled())
            {
                // Host is no longer running, disable throttle timer and
                // make sure traces are not throttled
                log<level::INFO>(
                    "checkAllActiveSensors(): disabling sensor timer");
                throttlePldmTraceTimer->setEnabled(false);
                pldmHandle->setTraceThrottle(false);
            }
#endif
        }
    }
    else
    {
        if (waitingForHost)
        {
            waitingForHost = false;
            log<level::INFO>("checkAllActiveSensors(): Host is now running");
        }

        // Assume every sensor is available until one is found missing
        allActiveSensorAvailable = true;
        for (auto& statusObj : statusObjects)
        {
            if (statusObj->occActive() || statusObj->getPldmSensorReceived())
            {
                continue;
            }

            const auto instance = statusObj->getOccInstanceID();

            // The sensor may have been queued while waiting for discovery
            const auto queued = queuedActiveState.find(instance);
            if (queued != queuedActiveState.end())
            {
                queuedActiveState.erase(queued);
                log<level::INFO>(
                    std::format(
                        "checkAllActiveSensors(): OCC{} is ACTIVE (queued)",
                        instance)
                        .c_str());
                statusObj->occActive(true);
                continue;
            }

            // This OCC's sensor is still missing: stop at the first one
            allActiveSensorAvailable = false;
            if (!tracedSensorWait)
            {
                log<level::INFO>(
                    std::format(
                        "checkAllActiveSensors(): Waiting on OCC{} Active sensor",
                        instance)
                        .c_str());
                tracedSensorWait = true;
#ifdef PLDM
                // Make sure PLDM traces are not throttled
                pldmHandle->setTraceThrottle(false);
                // Start timer to throttle PLDM traces when timer
                // expires
                onPldmTimeoutCreatePel = false;
                throttlePldmTraceTimer->restartOnce(5min);
#endif
            }
#ifdef PLDM
            pldmHandle->checkActiveSensor(statusObj->getOccInstanceID());
#endif
            break;
        }
    }

    if (!allActiveSensorAvailable)
    {
        // Not all sensors were available, so keep waiting
        if (!tracedSensorWait)
        {
            log<level::INFO>(
                "checkAllActiveSensors(): Waiting for OCC Active sensors to become available");
            tracedSensorWait = true;
        }
        discoverTimer->restartOnce(10s);
        return;
    }

    // All sensors were found, disable the discovery timer
    if (discoverTimer->isEnabled())
    {
        discoverTimer->setEnabled(false);
    }
#ifdef PLDM
    if (throttlePldmTraceTimer->isEnabled())
    {
        // Disable throttle timer and make sure traces are not throttled
        throttlePldmTraceTimer->setEnabled(false);
        pldmHandle->setTraceThrottle(false);
    }
#endif
    if (waitingForAllOccActiveSensors)
    {
        log<level::INFO>(
            "checkAllActiveSensors(): OCC Active sensors are available");
        waitingForAllOccActiveSensors = false;
    }
    queuedActiveState.clear();
    tracedSensorWait = false;
}
303 #endif
304
findOCCsInDev()305 std::vector<int> Manager::findOCCsInDev()
306 {
307 std::vector<int> occs;
308 std::regex expr{R"(occ(\d+)$)"};
309
310 for (auto& file : fs::directory_iterator("/dev"))
311 {
312 std::smatch match;
313 std::string path{file.path().string()};
314 if (std::regex_search(path, match, expr))
315 {
316 auto num = std::stoi(match[1].str());
317
318 // /dev numbering starts at 1, ours starts at 0.
319 occs.push_back(num - 1);
320 }
321 }
322
323 return occs;
324 }
325
// Handle a message carrying the object path of a newly created CPU.
// The last element of the path has its CPU_NAME part replaced by OCC_NAME
// and the OCC objects are created under that name.
//
// msg: message whose first item is the CPU object path.
// Returns 0 in all cases. If the last path element does not contain
// CPU_NAME, an error is logged and no objects are created.
int Manager::cpuCreated(sdbusplus::message_t& msg)
{
    namespace fs = std::filesystem;

    sdbusplus::message::object_path o;
    msg.read(o);
    fs::path cpuPath(std::string(std::move(o)));

    auto name = cpuPath.filename().string();
    const auto index = name.find(CPU_NAME);
    if (index == std::string::npos)
    {
        // replace() with npos would throw std::out_of_range out of this
        // callback, so reject unexpected object names explicitly.
        log<level::ERR>(
            std::format(
                "Manager::cpuCreated(): {} not found in object name {}",
                CPU_NAME, name)
                .c_str());
        return 0;
    }
    name.replace(index, std::strlen(CPU_NAME), OCC_NAME);

    createObjects(name);

    return 0;
}
342
// Create the Status and PassThrough objects for one OCC.
// The PowerCap object is created from the first Status object that this
// function sees while pcap is still unset. When the new Status object
// reports being the master OCC, the poll timer is disabled and (POWER10)
// the master path is passed to the PowerMode object.
//
// occ: object name appended to OCC_CONTROL_ROOT to form the object path.
void Manager::createObjects(const std::string& occ)
{
    auto objPath = fs::path(OCC_CONTROL_ROOT) / occ;

    statusObjects.emplace_back(std::make_unique<Status>(
        event, objPath.c_str(), *this,
#ifdef POWER10
        pmode,
#endif
        std::bind(&Manager::statusCallBack, this, std::placeholders::_1,
                  std::placeholders::_2)
#ifdef PLDM
            ,
        std::bind(&pldm::Interface::resetOCC, pldmHandle.get(),
                  std::placeholders::_1)
#endif
            ));

    auto& newStatus = statusObjects.back();

    // Only one power cap monitor object is ever created
    if (!pcap)
    {
        pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
            *newStatus);
    }

    if (newStatus->isMasterOcc())
    {
        log<level::INFO>(
            std::format("Manager::createObjects(): OCC{} is the master",
                        newStatus->getOccInstanceID())
                .c_str());
        _pollTimer->setEnabled(false);

#ifdef POWER10
        // Set the master OCC on the PowerMode object
        pmode->setMasterOcc(objPath);
#endif
    }

    passThroughObjects.emplace_back(
        std::make_unique<PassThrough>(objPath.c_str()
#ifdef POWER10
                                          ,
                                      pmode
#endif
                                      ));
}
389
// Track an OCC becoming active or inactive.
//
// instance: OCC instance the change applies to.
// status:   true when the OCC went active, false when it went away.
//
// activeCount is adjusted accordingly. The poll timer is started when an
// OCC goes active while it is not running, and stopped when the count
// drops to zero. With POWER10, waitForAllOccsTimer is started for 60
// seconds on the first active OCC and stopped once all, or none, of the
// OCCs are active.
void Manager::statusCallBack(instanceID instance, bool status)
{
    if (!status)
    {
        // OCC went away
        if (activeCount > 0)
        {
            --activeCount;
        }
        else
        {
            log<level::ERR>(
                std::format("OCC{} disabled, but currently no active OCCs",
                            instance)
                    .c_str());
        }

        if (activeCount == 0)
        {
            // No OCCs are running, so polling is pointless
            if (_pollTimer->isEnabled())
            {
                log<level::INFO>(
                    "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
                _pollTimer->setEnabled(false);
            }

#ifdef POWER10
            // stop wait timer
            if (waitForAllOccsTimer->isEnabled())
            {
                waitForAllOccsTimer->setEnabled(false);
            }
#endif
        }
#ifdef READ_OCC_SENSORS
        // Clear OCC sensors
        setSensorValueToNaN(instance);
#endif
    }
    else
    {
        // OCC went active
        ++activeCount;

#ifdef POWER10
        if (activeCount == 1)
        {
            // First OCC went active (allow some time for all OCCs to go
            // active)
            waitForAllOccsTimer->restartOnce(60s);
        }
#endif

        if (activeCount == statusObjects.size())
        {
#ifdef POWER10
            // All OCCs are now running, the wait timer is no longer needed
            if (waitForAllOccsTimer->isEnabled())
            {
                waitForAllOccsTimer->setEnabled(false);
            }
#endif

            // Verify master OCC and start presence monitor
            validateOccMaster();
        }

        // Start poll timer if not already started
        if (!_pollTimer->isEnabled())
        {
            log<level::INFO>(
                std::format("Manager: OCCs will be polled every {} seconds",
                            pollInterval)
                    .c_str());

            // Send poll and start OCC poll timer
            pollerTimerExpired();
        }
    }

#ifdef POWER10
    if (waitingForAllOccActiveSensors && utils::isHostRunning())
    {
        checkAllActiveSensors();
    }
#endif
}
483
484 #ifdef I2C_OCC
initStatusObjects()485 void Manager::initStatusObjects()
486 {
487 // Make sure we have a valid path string
488 static_assert(sizeof(DEV_PATH) != 0);
489
490 auto deviceNames = i2c_occ::getOccHwmonDevices(DEV_PATH);
491 for (auto& name : deviceNames)
492 {
493 i2c_occ::i2cToDbus(name);
494 name = std::string(OCC_NAME) + '_' + name;
495 auto path = fs::path(OCC_CONTROL_ROOT) / name;
496 statusObjects.emplace_back(
497 std::make_unique<Status>(event, path.c_str(), *this));
498 }
499 // The first device is master occ
500 pcap = std::make_unique<open_power::occ::powercap::PowerCap>(
501 *statusObjects.front());
502 #ifdef POWER10
503 pmode = std::make_unique<powermode::PowerMode>(*this, powermode::PMODE_PATH,
504 powermode::PIPS_PATH);
505 // Set the master OCC on the PowerMode object
506 pmode->setMasterOcc(path);
507 #endif
508 }
509 #endif
510
511 #ifdef PLDM
// React to an SBE timeout on the given OCC instance.
// Nothing happens unless a status object with that instance ID exists and
// its OCC is active; in that case the SBE state is set to
// SBE_STATE_NOT_USABLE and an HRESET is sent through PLDM.
void Manager::sbeTimeout(unsigned int instance)
{
    const auto hasInstance = [instance](const auto& candidate) {
        return instance == candidate->getOccInstanceID();
    };
    const auto found = std::find_if(statusObjects.begin(),
                                    statusObjects.end(), hasInstance);

    if (found == statusObjects.end() || !(*found)->occActive())
    {
        return;
    }

    log<level::INFO>(
        std::format("SBE timeout, requesting HRESET (OCC{})", instance)
            .c_str());

    setSBEState(instance, SBE_STATE_NOT_USABLE);

    pldmHandle->sendHRESET(instance);
}
530
// Apply an OCC Active state change to the matching status object.
//
// instance: OCC instance the state applies to.
// status:   true when the OCC is reported active.
//
// Returns the result of Status::occActive(status) when the state was
// applied, and false when it was not:
// - no status object exists for the instance (an active state is kept in
//   queuedActiveState, an inactive one removes any queued entry), or
// - the OCC is reported active while the host is not running (the PLDM
//   sensor-received flag is cleared and discovery is re-armed).
bool Manager::updateOCCActive(instanceID instance, bool status)
{
    const auto target = std::find_if(
        statusObjects.begin(), statusObjects.end(),
        [instance](const auto& candidate) {
            return instance == candidate->getOccInstanceID();
        });

    const bool hostRunning = open_power::occ::utils::isHostRunning();

    if (target == statusObjects.end())
    {
        if (hostRunning)
        {
            log<level::WARNING>(
                std::format(
                    "updateOCCActive: No status object to update for OCC{} (active={})",
                    instance, status)
                    .c_str());
        }
        else if (status)
        {
            log<level::WARNING>(
                std::format(
                    "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})",
                    instance, status)
                    .c_str());
        }

        if (status)
        {
            // OCC went active: remember it for later
            queuedActiveState.insert(instance);
        }
        else
        {
            // OCC was disabled: forget any queued active state
            const auto queued = queuedActiveState.find(instance);
            if (queued != queuedActiveState.end())
            {
                queuedActiveState.erase(queued);
            }
        }
        return false;
    }

    auto& statusObj = *target;

    if (status && !hostRunning)
    {
        log<level::WARNING>(
            std::format(
                "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received",
                instance, status)
                .c_str());
        statusObj->setPldmSensorReceived(false);
        if (!waitingForAllOccActiveSensors)
        {
            log<level::INFO>(
                "updateOCCActive: Waiting for Host and all OCC Active Sensors");
            waitingForAllOccActiveSensors = true;
        }
#ifdef POWER10
        discoverTimer->restartOnce(30s);
#endif
        return false;
    }

    log<level::INFO>(
        std::format("updateOCCActive: OCC{} active={}", instance, status)
            .c_str());
    statusObj->setPldmSensorReceived(true);
    return statusObj->occActive(status);
}
607
608 // Called upon pldm event To set powermode Safe Mode State for system.
updateOccSafeMode(bool safeMode)609 void Manager::updateOccSafeMode(bool safeMode)
610 {
611 #ifdef POWER10
612 pmode->updateDbusSafeMode(safeMode);
613 #endif
614 // Update the processor throttle status on dbus
615 for (auto& obj : statusObjects)
616 {
617 obj->updateThrottle(safeMode, THROTTLED_SAFE);
618 }
619 }
620
// Handle the outcome of an HRESET request for an OCC instance.
//
// instance: OCC instance the HRESET was issued for.
// success:  true when the HRESET succeeded.
//
// On success the SBE state is set to SBE_STATE_BOOTED. On failure it is set
// to SBE_STATE_FAILED and, when sbeCanDump() allows it, a PEL is created
// and an SBE dump is requested through the Dump.Create D-Bus interface.
// D-Bus failures of the dump request are logged and not propagated.
void Manager::sbeHRESETResult(instanceID instance, bool success)
{
    if (success)
    {
        log<level::INFO>(
            std::format("HRESET succeeded (OCC{})", instance).c_str());

        setSBEState(instance, SBE_STATE_BOOTED);

        return;
    }

    setSBEState(instance, SBE_STATE_FAILED);

    if (!sbeCanDump(instance))
    {
        return;
    }

    log<level::INFO>(
        std::format("HRESET failed (OCC{}), triggering SBE dump", instance)
            .c_str());

    auto& bus = utils::getBus();
    uint32_t src6 = instance << 16;
    uint32_t logId =
        FFDC::createPEL("org.open_power.Processor.Error.SbeChipOpTimeout",
                        src6, "SBE command timeout");

    try
    {
        constexpr auto interface = "xyz.openbmc_project.Dump.Create";
        constexpr auto function = "CreateDump";

        std::string service = utils::getService(OP_DUMP_OBJ_PATH, interface);
        auto method = bus.new_method_call(service.c_str(), OP_DUMP_OBJ_PATH,
                                          interface, function);

        std::map<std::string, std::variant<std::string, uint64_t>>
            createParams{
                {"com.ibm.Dump.Create.CreateParameters.ErrorLogId",
                 uint64_t(logId)},
                {"com.ibm.Dump.Create.CreateParameters.DumpType",
                 "com.ibm.Dump.Create.DumpType.SBE"},
                {"com.ibm.Dump.Create.CreateParameters.FailingUnitId",
                 uint64_t(instance)},
            };

        method.append(createParams);

        auto response = bus.call(method);
    }
    catch (const sdbusplus::exception_t& e)
    {
        constexpr auto ERROR_DUMP_DISABLED =
            "xyz.openbmc_project.Dump.Create.Error.Disabled";

        // name() yields a C string, so compare the characters; comparing it
        // to the literal with == would only compare the two addresses.
        const char* errName = e.name();
        if ((errName != nullptr) &&
            (std::string(errName) == ERROR_DUMP_DISABLED))
        {
            log<level::INFO>("Dump is disabled, skipping");
        }
        else
        {
            log<level::ERR>("Dump failed");
        }
    }
}
686
sbeCanDump(unsigned int instance)687 bool Manager::sbeCanDump(unsigned int instance)
688 {
689 struct pdbg_target* proc = getPdbgTarget(instance);
690
691 if (!proc)
692 {
693 // allow the dump in the error case
694 return true;
695 }
696
697 try
698 {
699 if (!openpower::phal::sbe::isDumpAllowed(proc))
700 {
701 return false;
702 }
703
704 if (openpower::phal::pdbg::isSbeVitalAttnActive(proc))
705 {
706 return false;
707 }
708 }
709 catch (openpower::phal::exception::SbeError& e)
710 {
711 log<level::INFO>("Failed to query SBE state");
712 }
713
714 // allow the dump in the error case
715 return true;
716 }
717
setSBEState(unsigned int instance,enum sbe_state state)718 void Manager::setSBEState(unsigned int instance, enum sbe_state state)
719 {
720 struct pdbg_target* proc = getPdbgTarget(instance);
721
722 if (!proc)
723 {
724 return;
725 }
726
727 try
728 {
729 openpower::phal::sbe::setState(proc, state);
730 }
731 catch (const openpower::phal::exception::SbeError& e)
732 {
733 log<level::ERR>("Failed to set SBE state");
734 }
735 }
736
getPdbgTarget(unsigned int instance)737 struct pdbg_target* Manager::getPdbgTarget(unsigned int instance)
738 {
739 if (!pdbgInitialized)
740 {
741 try
742 {
743 openpower::phal::pdbg::init();
744 pdbgInitialized = true;
745 }
746 catch (const openpower::phal::exception::PdbgError& e)
747 {
748 log<level::ERR>("pdbg initialization failed");
749 return nullptr;
750 }
751 }
752
753 struct pdbg_target* proc = nullptr;
754 pdbg_for_each_class_target("proc", proc)
755 {
756 if (pdbg_target_index(proc) == instance)
757 {
758 return proc;
759 }
760 }
761
762 log<level::ERR>("Failed to get pdbg target");
763 return nullptr;
764 }
765 #endif
766
pollerTimerExpired()767 void Manager::pollerTimerExpired()
768 {
769 if (!_pollTimer)
770 {
771 log<level::ERR>(
772 "Manager::pollerTimerExpired() ERROR: Timer not defined");
773 return;
774 }
775
776 for (auto& obj : statusObjects)
777 {
778 if (!obj->occActive())
779 {
780 // OCC is not running yet
781 #ifdef READ_OCC_SENSORS
782 auto id = obj->getOccInstanceID();
783 setSensorValueToNaN(id);
784 #endif
785 continue;
786 }
787
788 // Read sysfs to force kernel to poll OCC
789 obj->readOccState();
790
791 #ifdef READ_OCC_SENSORS
792 // Read occ sensor values
793 getSensorValues(obj);
794 #endif
795 }
796
797 if (activeCount > 0)
798 {
799 // Restart OCC poll timer
800 _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
801 }
802 else
803 {
804 // No OCCs running, so poll timer will not be restarted
805 log<level::INFO>(
806 std::format(
807 "Manager::pollerTimerExpired: poll timer will not be restarted")
808 .c_str());
809 }
810 }
811
812 #ifdef READ_OCC_SENSORS
// Read the temperature sensors found in an OCC hwmon directory and publish
// them on D-Bus.
//
// path:        directory holding the tempN_* sensor files.
// occInstance: OCC instance the sensors belong to.
//
// Several sensors may map to the same D-Bus object path, so the readings
// are first collected per object path (resolving conflicts between them)
// and only then published. A sensor whose files cannot be read is skipped.
void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
{
    // First pass result: D-Bus object path -> raw reading (or NaN)
    std::map<std::string, double> readings;

    const std::string tempRoot = OCC_SENSORS_ROOT +
                                 std::string("/temperature/");
    const std::string occNum = std::to_string(occInstance);
    const std::string labelSuffix{"label"};

    const std::regex labelPattern{"temp\\d+_label$"}; // Example: temp5_label
    for (const auto& entry : fs::directory_iterator(path))
    {
        const std::string labelFile = entry.path().string();
        if (!std::regex_search(labelFile, labelPattern))
        {
            continue;
        }

        uint32_t labelValue{0};
        try
        {
            labelValue = readFile<uint32_t>(labelFile);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readTempSensors: Failed reading {}, errno = {}",
                            labelFile, e.code().value())
                    .c_str());
            continue;
        }

        // Common prefix of this sensor's files, e.g. ".../temp5_"
        const std::string sensorPrefix = labelFile.substr(
            0, labelFile.length() - labelSuffix.length());

        uint32_t fruTypeValue{0};
        try
        {
            fruTypeValue = readFile<uint32_t>(sensorPrefix + fruTypeSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readTempSensors: Failed reading {}, errno = {}",
                            sensorPrefix + fruTypeSuffix, e.code().value())
                    .c_str());
            continue;
        }

        std::string sensorPath{tempRoot};
        std::string dvfsTempPath;

        if (fruTypeValue == VRMVdd)
        {
            sensorPath += "vrm_vdd" + occNum + "_temp";
        }
        else if (fruTypeValue == processorIoRing)
        {
            sensorPath += "proc" + occNum + "_ioring_temp";
            dvfsTempPath = tempRoot + "proc" + occNum + "_ioring_dvfs_temp";
        }
        else
        {
            // Top byte of the label is the sensor type, low 16 bits the
            // sensor instance
            const uint16_t type = (labelValue & 0xFF000000) >> 24;
            const uint16_t instanceID = labelValue & 0x0000FFFF;

            if (type == OCC_DIMM_TEMP_SENSOR_TYPE)
            {
                if (fruTypeValue == fruTypeNotAvailable)
                {
                    // Not all DIMM related temps are available to read
                    // (no _input file in this case)
                    continue;
                }
                const auto dimmName = dimmTempSensorName.find(fruTypeValue);
                if (dimmName == dimmTempSensorName.end())
                {
                    log<level::ERR>(
                        std::format(
                            "readTempSensors: Fru type error! fruTypeValue = {}) ",
                            fruTypeValue)
                            .c_str());
                    continue;
                }

                sensorPath += "dimm" + std::to_string(instanceID) +
                              dimmName->second;

                dvfsTempPath = tempRoot + dimmDVFSSensorName.at(fruTypeValue);
            }
            else if ((type == OCC_CPU_TEMP_SENSOR_TYPE) &&
                     (fruTypeValue == processorCore))
            {
                // The OCC reports small core temps, of which there are
                // two per big core. All current P10 systems are in big
                // core mode, so use a big core name.
                const uint16_t coreNum = instanceID / 2;
                const uint16_t tempNum = instanceID % 2;
                sensorPath += "proc" + occNum + "_core" +
                              std::to_string(coreNum) + "_" +
                              std::to_string(tempNum) + "_temp";

                dvfsTempPath = tempRoot + "proc" + occNum + "_core_dvfs_temp";
            }
            else
            {
                // Any other sensor type / FRU type is not published
                continue;
            }
        }

        // The dvfs temp file only needs to be read once per chip per type.
        if (!dvfsTempPath.empty() &&
            !dbus::OccDBusSensors::getOccDBus().hasDvfsTemp(dvfsTempPath))
        {
            try
            {
                const auto dvfsValue =
                    readFile<double>(sensorPrefix + maxSuffix);

                dbus::OccDBusSensors::getOccDBus().setDvfsTemp(
                    dvfsTempPath, dvfsValue * std::pow(10, -3));
            }
            catch (const std::system_error& e)
            {
                log<level::DEBUG>(
                    std::format(
                        "readTempSensors: Failed reading {}, errno = {}",
                        sensorPrefix + maxSuffix, e.code().value())
                        .c_str());
            }
        }

        uint32_t faultValue{0};
        try
        {
            faultValue = readFile<uint32_t>(sensorPrefix + faultSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readTempSensors: Failed reading {}, errno = {}",
                            sensorPrefix + faultSuffix, e.code().value())
                    .c_str());
            continue;
        }

        double tempValue{0};
        // NOTE: if OCC sends back 0xFF, kernel sets this fault value to 1.
        if (faultValue != 0)
        {
            tempValue = std::numeric_limits<double>::quiet_NaN();
        }
        else
        {
            // Read the temperature
            try
            {
                tempValue = readFile<double>(sensorPrefix + inputSuffix);
            }
            catch (const std::system_error& e)
            {
                log<level::DEBUG>(
                    std::format(
                        "readTempSensors: Failed reading {}, errno = {}",
                        sensorPrefix + inputSuffix, e.code().value())
                        .c_str());

                // Anything other than EAGAIN (e.g. EBADF or ENOENT) means
                // the sensor is skipped.
                if (e.code().value() != EAGAIN)
                {
                    continue;
                }

                // EAGAIN (Resource temporarily unavailable): use 0 rather
                // than an old temp, which would affect fan control.
                tempValue = 0;
            }
        }

        // Record the reading. If this object path already has a value,
        // only overwrite it if the previous one was an NaN or a smaller
        // value.
        auto [slot, isFirst] = readings.try_emplace(sensorPath, tempValue);
        if (!isFirst)
        {
            // Multiple sensors found for this FRU type
            double& recorded = slot->second;
            if ((std::isnan(recorded) && (tempValue == 0)) ||
                ((recorded == 0) && std::isnan(tempValue)))
            {
                // One of the redundant sensors has failed (0xFF/nan), and
                // the other sensor has no reading (0), so set the FRU to
                // NaN to force fan increase
                tempValue = std::numeric_limits<double>::quiet_NaN();
                recorded = tempValue;
            }
            if (std::isnan(recorded) || (tempValue > recorded))
            {
                recorded = tempValue;
            }
        }
    }

    // Second pass: publish the values on D-Bus.
    for (const auto& [objectPath, value] : readings)
    {
        dbus::OccDBusSensors::getOccDBus().setValue(objectPath,
                                                    value * std::pow(10, -3));

        dbus::OccDBusSensors::getOccDBus().setOperationalStatus(
            objectPath, !std::isnan(value));

        // The chassis association is only set for sensors not seen before
        if (existingSensors.find(objectPath) == existingSensors.end())
        {
            dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
                objectPath);
        }

        existingSensors[objectPath] = occInstance;
    }
}
1055
1056 std::optional<std::string>
getPowerLabelFunctionID(const std::string & value)1057 Manager::getPowerLabelFunctionID(const std::string& value)
1058 {
1059 // If the value is "system", then the FunctionID is "system".
1060 if (value == "system")
1061 {
1062 return value;
1063 }
1064
1065 // If the value is not "system", then the label value have 3 numbers, of
1066 // which we only care about the middle one:
1067 // <sensor id>_<function id>_<apss channel>
1068 // eg: The value is "0_10_5" , then the FunctionID is "10".
1069 if (value.find("_") == std::string::npos)
1070 {
1071 return std::nullopt;
1072 }
1073
1074 auto powerLabelValue = value.substr((value.find("_") + 1));
1075
1076 if (powerLabelValue.find("_") == std::string::npos)
1077 {
1078 return std::nullopt;
1079 }
1080
1081 return powerLabelValue.substr(0, powerLabelValue.find("_"));
1082 }
1083
// Read the power sensors found in an OCC hwmon directory and publish them
// on D-Bus in Watts.
//
// path: directory holding the powerN_* sensor files.
// id:   OCC instance recorded as the owner of each published sensor.
//
// Sensors whose label cannot be read, whose label yields no function ID,
// whose function ID is not in powerSensorName, or whose input file cannot
// be read are skipped.
void Manager::readPowerSensors(const fs::path& path, uint32_t id)
{
    const std::string labelSuffix{"label"};
    const std::regex labelPattern{"power\\d+_label$"}; // Example: power5_label

    for (const auto& entry : fs::directory_iterator(path))
    {
        const std::string labelFile = entry.path().string();
        if (!std::regex_search(labelFile, labelPattern))
        {
            continue;
        }

        std::string labelValue;
        try
        {
            labelValue = readFile<std::string>(labelFile);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            labelFile, e.code().value())
                    .c_str());
            continue;
        }

        const auto functionID = getPowerLabelFunctionID(labelValue);
        if (!functionID)
        {
            continue;
        }

        // Common prefix of this sensor's files, e.g. ".../power5_"
        const std::string sensorPrefix = labelFile.substr(
            0, labelFile.length() - labelSuffix.length());

        const auto knownSensor = powerSensorName.find(*functionID);
        if (knownSensor == powerSensorName.end())
        {
            continue;
        }

        std::string sensorPath = OCC_SENSORS_ROOT + std::string("/power/");
        sensorPath.append(knownSensor->second);

        double rawPower{0};
        try
        {
            rawPower = readFile<double>(sensorPrefix + inputSuffix);
        }
        catch (const std::system_error& e)
        {
            log<level::DEBUG>(
                std::format("readPowerSensors: Failed reading {}, errno = {}",
                            sensorPrefix + inputSuffix, e.code().value())
                    .c_str());
            continue;
        }

        dbus::OccDBusSensors::getOccDBus().setUnit(
            sensorPath, "xyz.openbmc_project.Sensor.Value.Unit.Watts");

        // The raw reading is scaled by 10^-6 before being published
        dbus::OccDBusSensors::getOccDBus().setValue(
            sensorPath, rawPower * std::pow(10, -3) * std::pow(10, -3));

        dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
                                                                true);

        // The chassis association is only set for sensors not seen before
        if (existingSensors.find(sensorPath) == existingSensors.end())
        {
            dbus::OccDBusSensors::getOccDBus().setChassisAssociation(
                sensorPath);
        }

        existingSensors[sensorPath] = id;
    }
}
1161
setSensorValueToNaN(uint32_t id) const1162 void Manager::setSensorValueToNaN(uint32_t id) const
1163 {
1164 for (const auto& [sensorPath, occId] : existingSensors)
1165 {
1166 if (occId == id)
1167 {
1168 dbus::OccDBusSensors::getOccDBus().setValue(
1169 sensorPath, std::numeric_limits<double>::quiet_NaN());
1170
1171 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
1172 true);
1173 }
1174 }
1175 return;
1176 }
1177
setSensorValueToNonFunctional(uint32_t id) const1178 void Manager::setSensorValueToNonFunctional(uint32_t id) const
1179 {
1180 for (const auto& [sensorPath, occId] : existingSensors)
1181 {
1182 if (occId == id)
1183 {
1184 dbus::OccDBusSensors::getOccDBus().setValue(
1185 sensorPath, std::numeric_limits<double>::quiet_NaN());
1186
1187 dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
1188 false);
1189 }
1190 }
1191 return;
1192 }
1193
// Read the sensors of one OCC from its hwmon directory.
//
// occ: status object of the OCC to read.
//
// Temperature sensors are always read; power sensors only for the master
// OCC. When the hwmon path is missing, an error is logged once per OCC
// until the path shows up again. That "already traced" state is kept for
// instance IDs below maxTrackedOccs only; larger IDs are not tracked, so
// their error is logged on every call rather than indexing past the array.
void Manager::getSensorValues(std::unique_ptr<Status>& occ)
{
    constexpr uint32_t maxTrackedOccs = 8;
    static bool tracedError[maxTrackedOccs] = {false};

    const fs::path sensorPath = occ->getHwmonPath();
    const uint32_t id = occ->getOccInstanceID();
    const bool tracked = (id < maxTrackedOccs);

    if (fs::exists(sensorPath))
    {
        // Read temperature sensors
        readTempSensors(sensorPath, id);

        if (occ->isMasterOcc())
        {
            // Read power sensors
            readPowerSensors(sensorPath, id);
        }

        if (tracked)
        {
            tracedError[id] = false;
        }
    }
    else if (!tracked || !tracedError[id])
    {
        log<level::ERR>(
            std::format(
                "Manager::getSensorValues: OCC{} sensor path missing: {}",
                id, sensorPath.c_str())
                .c_str());

        if (tracked)
        {
            tracedError[id] = true;
        }
    }

    return;
}
1227 #endif
1228
// Read the altitude property from D-Bus and store it in `altitude`.
// A reading below 0xFFFF is clamped to 0 when negative and otherwise
// rounded to the nearest whole number. Any other reading leaves `altitude`
// unchanged. If the D-Bus read fails, `altitude` is set to 0xFFFF
// (not available). Each failure kind is traced once until a valid reading
// is seen again.
void Manager::readAltitude()
{
    // true while the next failure should still be traced
    static bool traceAltitudeErr = true;

    try
    {
        const utils::PropertyValue altitudeProperty = utils::getProperty(
            ALTITUDE_PATH, ALTITUDE_INTERFACE, ALTITUDE_PROP);
        const auto reading = std::get<double>(altitudeProperty);

        if (reading < 0xFFFF)
        {
            // Negative readings become 0; others round to nearest meter
            altitude = (reading < 0) ? 0
                                     : static_cast<uint16_t>(reading + 0.5);

            log<level::DEBUG>(std::format("readAltitude: sensor={} ({}m)",
                                          reading, altitude)
                                  .c_str());
            traceAltitudeErr = true;
        }
        else if (traceAltitudeErr)
        {
            traceAltitudeErr = false;
            log<level::DEBUG>(
                std::format("Invalid altitude value: {}", reading).c_str());
        }
    }
    catch (const sdbusplus::exception_t& e)
    {
        if (traceAltitudeErr)
        {
            traceAltitudeErr = false;
            log<level::INFO>(
                std::format("Unable to read Altitude: {}", e.what()).c_str());
        }
        altitude = 0xFFFF; // not available
    }
}
1278
// Callback function when ambient temperature changes
// - Reads the changed properties from the message and ignores the signal if
//   the ambient property is not among them.
// - Converts the reading to a whole number of degrees C in a uint8_t:
//   NaN -> 0xFF (not available), negative -> 0, anything that would round to
//   255 or more -> 0xFE.
// - When the converted value differs from the stored ambient, it is stored and
//   (on POWER10) sent together with the altitude to every active OCC.
void Manager::ambientCallback(sdbusplus::message_t& msg)
{
    constexpr uint8_t ambientNotAvailable = 0xFF;
    // Largest reportable temperature; 0xFF is reserved for "not available"
    constexpr uint8_t ambientMax = 0xFE;

    double currentTemp = 0;
    uint8_t truncatedTemp = ambientNotAvailable;
    std::string msgSensor;
    std::map<std::string, std::variant<double>> msgData;
    msg.read(msgSensor, msgData);

    auto valPropMap = msgData.find(AMBIENT_PROP);
    if (valPropMap == msgData.end())
    {
        log<level::DEBUG>("ambientCallback: Unknown ambient property changed");
        return;
    }
    currentTemp = std::get<double>(valPropMap->second);
    if (std::isnan(currentTemp))
    {
        truncatedTemp = ambientNotAvailable;
    }
    else if (currentTemp < 0)
    {
        truncatedTemp = 0;
    }
    else if (currentTemp + 0.5 >= ambientNotAvailable)
    {
        // Converting an out-of-range double to uint8_t is undefined behavior,
        // and a result of exactly 0xFF would be mistaken for "not available"
        truncatedTemp = ambientMax;
    }
    else
    {
        // Round to nearest degree C
        truncatedTemp = static_cast<uint8_t>(currentTemp + 0.5);
    }

    // If ambient changes, notify OCCs
    if (truncatedTemp != ambient)
    {
        log<level::DEBUG>(
            std::format("ambientCallback: Ambient change from {} to {}C",
                        ambient, currentTemp)
                .c_str());

        ambient = truncatedTemp;
        if (altitude == 0xFFFF)
        {
            // No altitude yet, try reading again
            readAltitude();
        }

        log<level::DEBUG>(
            std::format("ambientCallback: Ambient: {}C, altitude: {}m", ambient,
                        altitude)
                .c_str());
#ifdef POWER10
        // Send ambient and altitude to all OCCs
        for (auto& obj : statusObjects)
        {
            if (obj->occActive())
            {
                obj->sendAmbient(ambient, altitude);
            }
        }
#endif // POWER10
    }
}
1343
1344 // return the current ambient and altitude readings
getAmbientData(bool & ambientValid,uint8_t & ambientTemp,uint16_t & altitudeValue) const1345 void Manager::getAmbientData(bool& ambientValid, uint8_t& ambientTemp,
1346 uint16_t& altitudeValue) const
1347 {
1348 ambientValid = true;
1349 ambientTemp = ambient;
1350 altitudeValue = altitude;
1351
1352 if (ambient == 0xFF)
1353 {
1354 ambientValid = false;
1355 }
1356 }
1357
1358 #ifdef POWER10
// Handler for expiration of waitForAllOccsTimer
// (the timer is started, for 60 seconds, after the first OCC goes active)
// Traces a warning when the active OCC count differs from the number of
// status objects, then always validates the master OCC.
void Manager::occsNotAllRunning()
{
    const auto expectedCount = statusObjects.size();

    if (activeCount != expectedCount)
    {
        // Some OCCs never went active. Procs may be garded, so this may be
        // expected.
        const auto warning = std::format(
            "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})",
            activeCount, expectedCount);
        log<level::WARNING>(warning.c_str());
    }

    validateOccMaster();
}
1376
1377 #ifdef PLDM
1378 // Called when throttlePldmTraceTimer expires.
1379 // If this timer expires, that indicates there are no OCC active sensor PDRs
1380 // found which will trigger pldm traces to be throttled.
1381 // The second time this timer expires, a PEL will get created.
throttlePldmTraceExpired()1382 void Manager::throttlePldmTraceExpired()
1383 {
1384 if (utils::isHostRunning())
1385 {
1386 if (!onPldmTimeoutCreatePel)
1387 {
1388 // Throttle traces
1389 pldmHandle->setTraceThrottle(true);
1390 // Restart timer to log a PEL when timer expires
1391 onPldmTimeoutCreatePel = true;
1392 throttlePldmTraceTimer->restartOnce(40min);
1393 }
1394 else
1395 {
1396 log<level::ERR>(
1397 "throttlePldmTraceExpired(): OCC active sensors still not available!");
1398 // Create PEL
1399 createPldmSensorPEL();
1400 }
1401 }
1402 else
1403 {
1404 // Make sure traces are not throttled
1405 pldmHandle->setTraceThrottle(false);
1406 log<level::INFO>(
1407 "throttlePldmTraceExpired(): host it not running ignoring sensor timer");
1408 }
1409 }
1410
// Create a PEL reporting that the PLDM sensors for the OCCs were not found.
// The PEL is created with Warning severity through the CreatePELWithFFDCFiles
// method of the PEL logging interface, with the occ-control journal entries
// attached as FFDC. A D-Bus failure is traced and otherwise ignored.
void Manager::createPldmSensorPEL()
{
    namespace logServer = sdbusplus::xyz::openbmc_project::Logging::server;

    Error::Descriptor d(MISSING_OCC_SENSORS_PATH);
    std::map<std::string, std::string> additionalData{
        {"_PID", std::to_string(getpid())}};

    log<level::INFO>(
        "createPldmSensorPEL(): Unable to find PLDM sensors for the OCCs");

    auto& bus = utils::getBus();

    try
    {
        static constexpr auto loggingObjectPath =
            "/xyz/openbmc_project/logging";
        static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL";

        FFDCFiles ffdc;
        // Add occ-control journal traces to PEL FFDC
        // (the returned object is kept in scope until the call has been made)
        auto occJournalFile =
            FFDC::addJournalEntries(ffdc, "openpower-occ-control", 40);

        std::string service = utils::getService(loggingObjectPath,
                                                opLoggingInterface);
        auto method = bus.new_method_call(service.c_str(), loggingObjectPath,
                                          opLoggingInterface,
                                          "CreatePELWithFFDCFiles");

        // Set level to Warning (Predictive).
        auto severity =
            logServer::convertForMessage(logServer::Entry::Level::Warning);

        method.append(d.path, severity, additionalData, ffdc);
        bus.call(method);
    }
    catch (const sdbusplus::exception_t& e)
    {
        const auto failure = std::format(
            "Failed to create MISSING_OCC_SENSORS PEL: {}", e.what());
        log<level::ERR>(failure.c_str());
    }
}
1458 #endif // PLDM
1459 #endif // POWER10
1460
// Verify single master OCC and start presence monitor
// - (POWER10) For each OCC that is not active yet: if the host is running,
//   apply a queued active state or re-check the active sensor; if the host is
//   not running, trace a warning and stop validating.
// - Every OCC that reports itself as master gets its presence watch added.
//   A second master is traced as an error and a reset is requested on it.
// - If no master was found, an error is traced and a reset is requested on the
//   first OCC (when there is one).
void Manager::validateOccMaster()
{
    int masterInstance = -1;
    for (auto& obj : statusObjects)
    {
        auto instance = obj->getOccInstanceID();
#ifdef POWER10
        if (!obj->occActive())
        {
            if (utils::isHostRunning())
            {
                // Check if sensor was queued while waiting for discovery
                auto match = queuedActiveState.find(instance);
                if (match != queuedActiveState.end())
                {
                    queuedActiveState.erase(match);
                    log<level::INFO>(
                        std::format(
                            "validateOccMaster: OCC{} is ACTIVE (queued)",
                            instance)
                            .c_str());
                    obj->occActive(true);
                }
                else
                {
                    // OCC does not appear to be active yet, check active sensor
#ifdef PLDM
                    pldmHandle->checkActiveSensor(instance);
#endif
                    if (obj->occActive())
                    {
                        log<level::INFO>(
                            std::format(
                                "validateOccMaster: OCC{} is ACTIVE after reading sensor",
                                instance)
                                .c_str());
                    }
                }
            }
            else
            {
                log<level::WARNING>(
                    std::format(
                        "validateOccMaster: HOST is not running (OCC{})",
                        instance)
                        .c_str());
                return;
            }
        }
#endif // POWER10

        if (obj->isMasterOcc())
        {
            obj->addPresenceWatchMaster();

            if (masterInstance == -1)
            {
                masterInstance = instance;
            }
            else
            {
                log<level::ERR>(
                    std::format(
                        "validateOccMaster: Multiple OCC masters! ({} and {})",
                        masterInstance, instance)
                        .c_str());
                // request reset
                obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
            }
        }
    }

    if (masterInstance < 0)
    {
        log<level::ERR>(
            std::format("validateOccMaster: Master OCC not found! (of {} OCCs)",
                        statusObjects.size())
                .c_str());
        // request reset
        // (front() on an empty container is undefined behavior, so only
        // request the reset when there is an OCC to request it on)
        if (!statusObjects.empty())
        {
            statusObjects.front()->deviceError(
                Error::Descriptor(PRESENCE_ERROR_PATH));
        }
    }
    else
    {
        log<level::INFO>(
            std::format("validateOccMaster: OCC{} is master of {} OCCs",
                        masterInstance, activeCount)
                .c_str());
#ifdef POWER10
        pmode->updateDbusSafeMode(false);
#endif
    }
}
1555
updatePcapBounds() const1556 void Manager::updatePcapBounds() const
1557 {
1558 if (pcap)
1559 {
1560 pcap->updatePcapBounds();
1561 }
1562 }
1563
1564 } // namespace occ
1565 } // namespace open_power
1566