1 #include "occ_status.hpp"
2
3 #include "occ_manager.hpp"
4 #include "occ_sensor.hpp"
5 #include "powermode.hpp"
6 #include "utils.hpp"
7
8 #include <phosphor-logging/lg2.hpp>
9
10 #include <filesystem>
11
12 namespace open_power
13 {
14 namespace occ
15 {
16
17 using namespace phosphor::logging;
18
19 using ThrottleObj =
20 sdbusplus::xyz::openbmc_project::Control::Power::server::Throttle;
21
22 // Handles updates to occActive property
occActive(bool value)23 bool Status::occActive(bool value)
24 {
25 if (value != this->occActive())
26 {
27 lg2::info("Status::occActive OCC{INST} changed to {STATE}", "INST",
28 instance, "STATE", value);
29 if (value)
30 {
31 // Clear prior throttle reason (before setting device active)
32 updateThrottle(false, THROTTLED_ALL);
33
34 // Set the device active
35 device.setActive(true);
36
37 // Reset last OCC state
38 lastState = 0;
39
40 // Start watching for errors (throttles, etc)
41 try
42 {
43 addErrorWatch();
44 }
45 catch (const OpenFailure& e)
46 {
47 // Failed to add watch for throttle events, request reset to try
48 // to recover comm
49 lg2::error(
50 "Status::occActive: Unable to add error watch(s) for OCC{INST} watch: {ERROR}",
51 "INST", instance, "ERROR", e.what());
52 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
53 return Base::Status::occActive(false);
54 }
55
56 // Update the OCC active sensor
57 Base::Status::occActive(value);
58
59 if (device.master())
60 {
61 // Update powercap bounds from OCC
62 manager.updatePcapBounds();
63 }
64
65 // Call into Manager to let know that we have bound
66 if (this->managerCallBack)
67 {
68 this->managerCallBack(instance, value);
69 }
70 }
71 else
72 {
73 #ifdef POWER10
74 if (pmode && device.master())
75 {
76 // Prevent mode changes
77 pmode->setMasterActive(false);
78 }
79 if (safeStateDelayTimer.isEnabled())
80 {
81 // stop safe delay timer
82 safeStateDelayTimer.setEnabled(false);
83 }
84 #endif
85 // Call into Manager to let know that we will unbind.
86 if (this->managerCallBack)
87 {
88 this->managerCallBack(instance, value);
89 }
90
91 // Stop watching for errors
92 removeErrorWatch();
93
94 // Set the device inactive
95 device.setActive(false);
96
97 // Clear throttles (OCC not active after disabling device)
98 updateThrottle(false, THROTTLED_ALL);
99 }
100 }
101 else if (value && !device.active())
102 {
103 // Existing error watch is on a dead file descriptor.
104 removeErrorWatch();
105
106 /*
107 * In it's constructor, Status checks Device::bound() to see if OCC is
108 * active or not.
109 * Device::bound() checks for occX-dev0 directory.
110 * We will lose occX-dev0 directories during FSI rescan.
111 * So, if we start this application (and construct Status), and then
112 * later do FSI rescan, we will end up with occActive = true and device
113 * NOT bound. Lets correct that situation here.
114 */
115 device.setActive(true);
116
117 // Add error watch again
118 try
119 {
120 addErrorWatch();
121 }
122 catch (const OpenFailure& e)
123 {
124 // Failed to add watch for throttle events, request reset to try to
125 // recover comm
126 lg2::error(
127 "Status::occActive: Unable to add error watch(s) again for OCC{INST} watch: {ERROR}",
128 "INST", instance, "ERROR", e.what());
129 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
130 return Base::Status::occActive(false);
131 }
132 }
133 else if (!value && device.active())
134 {
135 removeErrorWatch();
136
137 // In the event that the application never receives the active signal
138 // even though the OCC is active (this can occur if the BMC is rebooted
139 // with the host on, since the initial OCC driver probe will discover
140 // the OCCs), this application needs to be able to unbind the device
141 // when we get the OCC inactive signal.
142 device.setActive(false);
143 }
144 return Base::Status::occActive(value);
145 }
146
147 // Callback handler when a device error is reported.
deviceError(Error::Descriptor d)148 void Status::deviceError(Error::Descriptor d)
149 {
150 #ifdef POWER10
151 if (pmode && device.master())
152 {
153 // Prevent mode changes
154 pmode->setMasterActive(false);
155 }
156 #endif
157
158 if (d.log)
159 {
160 FFDC::createOCCResetPEL(instance, d.path, d.err, d.callout);
161 }
162
163 // This would deem OCC inactive
164 this->occActive(false);
165
166 // Reset the OCC
167 this->resetOCC();
168 }
169
170 // Sends message to host control command handler to reset OCC
resetOCC()171 void Status::resetOCC()
172 {
173 lg2::info(">>Status::resetOCC() - requesting reset for OCC{INST}", "INST",
174 instance);
175 this->occActive(false);
176 #ifdef PLDM
177 if (resetCallBack)
178 {
179 this->resetCallBack(instance);
180 }
181 #else
182 constexpr auto CONTROL_HOST_PATH = "/org/open_power/control/host0";
183 constexpr auto CONTROL_HOST_INTF = "org.open_power.Control.Host";
184
185 // This will throw exception on failure
186 auto service = utils::getService(CONTROL_HOST_PATH, CONTROL_HOST_INTF);
187
188 auto& bus = utils::getBus();
189 auto method = bus.new_method_call(service.c_str(), CONTROL_HOST_PATH,
190 CONTROL_HOST_INTF, "Execute");
191 // OCC Reset control command
192 method.append(convertForMessage(Control::Host::Command::OCCReset).c_str());
193
194 // OCC Sensor ID for callout reasons
195 method.append(std::variant<uint8_t>(std::get<0>(sensorMap.at(instance))));
196 bus.call_noreply(method);
197 return;
198 #endif
199 }
200
201 // Handler called by Host control command handler to convey the
202 // status of the executed command
hostControlEvent(sdbusplus::message_t & msg)203 void Status::hostControlEvent(sdbusplus::message_t& msg)
204 {
205 std::string cmdCompleted{};
206 std::string cmdStatus{};
207
208 msg.read(cmdCompleted, cmdStatus);
209
210 lg2::debug("Host control signal values: command={CMD}, status={STATUS}",
211 "CMD", cmdCompleted, "STATUS", cmdStatus);
212
213 if (Control::Host::convertResultFromString(cmdStatus) !=
214 Control::Host::Result::Success)
215 {
216 if (Control::Host::convertCommandFromString(cmdCompleted) ==
217 Control::Host::Command::OCCReset)
218 {
219 // Must be a Timeout. Log an Error trace
220 lg2::error(
221 "Error resetting the OCC: path={PATH}, sensorid={SENSOR}",
222 "PATH", path, "SENSOR", std::get<0>(sensorMap.at(instance)));
223 }
224 }
225 return;
226 }
227
228 // Called from Manager::pollerTimerExpired() in preperation to POLL OCC.
readOccState()229 void Status::readOccState()
230 {
231 if (stateValid)
232 {
233 // Reset retry count (since state is good)
234 currentOccReadRetriesCount = occReadRetries;
235 }
236 occReadStateNow();
237 }
238
239 #ifdef POWER10
240 // Special processing that needs to happen once the OCCs change to ACTIVE state
occsWentActive()241 void Status::occsWentActive()
242 {
243 CmdStatus status = CmdStatus::SUCCESS;
244
245 // IPS data will get sent automatically after a mode change if the mode
246 // supports it.
247 pmode->needToSendIPS();
248
249 status = pmode->sendModeChange();
250 if (status != CmdStatus::SUCCESS)
251 {
252 lg2::error(
253 "Status::occsWentActive: OCC mode change failed with status {STATUS}",
254 "STATUS", status);
255
256 // Disable and reset to try recovering
257 deviceError();
258 }
259 }
260
261 // Send Ambient and Altitude to the OCC
sendAmbient(const uint8_t inTemp,const uint16_t inAltitude)262 CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude)
263 {
264 CmdStatus status = CmdStatus::FAILURE;
265 bool ambientValid = true;
266 uint8_t ambientTemp = inTemp;
267 uint16_t altitude = inAltitude;
268
269 if (ambientTemp == 0xFF)
270 {
271 // Get latest readings from manager
272 manager.getAmbientData(ambientValid, ambientTemp, altitude);
273 lg2::debug(
274 "sendAmbient: valid: {VALID}, Ambient: {TEMP}C, altitude: {ALT}m",
275 "VALID", ambientValid, "TEMP", ambientTemp, "ALT", altitude);
276 }
277
278 std::vector<std::uint8_t> cmd, rsp;
279 cmd.reserve(11);
280 cmd.push_back(uint8_t(CmdType::SEND_AMBIENT));
281 cmd.push_back(0x00); // Data Length (2 bytes)
282 cmd.push_back(0x08); //
283 cmd.push_back(0x00); // Version
284 cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status
285 cmd.push_back(ambientTemp); // Ambient Temperature
286 cmd.push_back(altitude >> 8); // Altitude in meters (2 bytes)
287 cmd.push_back(altitude & 0xFF); //
288 cmd.push_back(0x00); // Reserved (3 bytes)
289 cmd.push_back(0x00);
290 cmd.push_back(0x00);
291 lg2::debug("sendAmbient: SEND_AMBIENT "
292 "command to OCC{INST} ({SIZE} bytes)",
293 "INST", instance, "SIZE", cmd.size());
294 status = occCmd.send(cmd, rsp);
295 if (status == CmdStatus::SUCCESS)
296 {
297 if (rsp.size() == 5)
298 {
299 if (RspStatus::SUCCESS != RspStatus(rsp[2]))
300 {
301 lg2::error(
302 "sendAmbient: SEND_AMBIENT failed with rspStatus {STATUS}",
303 "STATUS", lg2::hex, rsp[2]);
304 dump_hex(rsp);
305 status = CmdStatus::FAILURE;
306 }
307 }
308 else
309 {
310 lg2::error(
311 "sendAmbient: INVALID SEND_AMBIENT response length:{SIZE}",
312 "SIZE", rsp.size());
313 dump_hex(rsp);
314 status = CmdStatus::FAILURE;
315 }
316 }
317 else
318 {
319 lg2::error("sendAmbient: SEND_AMBIENT FAILED! with status {STATUS}",
320 "STATUS", lg2::hex, uint8_t(status));
321
322 if (status == CmdStatus::COMM_FAILURE)
323 {
324 // Disable due to OCC comm failure and reset to try recovering
325 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
326 }
327 }
328
329 return status;
330 }
331
332 // Called when safe timer expires to determine if OCCs need to be reset
safeStateDelayExpired()333 void Status::safeStateDelayExpired()
334 {
335 if (this->occActive())
336 {
337 lg2::info(
338 "safeStateDelayExpired: OCC{INST} state missing or not valid, requesting reset",
339 "INST", instance);
340 // Disable and reset to try recovering
341 deviceError(Error::Descriptor(SAFE_ERROR_PATH));
342 }
343 }
344 #endif // POWER10
345
getHwmonPath()346 fs::path Status::getHwmonPath()
347 {
348 using namespace std::literals::string_literals;
349
350 if (!fs::exists(hwmonPath))
351 {
352 static bool tracedFail[8] = {0};
353
354 if (!hwmonPath.empty())
355 {
356 lg2::warning(
357 "Status::getHwmonPath(): path no longer exists: {PATH}", "PATH",
358 hwmonPath);
359 hwmonPath.clear();
360 }
361
362 // Build the base HWMON path
363 fs::path prefixPath =
364 fs::path{OCC_HWMON_PATH + "occ-hwmon."s +
365 std::to_string(instance + 1) + "/hwmon/"s};
366
367 // Get the hwmonXX directory name
368 try
369 {
370 // there should only be one directory
371 const int numDirs = std::distance(
372 fs::directory_iterator(prefixPath), fs::directory_iterator{});
373 if (numDirs == 1)
374 {
375 hwmonPath = *fs::directory_iterator(prefixPath);
376 tracedFail[instance] = false;
377 }
378 else
379 {
380 if (!tracedFail[instance])
381 {
382 lg2::error(
383 "Status::getHwmonPath(): Found multiple ({NUM}) hwmon paths!",
384 "NUM", numDirs);
385 tracedFail[instance] = true;
386 }
387 }
388 }
389 catch (const fs::filesystem_error& e)
390 {
391 if (!tracedFail[instance])
392 {
393 lg2::error(
394 "Status::getHwmonPath(): error accessing {PATH}: {ERROR}",
395 "PATH", prefixPath, "ERROR", e.what());
396 tracedFail[instance] = true;
397 }
398 }
399 }
400
401 return hwmonPath;
402 }
403
404 // Called to read state and handle any errors
occReadStateNow()405 void Status::occReadStateNow()
406 {
407 unsigned int state;
408 const fs::path filename =
409 fs::path(DEV_PATH) /
410 fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
411
412 std::ifstream file;
413 bool goodFile = false;
414
415 // open file.
416 file.open(filename, std::ios::in);
417 const int openErrno = errno;
418
419 // File is open and state can be used.
420 if (file.is_open() && file.good())
421 {
422 goodFile = true;
423 file >> state;
424 // Read the error code (if any) to check status of the read
425 std::ios_base::iostate readState = file.rdstate();
426 if (readState)
427 {
428 // There was a failure reading the file
429 if (lastOccReadStatus != -1)
430 {
431 // Trace error bits
432 std::string errorBits = "";
433 if (readState & std::ios_base::eofbit)
434 {
435 errorBits += " EOF";
436 }
437 if (readState & std::ios_base::failbit)
438 {
439 errorBits += " failbit";
440 }
441 if (readState & std::ios_base::badbit)
442 {
443 errorBits += " badbit";
444 }
445 lg2::error(
446 "readOccState: Failed to read OCC{INST} state: Read error on I/O operation - {ERROR}",
447 "INST", instance, "ERROR", errorBits);
448 lastOccReadStatus = -1;
449 }
450 goodFile = false;
451 }
452
453 if (goodFile && (state != lastState))
454 {
455 // Trace OCC state changes
456 lg2::info(
457 "Status::readOccState: OCC{INST} state {STATE} (lastState: {PRIOR})",
458 "INST", instance, "STATE", lg2::hex, state, "PRIOR", lg2::hex,
459 lastState);
460 lastState = state;
461 #ifdef POWER10
462 if (OccState(state) == OccState::ACTIVE)
463 {
464 if (pmode && device.master())
465 {
466 // Set the master OCC on the PowerMode object
467 pmode->setMasterOcc(path);
468 // Enable mode changes
469 pmode->setMasterActive();
470
471 // Special processing by master OCC when it goes active
472 occsWentActive();
473 }
474
475 CmdStatus status = sendAmbient();
476 if (status != CmdStatus::SUCCESS)
477 {
478 lg2::error(
479 "readOccState: Sending Ambient failed with status {STATUS}",
480 "STATUS", status);
481 }
482 }
483
484 // If OCC in known Good State.
485 if ((OccState(state) == OccState::ACTIVE) ||
486 (OccState(state) == OccState::CHARACTERIZATION) ||
487 (OccState(state) == OccState::OBSERVATION))
488 {
489 // Good OCC State then sensors valid again
490 stateValid = true;
491
492 if (safeStateDelayTimer.isEnabled())
493 {
494 // stop safe delay timer (no longer in SAFE state)
495 safeStateDelayTimer.setEnabled(false);
496 }
497 }
498 else
499 {
500 // OCC is in SAFE or some other unsupported state
501 if (!safeStateDelayTimer.isEnabled())
502 {
503 lg2::error(
504 "readOccState: Invalid OCC{INST} state of {STATE}, starting safe state delay timer",
505 "INST", instance, "STATE", state);
506 // start safe delay timer (before requesting reset)
507 using namespace std::literals::chrono_literals;
508 safeStateDelayTimer.restartOnce(60s);
509 }
510 // Not a supported state (update sensors to NaN and not
511 // functional)
512 stateValid = false;
513 }
514 #else
515 // Before P10 state not checked, only used good file open.
516 stateValid = true;
517 #endif
518 }
519 }
520 #ifdef POWER10
521 else
522 {
523 // Unable to read state
524 stateValid = false;
525 }
526 #endif
527 file.close();
528
529 // if failed to Read a state or not a valid state -> Attempt retry
530 // after 1 Second delay if allowed.
531 if ((!goodFile) || (!stateValid))
532 {
533 if (!goodFile)
534 {
535 // If not able to read, OCC may be offline
536 if (openErrno != lastOccReadStatus)
537 {
538 lg2::error(
539 "Status::readOccState: open/read failed trying to read OCC{INST} state (open errno={ERROR})",
540 "INST", instance, "ERROR", openErrno);
541 lastOccReadStatus = openErrno;
542 }
543 }
544 else
545 {
546 // else this failed due to state not valid.
547 if (state != lastState)
548 {
549 lg2::error(
550 "Status::readOccState: OCC{INST} Invalid state {STATE} (last state: {PRIOR})",
551 "INST", instance, "STATE", lg2::hex, state, "PRIOR",
552 lg2::hex, lastState);
553 }
554 }
555
556 #ifdef READ_OCC_SENSORS
557 manager.setSensorValueToNaN(instance);
558 #endif
559
560 // See occReadRetries for number of retry attempts.
561 if (currentOccReadRetriesCount > 0)
562 {
563 --currentOccReadRetriesCount;
564 }
565 else
566 {
567 lg2::error("readOccState: failed to read OCC{INST} state!", "INST",
568 instance);
569
570 // State could not be determined, set it to NO State.
571 lastState = 0;
572
573 // Disable the ability to send Failed actions until OCC is
574 // Active again.
575 stateValid = false;
576
577 // Disable due to OCC comm failure and reset to try recovering
578 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
579
580 // Reset retry count (for next attempt after recovery)
581 currentOccReadRetriesCount = occReadRetries;
582 }
583 }
584 else
585 {
586 if (lastOccReadStatus != 0)
587 {
588 lg2::info(
589 "Status::readOccState: successfully read OCC{INST} state: {STATE}",
590 "INST", instance, "STATE", state);
591 lastOccReadStatus = 0; // no error
592 }
593 }
594 }
595
596 // Update processor throttle status on dbus
updateThrottle(const bool isThrottled,const uint8_t newReason)597 void Status::updateThrottle(const bool isThrottled, const uint8_t newReason)
598 {
599 if (!throttleHandle)
600 {
601 return;
602 }
603
604 uint8_t newThrottleCause = throttleCause;
605
606 if (isThrottled) // throttled due to newReason
607 {
608 if ((newReason & throttleCause) == 0)
609 {
610 // set the bit(s) for passed in reason
611 newThrottleCause |= newReason;
612 }
613 // else no change
614 }
615 else // no longer throttled due to newReason
616 {
617 if ((newReason & throttleCause) != 0)
618 {
619 // clear the bit(s) for passed in reason
620 newThrottleCause &= ~newReason;
621 }
622 // else no change
623 }
624
625 if (newThrottleCause != throttleCause)
626 {
627 if (newThrottleCause == THROTTLED_NONE)
628 {
629 lg2::debug(
630 "updateThrottle: OCC{INST} no longer throttled (prior reason: {REASON})",
631 "INST", instance, "REASON", throttleCause);
632 throttleCause = THROTTLED_NONE;
633 throttleHandle->throttled(false);
634 throttleHandle->throttleCauses({});
635 }
636 else
637 {
638 lg2::debug(
639 "updateThrottle: OCC{INST} is throttled with reason {REASON} (prior reason: {PRIOR})",
640 "INST", instance, "REASON", newThrottleCause, "PRIOR",
641 throttleCause);
642 throttleCause = newThrottleCause;
643
644 std::vector<ThrottleObj::ThrottleReasons> updatedCauses;
645 if (throttleCause & THROTTLED_POWER)
646 {
647 updatedCauses.push_back(
648 throttleHandle->ThrottleReasons::PowerLimit);
649 }
650 if (throttleCause & THROTTLED_THERMAL)
651 {
652 updatedCauses.push_back(
653 throttleHandle->ThrottleReasons::ThermalLimit);
654 }
655 if (throttleCause & THROTTLED_SAFE)
656 {
657 updatedCauses.push_back(
658 throttleHandle->ThrottleReasons::ManagementDetectedFault);
659 }
660 throttleHandle->throttleCauses(updatedCauses);
661 throttleHandle->throttled(true);
662 }
663 }
664 // else no change to throttle status
665 }
666
667 // Get processor path associated with this OCC
readProcAssociation()668 void Status::readProcAssociation()
669 {
670 std::string managingPath = path + "/power_managing";
671 lg2::debug("readProcAssociation: getting endpoints for {MANAGE} ({PATH})",
672 "MANAGE", managingPath, "PATH", path);
673 try
674 {
675 utils::PropertyValue procPathProperty{};
676 procPathProperty = utils::getProperty(
677 managingPath, "xyz.openbmc_project.Association", "endpoints");
678 auto result = std::get<std::vector<std::string>>(procPathProperty);
679 if (result.size() > 0)
680 {
681 procPath = result[0];
682 lg2::info("readProcAssociation: OCC{INST} has proc={PATH}", "INST",
683 instance, "PATH", procPath);
684 }
685 else
686 {
687 lg2::error(
688 "readProcAssociation: No processor associated with OCC{INST} / {PATH}",
689 "INST", instance, "PATH", path);
690 }
691 }
692 catch (const sdbusplus::exception_t& e)
693 {
694 lg2::error(
695 "readProcAssociation: Unable to get proc assocated with {PATH} - {ERROR}",
696 "PATH", path, "ERROR", e.what());
697 procPath = {};
698 }
699 }
700
701 } // namespace occ
702 } // namespace open_power
703