1 #include "occ_status.hpp"
2
3 #include "occ_manager.hpp"
4 #include "occ_sensor.hpp"
5 #include "powermode.hpp"
6 #include "utils.hpp"
7
8 #include <phosphor-logging/lg2.hpp>
9 #include <xyz/openbmc_project/Control/Power/Throttle/server.hpp>
10
11 #include <filesystem>
12
13 namespace open_power
14 {
15 namespace occ
16 {
17
18 using namespace phosphor::logging;
19
20 using ThrottleObj =
21 sdbusplus::xyz::openbmc_project::Control::Power::server::Throttle;
22 using ThrottleReason = ThrottleObj::ThrottleReasons;
23
24 // Handles updates to occActive property
occActive(bool value)25 bool Status::occActive(bool value)
26 {
27 if (value != this->occActive())
28 {
29 lg2::info("Status::occActive OCC{INST} changed to {STATE}", "INST",
30 instance, "STATE", value);
31 if (value)
32 {
33 // OCC is active
34 // Clear prior throttle reason (before setting device active)
35 updateThrottle(false, THROTTLED_ALL);
36
37 // Set the device active
38 device.setActive(true);
39
40 // Reset last OCC state
41 lastState = 0;
42
43 // Start watching for errors (throttles, etc)
44 try
45 {
46 addErrorWatch();
47 }
48 catch (const OpenFailure& e)
49 {
50 // Failed to add watch for throttle events, request reset to try
51 // to recover comm
52 lg2::error(
53 "Status::occActive: Unable to add error watch(s) for OCC{INST} watch: {ERROR}",
54 "INST", instance, "ERROR", e.what());
55 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
56 return Base::Status::occActive(false);
57 }
58
59 // Update the OCC active sensor
60 Base::Status::occActive(value);
61
62 if (device.master())
63 {
64 // Update powercap bounds from OCC
65 manager.updatePcapBounds();
66 }
67
68 // Call into Manager to let know that we have bound
69 if (this->managerCallBack)
70 {
71 this->managerCallBack(instance, value);
72 }
73 }
74 else
75 {
76 // OCC is no longer active
77 if (sensorsValid)
78 {
79 sensorsValid = false;
80 // Sensors not supported (update to NaN and not functional)
81 manager.setSensorValueToNaN(instance);
82 }
83
84 if (pmode && device.master())
85 {
86 // Prevent mode changes
87 pmode->setMasterActive(false);
88 }
89 if (safeStateDelayTimer.isEnabled())
90 {
91 // stop safe delay timer
92 safeStateDelayTimer.setEnabled(false);
93 }
94
95 // Call into Manager to let know that we will unbind.
96 if (this->managerCallBack)
97 {
98 this->managerCallBack(instance, value);
99 }
100
101 // Stop watching for errors
102 removeErrorWatch();
103
104 // Set the device inactive
105 device.setActive(false);
106
107 // Clear throttles (OCC not active after disabling device)
108 updateThrottle(false, THROTTLED_ALL);
109 }
110 }
111 else if (value && !device.active())
112 {
113 // Existing error watch is on a dead file descriptor.
114 removeErrorWatch();
115
116 /*
117 * In it's constructor, Status checks Device::bound() to see if OCC is
118 * active or not.
119 * Device::bound() checks for occX-dev0 directory.
120 * We will lose occX-dev0 directories during FSI rescan.
121 * So, if we start this application (and construct Status), and then
122 * later do FSI rescan, we will end up with occActive = true and device
123 * NOT bound. Lets correct that situation here.
124 */
125 device.setActive(true);
126
127 // Add error watch again
128 try
129 {
130 addErrorWatch();
131 }
132 catch (const OpenFailure& e)
133 {
134 // Failed to add watch for throttle events, request reset to try to
135 // recover comm
136 lg2::error(
137 "Status::occActive: Unable to add error watch(s) again for OCC{INST} watch: {ERROR}",
138 "INST", instance, "ERROR", e.what());
139 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
140 return Base::Status::occActive(false);
141 }
142 }
143 else if (!value && device.active())
144 {
145 removeErrorWatch();
146
147 // In the event that the application never receives the active signal
148 // even though the OCC is active (this can occur if the BMC is rebooted
149 // with the host on, since the initial OCC driver probe will discover
150 // the OCCs), this application needs to be able to unbind the device
151 // when we get the OCC inactive signal.
152 device.setActive(false);
153 }
154 return Base::Status::occActive(value);
155 }
156
157 // Callback handler when a device error is reported.
deviceError(Error::Descriptor d)158 void Status::deviceError(Error::Descriptor d)
159 {
160 if (pmode && device.master())
161 {
162 // Prevent mode changes
163 pmode->setMasterActive(false);
164 }
165
166 if (d.log)
167 {
168 FFDC::createOCCResetPEL(instance, d.path, d.err, d.callout,
169 d.isInventoryCallout);
170 }
171
172 // This would deem OCC inactive
173 this->occActive(false);
174
175 // Reset the OCC
176 this->resetOCC();
177 }
178
179 // Sends message to host control command handler to reset OCC
resetOCC()180 void Status::resetOCC()
181 {
182 lg2::info(">>Status::resetOCC() - requesting reset for OCC{INST}", "INST",
183 instance);
184 this->occActive(false);
185 if (resetCallBack)
186 {
187 this->resetCallBack(instance);
188 }
189 }
190
191 // Handler called by Host control command handler to convey the
192 // status of the executed command
hostControlEvent(sdbusplus::message_t & msg)193 void Status::hostControlEvent(sdbusplus::message_t& msg)
194 {
195 std::string cmdCompleted{};
196 std::string cmdStatus{};
197
198 msg.read(cmdCompleted, cmdStatus);
199
200 lg2::debug("Host control signal values: command={CMD}, status={STATUS}",
201 "CMD", cmdCompleted, "STATUS", cmdStatus);
202
203 if (Control::Host::convertResultFromString(cmdStatus) !=
204 Control::Host::Result::Success)
205 {
206 if (Control::Host::convertCommandFromString(cmdCompleted) ==
207 Control::Host::Command::OCCReset)
208 {
209 // Must be a Timeout. Log an Error trace
210 lg2::error(
211 "Error resetting the OCC: path={PATH}, sensorid={SENSOR}",
212 "PATH", path, "SENSOR", std::get<0>(sensorMap.at(instance)));
213 }
214 }
215 return;
216 }
217
218 // Called from Manager::pollerTimerExpired() in preperation to POLL OCC.
readOccState()219 void Status::readOccState()
220 {
221 if (stateValid)
222 {
223 // Reset retry count (since state is good)
224 currentOccReadRetriesCount = occReadRetries;
225 }
226 occReadStateNow();
227 }
228
229 // Special processing that needs to happen once the OCCs change to ACTIVE state
occsWentActive()230 void Status::occsWentActive()
231 {
232 CmdStatus status = CmdStatus::SUCCESS;
233
234 // IPS data will get sent automatically after a mode change if the mode
235 // supports it.
236 pmode->needToSendIPS();
237
238 status = pmode->sendModeChange();
239 if (status != CmdStatus::SUCCESS)
240 {
241 lg2::error(
242 "Status::occsWentActive: OCC mode change failed with status {STATUS}",
243 "STATUS", status);
244
245 // Disable and reset to try recovering
246 deviceError();
247 }
248 }
249
250 // Send Ambient and Altitude to the OCC
sendAmbient(const uint8_t inTemp,const uint16_t inAltitude)251 CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude)
252 {
253 CmdStatus status = CmdStatus::FAILURE;
254 bool ambientValid = true;
255 uint8_t ambientTemp = inTemp;
256 uint16_t altitude = inAltitude;
257
258 if (ambientTemp == 0xFF)
259 {
260 // Get latest readings from manager
261 manager.getAmbientData(ambientValid, ambientTemp, altitude);
262 lg2::debug(
263 "sendAmbient: valid: {VALID}, Ambient: {TEMP}C, altitude: {ALT}m",
264 "VALID", ambientValid, "TEMP", ambientTemp, "ALT", altitude);
265 }
266
267 std::vector<std::uint8_t> cmd, rsp;
268 cmd.reserve(11);
269 cmd.push_back(uint8_t(CmdType::SEND_AMBIENT));
270 cmd.push_back(0x00); // Data Length (2 bytes)
271 cmd.push_back(0x08); //
272 cmd.push_back(0x00); // Version
273 cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status
274 cmd.push_back(ambientTemp); // Ambient Temperature
275 cmd.push_back(altitude >> 8); // Altitude in meters (2 bytes)
276 cmd.push_back(altitude & 0xFF); //
277 cmd.push_back(0x00); // Reserved (3 bytes)
278 cmd.push_back(0x00);
279 cmd.push_back(0x00);
280 lg2::debug("sendAmbient: SEND_AMBIENT "
281 "command to OCC{INST} ({SIZE} bytes)",
282 "INST", instance, "SIZE", cmd.size());
283 status = occCmd.send(cmd, rsp);
284 if (status == CmdStatus::SUCCESS)
285 {
286 if (rsp.size() == 5)
287 {
288 if (RspStatus::SUCCESS != RspStatus(rsp[2]))
289 {
290 lg2::error(
291 "sendAmbient: SEND_AMBIENT failed with rspStatus {STATUS}",
292 "STATUS", lg2::hex, rsp[2]);
293 dump_hex(rsp);
294 status = CmdStatus::FAILURE;
295 }
296 }
297 else
298 {
299 lg2::error(
300 "sendAmbient: INVALID SEND_AMBIENT response length:{SIZE}",
301 "SIZE", rsp.size());
302 dump_hex(rsp);
303 status = CmdStatus::FAILURE;
304 }
305 }
306 else
307 {
308 lg2::error("sendAmbient: SEND_AMBIENT FAILED! with status {STATUS}",
309 "STATUS", lg2::hex, uint8_t(status));
310
311 if (status == CmdStatus::COMM_FAILURE)
312 {
313 // Disable due to OCC comm failure and reset to try recovering
314 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
315 }
316 }
317
318 return status;
319 }
320
321 // Called when safe timer expires to determine if OCCs need to be reset
safeStateDelayExpired()322 void Status::safeStateDelayExpired()
323 {
324 if (this->occActive())
325 {
326 lg2::info(
327 "safeStateDelayExpired: OCC{INST} state missing or not valid, requesting reset",
328 "INST", instance);
329 // Disable and reset to try recovering
330 deviceError(Error::Descriptor(SAFE_ERROR_PATH));
331 }
332 }
333
getHwmonPath()334 fs::path Status::getHwmonPath()
335 {
336 using namespace std::literals::string_literals;
337
338 if (!fs::exists(hwmonPath))
339 {
340 static bool tracedFail[8] = {0};
341
342 if (!hwmonPath.empty())
343 {
344 lg2::warning(
345 "Status::getHwmonPath(): path no longer exists: {PATH}", "PATH",
346 hwmonPath);
347 hwmonPath.clear();
348 }
349
350 // Build the base HWMON path
351 fs::path prefixPath =
352 fs::path{OCC_HWMON_PATH + "occ-hwmon."s +
353 std::to_string(instance + 1) + "/hwmon/"s};
354
355 // Get the hwmonXX directory name
356 try
357 {
358 // there should only be one directory
359 const int numDirs = std::distance(
360 fs::directory_iterator(prefixPath), fs::directory_iterator{});
361 if (numDirs == 1)
362 {
363 hwmonPath = *fs::directory_iterator(prefixPath);
364 tracedFail[instance] = false;
365 }
366 else
367 {
368 if (!tracedFail[instance])
369 {
370 lg2::error(
371 "Status::getHwmonPath(): Found multiple ({NUM}) hwmon paths!",
372 "NUM", numDirs);
373 tracedFail[instance] = true;
374 }
375 }
376 }
377 catch (const fs::filesystem_error& e)
378 {
379 if (!tracedFail[instance])
380 {
381 lg2::error(
382 "Status::getHwmonPath(): error accessing {PATH}: {ERROR}",
383 "PATH", prefixPath, "ERROR", e.what());
384 tracedFail[instance] = true;
385 }
386 }
387 }
388
389 return hwmonPath;
390 }
391
392 // Called to read state and handle any errors
occReadStateNow()393 void Status::occReadStateNow()
394 {
395 unsigned int state;
396 const fs::path filename =
397 fs::path(DEV_PATH) /
398 fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
399
400 std::ifstream file;
401 bool stateWasRead = false;
402
403 // open file.
404 file.open(filename, std::ios::in);
405 const int openErrno = errno;
406
407 // File is open and state can be used.
408 if (file.is_open() && file.good())
409 {
410 stateWasRead = true;
411 file >> state;
412 // Read the error code (if any) to check status of the read
413 std::ios_base::iostate readState = file.rdstate();
414 if (readState)
415 {
416 // There was a failure reading the file
417 if (lastOccReadStatus != -1)
418 {
419 // Trace error bits
420 std::string errorBits = "";
421 if (readState & std::ios_base::eofbit)
422 {
423 errorBits += " EOF";
424 }
425 if (readState & std::ios_base::failbit)
426 {
427 errorBits += " failbit";
428 }
429 if (readState & std::ios_base::badbit)
430 {
431 errorBits += " badbit";
432 }
433 lg2::error(
434 "readOccState: Failed to read OCC{INST} state: Read error on I/O operation - {ERROR}",
435 "INST", instance, "ERROR", errorBits);
436 lastOccReadStatus = -1;
437 }
438 stateWasRead = false;
439 }
440
441 if (stateWasRead && (state != lastState))
442 {
443 // Trace OCC state changes
444 lg2::info(
445 "Status::readOccState: OCC{INST} state {STATE} (lastState: {PRIOR})",
446 "INST", instance, "STATE", lg2::hex, state, "PRIOR", lg2::hex,
447 lastState);
448 lastState = state;
449
450 if (OccState(state) == OccState::ACTIVE)
451 {
452 if (pmode && device.master())
453 {
454 // Set the master OCC on the PowerMode object
455 pmode->setMasterOcc(path);
456 // Enable mode changes
457 pmode->setMasterActive();
458
459 // Special processing by master OCC when it goes active
460 occsWentActive();
461 }
462
463 CmdStatus status = sendAmbient();
464 if (status != CmdStatus::SUCCESS)
465 {
466 lg2::error(
467 "readOccState: Sending Ambient failed with status {STATUS}",
468 "STATUS", status);
469 }
470 }
471
472 // If OCC in known Good State.
473 if ((OccState(state) == OccState::ACTIVE) ||
474 (OccState(state) == OccState::CHARACTERIZATION) ||
475 (OccState(state) == OccState::OBSERVATION))
476 {
477 // Good OCC State then sensors valid again
478 stateValid = true;
479 sensorsValid = true;
480
481 if (safeStateDelayTimer.isEnabled())
482 {
483 // stop safe delay timer (no longer in SAFE state)
484 safeStateDelayTimer.setEnabled(false);
485 }
486 }
487 else
488 {
489 // OCC is in SAFE or some other unsupported state
490 if (!safeStateDelayTimer.isEnabled())
491 {
492 lg2::error(
493 "readOccState: Invalid OCC{INST} state of {STATE} (last state: {PRIOR}), starting safe state delay timer",
494 "INST", instance, "STATE", lg2::hex, state, "PRIOR",
495 lg2::hex, lastState);
496 // start safe delay timer (before requesting reset)
497 using namespace std::literals::chrono_literals;
498 safeStateDelayTimer.restartOnce(60s);
499 }
500
501 if (sensorsValid)
502 {
503 sensorsValid = false;
504 // Sensors not supported (update to NaN and not functional)
505 manager.setSensorValueToNaN(instance);
506 }
507 }
508 }
509 }
510 else
511 {
512 // Unable to read state
513 stateValid = false;
514 }
515
516 file.close();
517
518 // if failed to read the OCC state -> Attempt retry
519 if (!stateWasRead)
520 {
521 if (sensorsValid)
522 {
523 sensorsValid = false;
524 manager.setSensorValueToNaN(instance);
525 }
526
527 // If not able to read, OCC may be offline
528 if (openErrno != lastOccReadStatus)
529 {
530 lg2::error(
531 "Status::readOccState: open/read failed trying to read OCC{INST} state (open errno={ERROR})",
532 "INST", instance, "ERROR", openErrno);
533 lastOccReadStatus = openErrno;
534 }
535
536 // See occReadRetries for number of retry attempts.
537 if (currentOccReadRetriesCount > 0)
538 {
539 --currentOccReadRetriesCount;
540 }
541 else
542 {
543 lg2::error(
544 "readOccState: failed to read OCC{INST} state! (last state: {PRIOR})",
545 "INST", instance, "PRIOR", lg2::hex, lastState);
546
547 // State could not be determined, set it to NO State.
548 lastState = 0;
549
550 // Disable the ability to send Failed actions until OCC is
551 // Active again.
552 stateValid = false;
553
554 // Disable due to OCC comm failure and reset to try recovering
555 // (processor callout will be added)
556 deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH, ECOMM,
557 procPath.c_str(), true));
558
559 // Reset retry count (for next attempt after recovery)
560 currentOccReadRetriesCount = occReadRetries;
561 }
562 }
563 else if (lastOccReadStatus != 0)
564 {
565 lg2::info("readOccState: successfully read OCC{INST} state: {STATE}",
566 "INST", instance, "STATE", state);
567 lastOccReadStatus = 0; // no error
568 }
569 }
570
571 // Update processor throttle status on dbus
updateThrottle(const bool isThrottled,const uint8_t newReason)572 void Status::updateThrottle(const bool isThrottled, const uint8_t newReason)
573 {
574 if (!throttleHandle)
575 {
576 return;
577 }
578
579 uint8_t newThrottleCause = throttleCause;
580
581 if (isThrottled) // throttled due to newReason
582 {
583 if ((newReason & throttleCause) == 0)
584 {
585 // set the bit(s) for passed in reason
586 newThrottleCause |= newReason;
587 }
588 // else no change
589 }
590 else // no longer throttled due to newReason
591 {
592 if ((newReason & throttleCause) != 0)
593 {
594 // clear the bit(s) for passed in reason
595 newThrottleCause &= ~newReason;
596 }
597 // else no change
598 }
599
600 if (newThrottleCause != throttleCause)
601 {
602 if (newThrottleCause == THROTTLED_NONE)
603 {
604 lg2::debug(
605 "updateThrottle: OCC{INST} no longer throttled (prior reason: {REASON})",
606 "INST", instance, "REASON", throttleCause);
607 throttleCause = THROTTLED_NONE;
608 throttleHandle->throttled(false);
609 throttleHandle->throttleCauses({});
610 }
611 else
612 {
613 lg2::debug(
614 "updateThrottle: OCC{INST} is throttled with reason {REASON} (prior reason: {PRIOR})",
615 "INST", instance, "REASON", newThrottleCause, "PRIOR",
616 throttleCause);
617 throttleCause = newThrottleCause;
618
619 std::vector<ThrottleObj::ThrottleReasons> updatedCauses;
620 if (throttleCause & THROTTLED_POWER)
621 {
622 updatedCauses.push_back(ThrottleReason::PowerLimit);
623 }
624 if (throttleCause & THROTTLED_THERMAL)
625 {
626 updatedCauses.push_back(ThrottleReason::ThermalLimit);
627 }
628 if (throttleCause & THROTTLED_SAFE)
629 {
630 updatedCauses.push_back(
631 ThrottleReason::ManagementDetectedFault);
632 }
633 throttleHandle->throttleCauses(updatedCauses);
634 throttleHandle->throttled(true);
635 }
636 }
637 // else no change to throttle status
638 }
639
640 // Get processor path associated with this OCC
readProcAssociation()641 void Status::readProcAssociation()
642 {
643 std::string managingPath = path + "/power_managing";
644 lg2::debug("readProcAssociation: getting endpoints for {MANAGE} ({PATH})",
645 "MANAGE", managingPath, "PATH", path);
646 try
647 {
648 utils::PropertyValue procPathProperty{};
649 procPathProperty = utils::getProperty(
650 managingPath, "xyz.openbmc_project.Association", "endpoints");
651 auto result = std::get<std::vector<std::string>>(procPathProperty);
652 if (result.size() > 0)
653 {
654 procPath = result[0];
655 lg2::info("readProcAssociation: OCC{INST} has proc={PATH}", "INST",
656 instance, "PATH", procPath);
657 }
658 else
659 {
660 lg2::error(
661 "readProcAssociation: No processor associated with OCC{INST} / {PATH}",
662 "INST", instance, "PATH", path);
663 }
664 }
665 catch (const sdbusplus::exception_t& e)
666 {
667 lg2::error(
668 "readProcAssociation: Unable to get proc assocated with {PATH} - {ERROR}",
669 "PATH", path, "ERROR", e.what());
670 procPath = {};
671 }
672 }
673
674 } // namespace occ
675 } // namespace open_power
676