xref: /openbmc/openbmc-test-automation/systest/gpu_stress_test.robot (revision 286e2f07a1261ad043902a6db05d39429a31a1d4)
1*** Settings ***
2Documentation    Stress the system GPUs using the HTX exerciser.
3
4# Test Parameters:
5# OPENBMC_HOST        The BMC host name or IP address.
6# OS_HOST             The OS host name or IP Address.
7# OS_USERNAME         The OS login userid (usually "root").
8# OS_PASSWORD         The password for the OS login.
9# HTX_DURATION        Duration of HTX run, for example, "2h", or "30m".
10# HTX_LOOP            The number of times to loop HTX.
11# HTX_INTERVAL        The time delay between consecutive checks of HTX
12#                     status, for example, "15m".
13#                     In summary: Run HTX for $HTX_DURATION, looping
14#                     $HTX_LOOP times checking for errors every
15#                     $HTX_INTERVAL.  Then allow extra time for OS
16#                     Boot, HTX startup, shutdown.
17# HTX_KEEP_RUNNING    If set to 1, this indicates that the HTX is to
18#                     continue running after an error was found.
19
20
21Resource         ../lib/os_utilities.robot
22
23Suite Setup      Run Keyword  Start SOL Console Logging
24Test Setup       Test Setup Execution
25Test Teardown    Test Teardown Execution
26
27Test Tags       GPU_Stress
28
29*** Variables ****
30
31${HTX_DURATION}      1h
32${HTX_LOOP}          ${1}
33${HTX_INTERVAL}      30m
34${HTX_KEEP_RUNNING}  ${0}
35${stack_mode}        skip
36
37*** Test Cases ***
38
39GPU Stress Test
40    [Documentation]  Stress the GPU using HTX exerciser.
41    [Tags]  GPU_Stress_Test
42
43    # Get number of GPU reported by the BMC.
44    ${num_bmc_gpus}=  Count GPUs From BMC
45    Rpvars  num_bmc_gpus
46
47    # The BMC and OS should report the same number of GPUs.
48    ${failmsg01}=  Catenate  OS reports ${num_os_gpus} GPUs, but BMC
49    ...  reports ${num_bmc_gpus} present and functional GPUs.
50    IF  '${num_os_gpus}' != '${num_bmc_gpus}'  Fail  msg=${failmsg01}
51
52    # Show parameters for HTX stress test.
53    Printn
54    Rpvars  HTX_DURATION  HTX_LOOP  HTX_INTERVAL
55
56    # Set the iteration (loop) counter.
57    Set Suite Variable  ${iteration}  ${0}  children=true
58
59
60    # Shutdown HTX if it is already running.
61    ${status}=  Is HTX Running
62    IF  '${status}' == 'True'  Shutdown HTX Exerciser
63
64    Repeat Keyword  ${HTX_LOOP} times  Execute GPU Test
65
66
67*** Keywords ***
68
69Execute GPU Test
70    [Documentation]  Start HTX exerciser.
71    # Test Flow:
72    #              - Power on
73    #              - Establish SSH connection session
74    #              - Collect GPU nvidia status output
75    #              - Create HTX mdt profile
76    #              - Run GPU specific HTX exerciser
77    #              - Check for errors
78
79    Set Suite Variable  ${iteration}  ${iteration + 1}
80    ${loop_count}=  Catenate  Starting iteration: ${iteration}
81    Printn
82    Rpvars  loop_count
83
84    REST Power On  stack_mode=skip
85    Run Key U  Sleep \ 15s
86
87    # Collect data before the test starts.
88    Collect NVIDIA Log File  start
89
90    # Collect NVIDIA maximum limits.
91    ${power_max}=  Get GPU Power Limit
92    ${temperature_max}=  Get GPU Temperature Limit
93    ${clock_max}=  Get GPU Clock Limit
94
95    IF  '${HTX_MDT_PROFILE}' == 'mdt.bu'  Create Default MDT Profile
96
97    Run MDT Profile
98
99    Loop HTX Health Check
100
101    # Post test loop look out for dmesg error logged.
102    Check For Errors On OS Dmesg Log
103
104    # Check NVIDIA power, temperature, and clocks.
105    ${power}=  Get GPU Max Power
106    ${temperature}=  Get GPU Max Temperature
107    ${temperature_via_rest}=  Get GPU Temperature Via REST
108    ${clock}=  Get GPU Clock
109    Printn
110    Rpvars  power  power_max  temperature  temperature_via_rest
111    ...  temperature_max  clock  clock_max
112
113    IF  ${power} > ${power_max}
114        Fail  msg=GPU Power ${power} exceeds limit of ${power_max}.
115    END
116
117    ${err_msg}=  Catenate  GPU temperature of ${temperature} exceeds limit
118    ...  of ${temperature_max}.
119    IF  ${temperature} > ${temperature_max}  Fail  msg=${err_msg}
120
121    IF  ${clock} > ${clock_max}  Fail  msg=GPU clock of ${clock} exceeds limit of ${clock_max}.
122
123    ${err_msg}=  Catenate  The GPU temperature reported by REST is not within
124    ...  5 degrees of the nvidia_smi reported temperature.
125    ${upper_limit}=  Evaluate  ${temperature_via_rest}+5
126    ${lower_limit}=  Evaluate  ${temperature_via_rest}-5
127
128    IF  ${temperature} > ${upper_limit} or ${temperature} < ${lower_limit}
129        Fail  msg=${err_msg}
130    END
131
132    Shutdown HTX Exerciser
133
134    Collect NVIDIA Log File  end
135    Error Logs Should Not Exist
136    REST Power Off
137
138    Flush REST Sessions
139
140    Print Timen  HTX Test ran for: ${HTX_DURATION}
141
142    ${loop_count}=  Catenate  Ending iteration: ${iteration}
143    Printn
144    Rpvars  loop_count
145
146
147Loop HTX Health Check
148    [Documentation]  Run until HTX exerciser fails.
149
150    Repeat Keyword  ${HTX_DURATION}
151    ...  Run Keywords  Check HTX Run Status
152    ...  AND  Sleep  ${HTX_INTERVAL}
153
154
155Test Setup Execution
156    [Documentation]  Do the initial test setup.
157
158    REST Power On  stack_mode=skip
159    Run Key U  Sleep \ 15s
160    Delete All Error Logs
161    Tool Exist  lspci
162    Tool Exist  htxcmdline
163    Tool Exist  nvidia-smi
164
165    # Get number of GPUs reported by the OS.
166    ${cmd}=  Catenate  lspci | grep NVIDIA | wc -l
167    ${num_os_gpus}  ${stderr}  ${rc}=  OS Execute Command  ${cmd}
168    Printn
169    Rpvars  num_os_gpus
170
171    # If no GPUs detected, we cannot continue.
172    IF  '${num_os_gpus}' == '${0}'  Fail  msg=No GPUs detected so cannot run test.
173
174    Set Suite Variable  ${num_os_gpus}  children=true
175
176
177
178Test Teardown Execution
179    [Documentation]  Do the post test teardown.
180
181    # Keep HTX running if user set HTX_KEEP_RUNNING to 1.
182    IF  '${TEST_STATUS}' == 'FAIL' and ${HTX_KEEP_RUNNING} == ${0}  Shutdown HTX Exerciser
183
184    ${keyword_buf}=  Catenate  Stop SOL Console Logging
185    ...  \ targ_file_path=${EXECDIR}${/}logs${/}SOL.log
186    Run Key  ${keyword_buf}
187
188    FFDC On Test Case Fail
189    Close All Connections
190