xref: /openbmc/openbmc-test-automation/systest/gpu_stress_test.robot (revision 372cd862147895d4650208802617dc99a5bbdcd5)
*** Settings ***
Documentation    Stress the system GPUs using the HTX exerciser.

# Parameters understood by this suite:
# OPENBMC_HOST        Host name or IP address of the BMC.
# OS_HOST             Host name or IP address of the OS.
# OS_USERNAME         Userid for the OS login (usually "root").
# OS_PASSWORD         Password for the OS login.
# HTX_DURATION        How long each HTX run lasts, e.g. "2h" or "30m".
# HTX_LOOP            How many times the HTX run is repeated.
# HTX_INTERVAL        Delay between consecutive HTX status checks,
#                     e.g. "15m".
#                     Put together: HTX runs for $HTX_DURATION, this is
#                     repeated $HTX_LOOP times, and errors are checked
#                     every $HTX_INTERVAL.  Allow extra time on top of
#                     that for OS boot, HTX startup and shutdown.
# HTX_KEEP_RUNNING    When set to 1, HTX is left running after an error
#                     has been found.
# HTX_MDT_PROFILE     Compared against "mdt.bu" by "Execute GPU Test" to
#                     decide whether a default MDT profile is created.
#                     NOTE(review): no default is set in this file -
#                     presumably supplied by a resource file or on the
#                     command line; confirm.


Resource         ../lib/os_utilities.robot

# "Start SOL Console Logging" is named directly; wrapping it in
# "Run Keyword" executes the same keyword.
Suite Setup      Start SOL Console Logging
Test Setup       Test Setup Execution
Test Teardown    Test Teardown Execution

Force Tags       GPU_Stress
28
*** Variables ***

# Default values for the test parameters described in the Settings
# comments.

# Duration of each HTX run.
${HTX_DURATION}      1h
# Number of times "Execute GPU Test" is repeated.
${HTX_LOOP}          ${1}
# Delay between consecutive HTX status checks.
${HTX_INTERVAL}      30m
# 0 = shut HTX down in teardown after a failure, 1 = leave it running.
${HTX_KEEP_RUNNING}  ${0}
# NOTE(review): not referenced by name elsewhere in this file; presumably
# read as a default by the power on/off keywords - confirm.
${stack_mode}        skip
36
37*** Test Cases ***
38
GPU Stress Test
    [Documentation]  Stress the GPU using HTX exerciser.
    [Tags]  GPU_Stress_Test

    # Ask the BMC how many GPUs it reports.
    ${num_bmc_gpus}=  Count GPUs From BMC
    Rpvars  num_bmc_gpus

    # The OS count (stored as a suite variable by "Test Setup Execution")
    # must agree with the BMC count, otherwise there is no point going on.
    ${gpu_mismatch_msg}=  Catenate  OS reports ${num_os_gpus} GPUs, but
    ...  BMC reports ${num_bmc_gpus} present and functional GPUs.
    Run Keyword If  '${num_os_gpus}' != '${num_bmc_gpus}'  Fail
    ...  msg=${gpu_mismatch_msg}

    # Print the HTX parameters in effect for this run.
    Printn
    Rpvars  HTX_DURATION  HTX_LOOP  HTX_INTERVAL

    # Iteration counter, incremented by "Execute GPU Test" on every pass.
    Set Suite Variable  ${iteration}  ${0}  children=true

    # Stop any HTX instance that is still running before starting the
    # stress loop.
    ${status}=  Is HTX Running
    Run Keyword If  '${status}' == 'True'  Shutdown HTX Exerciser

    Repeat Keyword  ${HTX_LOOP} times  Execute GPU Test
67
68
69*** Keywords ***
70
Execute GPU Test
    [Documentation]  Run one HTX GPU stress iteration and check the GPU
    ...  power, temperature and clock readings against their limits.
    # One iteration does the following, in order:
    #   - Bump and print the iteration counter
    #   - Power on
    #   - Collect the NVIDIA log and the GPU power/temperature/clock limits
    #   - Create the default MDT profile when HTX_MDT_PROFILE is "mdt.bu"
    #   - Run the MDT profile and loop on the HTX status check
    #   - Check dmesg, then compare the GPU readings with the limits
    #   - Shut HTX down, collect the NVIDIA log again, check error logs
    #   - Power off and flush the REST sessions

    Set Suite Variable  ${iteration}  ${iteration + 1}
    ${loop_count}=  Set Variable  Starting iteration: ${iteration}
    Printn
    Rpvars  loop_count

    REST Power On  stack_mode=skip
    Run Key U  Sleep \ 15s

    # Snapshot of the NVIDIA data before the exerciser starts.
    Collect NVIDIA Log File  start

    # Limits that the readings taken after the run are compared against.
    ${power_max}=  Get GPU Power Limit
    ${temperature_max}=  Get GPU Temperature Limit
    ${clock_max}=  Get GPU Clock Limit

    Run Keyword If  '${HTX_MDT_PROFILE}' == 'mdt.bu'  Create Default MDT Profile

    Run MDT Profile

    Loop HTX Health Check

    # After the HTX loop, look for errors in the OS dmesg log.
    Check For Errors On OS Dmesg Log

    # Readings taken after the run.
    ${power}=  Get GPU Max Power
    ${temperature}=  Get GPU Max Temperature
    ${temperature_via_rest}=  Get GPU Temperature Via REST
    ${clock}=  Get GPU Clock
    Printn
    Rpvars  power  power_max  temperature  temperature_via_rest
    ...  temperature_max  clock  clock_max

    # Each reading must not exceed its limit.
    ${power_msg}=  Catenate  GPU Power ${power} exceeds limit
    ...  of ${power_max}.
    Run Keyword If  ${power} > ${power_max}  Fail  msg=${power_msg}

    ${temperature_msg}=  Catenate  GPU temperature of ${temperature} exceeds
    ...  limit of ${temperature_max}.
    Run Keyword If  ${temperature} > ${temperature_max}  Fail
    ...  msg=${temperature_msg}

    ${clock_msg}=  Catenate  GPU clock of ${clock} exceeds limit
    ...  of ${clock_max}.
    Run Keyword If  ${clock} > ${clock_max}  Fail  msg=${clock_msg}

    # The REST reading and the nvidia_smi reading must be within 5 degrees
    # of each other (boundaries included).
    ${rest_delta_msg}=  Catenate  The GPU temperature reported by REST is not
    ...  within 5 degrees of the nvidia_smi reported temperature.
    ${within_range}=  Evaluate
    ...  ${temperature_via_rest}-5 <= ${temperature} <= ${temperature_via_rest}+5
    Run Keyword If  not ${within_range}  Fail  msg=${rest_delta_msg}

    Shutdown HTX Exerciser

    # Snapshot of the NVIDIA data after the run, then check error logs.
    Collect NVIDIA Log File  end
    Error Logs Should Not Exist
    REST Power Off

    Flush REST Sessions

    Print Timen  HTX Test ran for: ${HTX_DURATION}

    ${loop_count}=  Set Variable  Ending iteration: ${iteration}
    Printn
    Rpvars  loop_count
147
148
Loop HTX Health Check
    [Documentation]  For HTX_DURATION, repeatedly check the HTX run status
    ...  and then sleep for HTX_INTERVAL.

    # A failing "Check HTX Run Status" ends the repetition early.
    Repeat Keyword  ${HTX_DURATION}  Run Keywords
    ...  Check HTX Run Status  AND  Sleep  ${HTX_INTERVAL}
155
156
Test Setup Execution
    [Documentation]  Power on, delete old error logs, run the tool checks
    ...  and record how many GPUs the OS reports.

    REST Power On  stack_mode=skip
    Run Key U  Sleep \ 15s
    Delete All Error Logs

    # Tools used by this suite.
    Tool Exist  lspci
    Tool Exist  htxcmdline
    Tool Exist  nvidia-smi

    # Count the NVIDIA devices listed by lspci on the OS.
    ${num_os_gpus}  ${stderr}  ${rc}=  OS Execute Command
    ...  lspci | grep NVIDIA | wc -l
    Printn
    Rpvars  num_os_gpus

    # Without any GPU there is nothing to stress, so stop here.
    Run Keyword If  '${num_os_gpus}' == '${0}'
    ...  Fail  msg=No GPUs detected so cannot run test.

    # Publish the count so the test case can compare it with the BMC count.
    Set Suite Variable  ${num_os_gpus}  children=true
178
179
180
Test Teardown Execution
    [Documentation]  Clean up after the test: optionally stop HTX, stop SOL
    ...  console logging, collect FFDC and close connections.

    # After a failed test HTX is shut down here, unless the user asked for
    # it to keep running by setting HTX_KEEP_RUNNING to 1.
    Run Keyword If
    ...  '${TEST_STATUS}' == 'FAIL' and ${HTX_KEEP_RUNNING} == ${0}
    ...  Shutdown HTX Exerciser

    # Build the keyword string for "Run Key"; the escaped space plus the
    # space added by Catenate separates the keyword from its argument.
    ${stop_sol_buf}=  Catenate  Stop SOL Console Logging
    ...  \ targ_file_path=${EXECDIR}${/}logs${/}SOL.log
    Run Key  ${stop_sol_buf}

    FFDC On Test Case Fail
    Close All Connections
194