xref: /openbmc/openbmc-test-automation/systest/gpu_stress_test.robot (revision 162c52ac0eaf066d38319c7a4bef14440a0938a6)
*** Settings ***
Documentation     Stress the system GPUs using the HTX exerciser.

# Test Parameters:
# OPENBMC_HOST        The BMC host name or IP address.
# OS_HOST             The OS host name or IP address.
# OS_USERNAME         The OS login userid (usually "root").
# OS_PASSWORD         The password for the OS login.
# HTX_DURATION        Duration of the HTX run, for example, "2h" or "30m".
# HTX_LOOP            The number of times to loop HTX.
# HTX_INTERVAL        The time delay between consecutive checks of HTX
#                     status, for example, "15m".
#                     In summary: run HTX for $HTX_DURATION, looping
#                     $HTX_LOOP times and checking for errors every
#                     $HTX_INTERVAL.  Then allow extra time for OS
#                     boot, HTX startup and shutdown.
# HTX_KEEP_RUNNING    If set to 1, HTX is left running after an error
#                     was found.

Resource          ../lib/os_utilities.robot

# SOL console logging is started once for the whole suite; it is stopped
# again in the test teardown.
Suite Setup       Start SOL Console Logging
Test Setup        Test Setup Execution
Test Teardown     Test Teardown Execution

Test Tags         GPU_Stress
28
*** Variables ***

# Defaults for the test parameters; each may be overridden from the
# command line with "-v NAME:value".

# How long a single HTX run lasts.
${HTX_DURATION}      1h
# Number of stress iterations (passed to "Repeat Keyword").
${HTX_LOOP}          ${1}
# Delay between consecutive HTX status checks.
${HTX_INTERVAL}      30m
# 0 = shut HTX down in teardown after a failure, non-zero = leave it running.
${HTX_KEEP_RUNNING}  ${0}
# NOTE(review): not referenced directly in this file; presumably read by the
# power on/off keywords in the imported resources - confirm.
${stack_mode}        skip
36
*** Test Cases ***

GPU Stress Test
    [Documentation]  Stress the GPU using HTX exerciser.
    [Tags]  GPU_Stress_Test

    # Ask the BMC how many GPUs it sees.
    ${num_bmc_gpus}=  Count GPUs From BMC
    Rpvars  num_bmc_gpus

    # The OS count (stored as a suite variable by the test setup) must
    # match the BMC count.
    ${mismatch_msg}=  Catenate  OS reports ${num_os_gpus} GPUs, but BMC
    ...  reports ${num_bmc_gpus} present and functional GPUs.
    IF  '${num_os_gpus}' != '${num_bmc_gpus}'
        Fail  msg=${mismatch_msg}
    END

    # Display the HTX stress test parameters.
    Printn
    Rpvars  HTX_DURATION  HTX_LOOP  HTX_INTERVAL

    # Reset the iteration (loop) counter; "Execute GPU Test" increments it.
    Set Suite Variable  ${iteration}  ${0}  children=true

    # Stop any HTX instance that is already running before we start.
    ${htx_running}=  Is HTX Running
    IF  '${htx_running}' == 'True'
        Shutdown HTX Exerciser
    END

    Repeat Keyword  ${HTX_LOOP} times  Execute GPU Test
65
66
*** Keywords ***

Execute GPU Test
    [Documentation]  Run one iteration of the GPU stress test: power on, run
    ...  the HTX exerciser, check for errors and GPU limits, then power off.
    # Test Flow:
    #              - Power on
    #              - Establish SSH connection session
    #              - Collect GPU nvidia status output
    #              - Create HTX mdt profile
    #              - Run GPU specific HTX exerciser
    #              - Check for errors
    #              - Compare GPU power, temperature and clock to their limits
    #              - Shut down HTX and power off

    # Bump the suite-level iteration counter and announce the iteration.
    Set Suite Variable  ${iteration}  ${iteration + 1}
    ${loop_count}=  Catenate  Starting iteration: ${iteration}
    Printn
    Rpvars  loop_count

    REST Power On  stack_mode=skip
    Run Key U  Sleep \ 15s

    # Collect data before the test starts.
    Collect NVIDIA Log File  start

    # Collect NVIDIA maximum limits.
    ${power_max}=  Get GPU Power Limit
    ${temperature_max}=  Get GPU Temperature Limit
    ${clock_max}=  Get GPU Clock Limit

    # Only the 'mdt.bu' profile is created here; any other profile name is
    # used as-is by "Run MDT Profile".
    IF  '${HTX_MDT_PROFILE}' == 'mdt.bu'
        Create Default MDT Profile
    END

    Run MDT Profile

    Loop HTX Health Check

    # Post test loop look out for dmesg error logged.
    Check For Errors On OS Dmesg Log

    # Check NVIDIA power, temperature, and clocks.
    ${power}=  Get GPU Max Power
    ${temperature}=  Get GPU Max Temperature
    ${temperature_via_rest}=  Get GPU Temperature Via REST
    ${clock}=  Get GPU Clock
    Printn
    Rpvars  power  power_max  temperature  temperature_via_rest
    ...  temperature_max  clock  clock_max

    IF  ${power} > ${power_max}
        Fail  msg=GPU Power ${power} exceeds limit of ${power_max}.
    END

    ${temperature_msg}=  Catenate  GPU temperature of ${temperature} exceeds limit
    ...  of ${temperature_max}.
    IF  ${temperature} > ${temperature_max}
        Fail  msg=${temperature_msg}
    END

    IF  ${clock} > ${clock_max}
        Fail  msg=GPU clock of ${clock} exceeds limit of ${clock_max}.
    END

    # The temperature read via REST and the one read on the OS must agree
    # to within +/- 5 degrees.
    ${rest_delta_msg}=  Catenate  The GPU temperature reported by REST is not within
    ...  5 degrees of the nvidia_smi reported temperature.
    ${upper_limit}=  Evaluate  ${temperature_via_rest}+5
    ${lower_limit}=  Evaluate  ${temperature_via_rest}-5

    IF  ${temperature} > ${upper_limit} or ${temperature} < ${lower_limit}
        Fail  msg=${rest_delta_msg}
    END

    Shutdown HTX Exerciser

    # Collect data after the test, then make sure no error logs were created.
    Collect NVIDIA Log File  end
    Error Logs Should Not Exist
    REST Power Off

    Flush REST Sessions

    Print Timen  HTX Test ran for: ${HTX_DURATION}

    ${loop_count}=  Catenate  Ending iteration: ${iteration}
    Printn
    Rpvars  loop_count
146
147
Loop HTX Health Check
    [Documentation]  Repeatedly check the HTX run status, sleeping
    ...  HTX_INTERVAL after each check, for a total of HTX_DURATION.

    # Each repetition is one status check followed by one sleep.
    Repeat Keyword  ${HTX_DURATION}  Run Keywords
    ...  Check HTX Run Status  AND  Sleep  ${HTX_INTERVAL}
154
155
Test Setup Execution
    [Documentation]  Power on, clear error logs, verify the required tools
    ...  exist and record the number of GPUs seen by the OS.

    REST Power On  stack_mode=skip
    Run Key U  Sleep \ 15s
    Delete All Error Logs

    # The test depends on these tools.
    Tool Exist  lspci
    Tool Exist  htxcmdline
    Tool Exist  nvidia-smi

    # Count the NVIDIA devices listed by lspci on the OS.
    ${count_cmd}=  Set Variable  lspci | grep NVIDIA | wc -l
    ${num_os_gpus}  ${err_output}  ${ret_code}=  OS Execute Command  ${count_cmd}
    Printn
    Rpvars  num_os_gpus

    # Without any GPU there is nothing to stress.
    IF  '${num_os_gpus}' == '${0}'
        Fail  msg=No GPUs detected so cannot run test.
    END

    # Publish the OS GPU count so the test case can compare it to the BMC's.
    Set Suite Variable  ${num_os_gpus}  children=true
176
177
178
Test Teardown Execution
    [Documentation]  Clean up after the test: optionally stop HTX, stop SOL
    ...  console logging, collect FFDC on failure and close connections.

    # On failure HTX is shut down here unless the user asked for it to be
    # left running by setting HTX_KEEP_RUNNING to a non-zero value.
    IF  '${TEST_STATUS}' == 'FAIL' and ${HTX_KEEP_RUNNING} == ${0}
        Shutdown HTX Exerciser
    END

    # Build the "Stop SOL Console Logging" call as a single string for
    # "Run Key"; the escaped space plus the Catenate separator form the
    # two-space gap between the keyword name and its argument.
    ${stop_sol_cmd}=  Catenate  Stop SOL Console Logging
    ...  \ targ_file_path=${EXECDIR}${/}logs${/}SOL.log
    Run Key  ${stop_sol_cmd}

    FFDC On Test Case Fail
    Close All Connections
191