xref: /openbmc/openbmc-test-automation/systest/gpu_stress_test.robot (revision bfa16ee4f68964bd5dd20618cb3b293584b78c69)
1*** Settings ***
2Documentation    Stress the system GPUs using the HTX exerciser.
3
4# Test Parameters:
5# OPENBMC_HOST        The BMC host name or IP address.
# OS_HOST             The OS host name or IP address.
7# OS_USERNAME         The OS login userid (usually "root").
8# OS_PASSWORD         The password for the OS login.
9# HTX_DURATION        Duration of HTX run, for example, "2h", or "30m".
10# HTX_LOOP            The number of times to loop HTX.
# HTX_INTERVAL        The time delay between consecutive checks of HTX
#                     status, for example, "15m".
#                     In summary: HTX runs for $HTX_DURATION and is checked
#                     for errors every $HTX_INTERVAL; that whole run is
#                     repeated $HTX_LOOP times.  Allow extra time for OS
#                     boot, HTX startup, and HTX shutdown.
# HTX_KEEP_RUNNING    If set to 1, HTX is left running after an error is
#                     found.
19
20
21Resource         ../syslib/utils_os.robot
22
23Suite Setup      Run Keyword  Start SOL Console Logging
24Test Setup       Test Setup Execution
25Test Teardown    Test Teardown Execution
26
*** Variables ***

# Default values for the test parameters described in the Settings comments.
${HTX_DURATION}      1h
${HTX_LOOP}          ${1}
${HTX_INTERVAL}      30m
${HTX_KEEP_RUNNING}  ${0}
# Same value that this file passes as stack_mode= to "REST Power On".
${stack_mode}        skip
34
35*** Test Cases ***
36
GPU Stress Test
    [Documentation]  Confirm that the BMC and the OS agree on the GPU count,
    ...  then run the HTX GPU stress iteration HTX_LOOP times.
    [Tags]  GPU_Stress_Test

    # Ask the BMC how many GPUs it reports and print the value.
    ${num_bmc_gpus}=  Count GPUs From BMC
    Rpvars  num_bmc_gpus

    # ${num_os_gpus} is the suite variable stored by "Test Setup Execution".
    # Any difference between the two counts fails the test.
    ${mismatch_msg}=  Catenate  OS reports ${num_os_gpus} GPUs, but BMC
    ...  reports ${num_bmc_gpus} present and functional GPUs.
    Run Keyword If  '${num_bmc_gpus}' != '${num_os_gpus}'
    ...  Fail  msg=${mismatch_msg}

    # Print the HTX run parameters.
    Rprintn
    Rpvars  HTX_DURATION  HTX_LOOP  HTX_INTERVAL

    # Start the iteration counter at zero; "Execute GPU Test" increments it.
    Set Suite Variable  ${iteration}  ${0}  children=true

    # If "Is HTX Running" passes, shut HTX down before starting the loop.
    ${htx_active}=  Run Keyword And Return Status  Is HTX Running
    Run Keyword If  ${htx_active}  Shutdown HTX Exerciser

    # Run one full stress iteration per requested loop.
    Repeat Keyword  ${HTX_LOOP} times  Execute GPU Test
65
66
67*** Keywords ***
68
Execute GPU Test
    [Documentation]  Run one HTX GPU stress iteration: power on, start HTX,
    ...  monitor it, check GPU readings against limits, shut down, power off.
    # Test Flow:
    #              - Power on
    #              - Collect NVIDIA log data and GPU limits
    #              - Create the default HTX mdt profile when required
    #              - Run the HTX mdt profile and monitor HTX status
    #              - Check dmesg, GPU readings and error logs
    #              - Shut down HTX and power off

    # Advance the suite-wide iteration counter and announce the iteration.
    Set Suite Variable  ${iteration}  ${iteration + 1}
    ${loop_count}=  Catenate  Starting iteration: ${iteration}
    Rprintn
    Rpvars  loop_count

    REST Power On  stack_mode=skip

    # Capture NVIDIA data before the stress run begins.
    Collect NVIDIA Log File  start

    # Read the GPU limits that the readings are compared against below.
    ${power_max}=  Get GPU Power Limit
    ${temperature_max}=  Get GPU Temperature Limit
    ${clock_max}=  Get GPU Clock Limit

    # Only the 'mdt.bu' profile triggers creation of the default profile.
    Run Keyword If  '${HTX_MDT_PROFILE}' == 'mdt.bu'
    ...  Create Default MDT Profile

    Run MDT Profile

    Loop HTX Health Check

    # After the monitoring loop, look for errors in the OS dmesg log.
    Check For Errors On OS Dmesg Log

    # Read the current GPU power, temperature and clock, then print them
    # next to their limits.
    ${power}=  Get GPU Power
    ${temperature}=  Get GPU Temperature
    ${clock}=  Get GPU Clock
    Rprintn
    Rpvars  power  power_max  temperature  temperature_max  clock  clock_max

    # Any reading above its limit fails the test.
    Run Keyword If  ${power} > ${power_max}
    ...  Fail  msg=GPU Power ${power} exceeds limit of ${power_max}.
    Run Keyword If  ${temperature} > ${temperature_max}  Fail
    ...  msg=GPU temperature of ${temperature} exceeds limit of ${temperature_max}.
    Run Keyword If  ${clock} > ${clock_max}
    ...  Fail  msg=GPU clock of ${clock} exceeds limit of ${clock_max}.

    Shutdown HTX Exerciser

    # Capture NVIDIA data again, then require that no error logs exist.
    Collect NVIDIA Log File  end
    Error Logs Should Not Exist
    REST Power Off

    Flush REST Sessions

    Rprint Timen  HTX Test ran for: ${HTX_DURATION}

    # Announce the end of this iteration.
    ${loop_count}=  Catenate  Ending iteration: ${iteration}
    Rprintn
    Rpvars  loop_count
131
132
Loop HTX Health Check
    [Documentation]  For HTX_DURATION, alternate between checking the HTX
    ...  run status and sleeping for HTX_INTERVAL.

    # HTX_DURATION is a time string (for example "1h"), so Repeat Keyword
    # re-runs the check-then-sleep pair until that much time has passed.
    # A failing status check propagates and ends the loop.
    Repeat Keyword  ${HTX_DURATION}  Run Keywords
    ...  Check HTX Run Status  AND  Sleep  ${HTX_INTERVAL}
139
140
Test Setup Execution
    [Documentation]  Power on, clear error logs, check the required tools
    ...  and store the GPU count reported by the OS.

    REST Power On  stack_mode=skip
    Delete All Error Logs

    # Commands this test relies on (presumably checked on the OS host;
    # confirm against the definition of "Tool Exist").
    Tool Exist  lspci
    Tool Exist  htxcmdline
    Tool Exist  nvidia-smi

    # Count the lspci output lines that mention NVIDIA; the command's stdout
    # is taken as the OS GPU count.
    ${num_os_gpus}  ${cmd_stderr}  ${cmd_rc}=  OS Execute Command
    ...  lspci | grep NVIDIA | wc -l
    Rprintn
    Rpvars  num_os_gpus

    # A count of zero means there is nothing to stress, so stop here.
    Run Keyword If  '${num_os_gpus}' == '0'
    ...  Fail  msg=No GPUs detected so cannot run test.

    # Publish the count so the test case can compare it with the BMC count.
    Set Suite Variable  ${num_os_gpus}  children=true
161
162
163
Test Teardown Execution
    [Documentation]  Clean up after the test: stop HTX when appropriate,
    ...  stop SOL logging, collect FFDC on failure and close connections.

    # On failure HTX is shut down unless the user set HTX_KEEP_RUNNING to a
    # non-zero value (for example 1) to leave it running.
    Run Keyword If
    ...  '${TEST_STATUS}' == 'FAIL' and ${HTX_KEEP_RUNNING} == ${0}
    ...  Shutdown HTX Exerciser

    # Build the keyword string handed to "Run Key".  Catenate joins with one
    # space and the escaped leading space adds a second, giving the two-space
    # gap between the keyword name and its argument.
    ${stop_sol_cmd}=  Catenate  Stop SOL Console Logging
    ...  \ targ_file_path=${EXECDIR}${/}logs${/}SOL.log
    Run Key  ${stop_sol_cmd}

    FFDC On Test Case Fail
    Close All Connections
177