1*** Settings ***
2Documentation    Stress the system GPUs using the HTX exerciser.
3
4# Test Parameters:
5# OPENBMC_HOST        The BMC host name or IP address.
6# OS_HOST             The OS host name or IP Address.
7# OS_USERNAME         The OS login userid (usually "root").
8# OS_PASSWORD         The password for the OS login.
9# HTX_DURATION        Duration of HTX run, for example, "2h", or "30m".
10# HTX_LOOP            The number of times to loop HTX.
11# HTX_INTERVAL        The time delay between consecutive checks of HTX
12#                     status, for example, "15m".
13#                     In summary: Run HTX for $HTX_DURATION, looping
14#                     $HTX_LOOP times checking for errors every
15#                     $HTX_INTERVAL.  Then allow extra time for OS
16#                     Boot, HTX startup, shutdown.
17# HTX_KEEP_RUNNING    If set to 1, this indicates that the HTX is to
18#                     continue running after an error was found.
19
20
21Resource         ../syslib/utils_os.robot
22
23Suite Setup      Run Keyword  Start SOL Console Logging
24Test Setup       Test Setup Execution
25Test Teardown    Test Teardown Execution
26
27*** Variables ****
28
29${HTX_DURATION}      1h
30${HTX_LOOP}          ${1}
31${HTX_INTERVAL}      30m
32${HTX_KEEP_RUNNING}  ${0}
33${stack_mode}        skip
34
35*** Test Cases ***
36
37GPU Stress Test
38    [Documentation]  Stress the GPU using HTX exerciser.
39    [Tags]  GPU_Stress_Test
40
41    # Get number of GPU reported by the BMC.
42    ${num_bmc_gpus}=  Count GPUs From BMC
43    Rpvars  num_bmc_gpus
44
45    # The BMC and OS should report the same number of GPUs.
46    ${failmsg01}=  Catenate  OS reports ${num_os_gpus} GPUs, but BMC
47    ...  reports ${num_bmc_gpus} present and functional GPUs.
48    Run Keyword If  '${num_os_gpus}' != '${num_bmc_gpus}'
49    ...  Fail  msg=${failmsg01}
50
51    # Show parameters for HTX stress test.
52    Rprintn
53    Rpvars  HTX_DURATION  HTX_LOOP  HTX_INTERVAL
54
55    # Set the iteration (loop) counter.
56    Set Suite Variable  ${iteration}  ${0}  children=true
57
58
59    # Shutdown HTX if it is already running.
60    ${status}=  Is HTX Running
61    Run Keyword If  '${status}' == 'True'
62    ...  Shutdown HTX Exerciser
63
64    Repeat Keyword  ${HTX_LOOP} times  Execute GPU Test
65
66
67*** Keywords ***
68
69Execute GPU Test
70    [Documentation]  Start HTX exerciser.
71    # Test Flow:
72    #              - Power on
73    #              - Establish SSH connection session
74    #              - Collect GPU nvidia status output
75    #              - Create HTX mdt profile
76    #              - Run GPU specific HTX exerciser
77    #              - Check for errors
78
79    Set Suite Variable  ${iteration}  ${iteration + 1}
80    ${loop_count}=  Catenate  Starting iteration: ${iteration}
81    Rprintn
82    Rpvars  loop_count
83
84    REST Power On  stack_mode=skip
85
86    # Collect data before the test starts.
87    Collect NVIDIA Log File  start
88
89    # Collect NVIDIA maximum limits.
90    ${power_max}=  Get GPU Power Limit
91    ${temperature_max}=  Get GPU Temperature Limit
92    ${clock_max}=  Get GPU Clock Limit
93
94    Run Keyword If  '${HTX_MDT_PROFILE}' == 'mdt.bu'
95    ...  Create Default MDT Profile
96
97    Run MDT Profile
98
99    Loop HTX Health Check
100
101    # Post test loop look out for dmesg error logged.
102    Check For Errors On OS Dmesg Log
103
104    # Check NVIDIA power, temperature, and clocks.
105    ${power}=  Get GPU Max Power
106    ${temperature}=  Get GPU Max Temperature
107    ${temperature_via_rest}=  Get GPU Temperature Via REST
108    ${clock}=  Get GPU Clock
109    Rprintn
110    Rpvars  power  power_max  temperature  temperature_via_rest
111    ...  temperature_max  clock  clock_max
112
113    Run Keyword If  ${power} > ${power_max}  Fail
114    ...  msg=GPU Power ${power} exceeds limit of ${power_max}.
115
116    ${err_msg}=  Catenate  GPU temperature of ${temperature} exceeds limit
117    ...  of ${temperature_max}.
118    Run Keyword If  ${temperature} > ${temperature_max}  Fail  msg=${err_msg}
119
120    Run Keyword If  ${clock} > ${clock_max}  Fail
121    ...  msg=GPU clock of ${clock} exceeds limit of ${clock_max}.
122
123    ${err_msg}=  Catenate  The GPU temperature reported by REST is not within
124    ...  5 degrees of the nvidia_smi reported temperature.
125    ${upper_limit}=  Evaluate  ${temperature_via_rest}+5
126    ${lower_limit}=  Evaluate  ${temperature_via_rest}-5
127    Run Keyword If
128    ...  ${temperature} > ${upper_limit} or ${temperature} < ${lower_limit}
129    ...  Fail  msg=${err_msg}
130
131    Shutdown HTX Exerciser
132
133    Collect NVIDIA Log File  end
134    Error Logs Should Not Exist
135    REST Power Off
136
137    Flush REST Sessions
138
139    Rprint Timen  HTX Test ran for: ${HTX_DURATION}
140
141    ${loop_count}=  Catenate  Ending iteration: ${iteration}
142    Rprintn
143    Rpvars  loop_count
144
145
146Loop HTX Health Check
147    [Documentation]  Run until HTX exerciser fails.
148
149    Repeat Keyword  ${HTX_DURATION}
150    ...  Run Keywords  Check HTX Run Status
151    ...  AND  Sleep  ${HTX_INTERVAL}
152
153
154Test Setup Execution
155    [Documentation]  Do the initial test setup.
156
157    REST Power On  stack_mode=skip
158    Delete All Error Logs
159    Tool Exist  lspci
160    Tool Exist  htxcmdline
161    Tool Exist  nvidia-smi
162
163    # Get number of GPUs reported by the OS.
164    ${cmd}=  Catenate  lspci | grep NVIDIA | wc -l
165    ${num_os_gpus}  ${stderr}  ${rc}=  OS Execute Command  ${cmd}
166    Rprintn
167    Rpvars  num_os_gpus
168
169    # If no GPUs detected, we cannot continue.
170    Run Keyword If  '${num_os_gpus}' == '${0}'  Fail
171    ...  msg=No GPUs detected so cannot run test.
172
173    Set Suite Variable  ${num_os_gpus}  children=true
174
175
176
177Test Teardown Execution
178    [Documentation]  Do the post test teardown.
179
180    # Keep HTX running if user set HTX_KEEP_RUNNING to 1.
181    Run Keyword If  '${TEST_STATUS}' == 'FAIL' and ${HTX_KEEP_RUNNING} == ${0}
182    ...  Shutdown HTX Exerciser
183
184    ${keyword_buf}=  Catenate  Stop SOL Console Logging
185    ...  \ targ_file_path=${EXECDIR}${/}logs${/}SOL.log
186    Run Key  ${keyword_buf}
187
188    FFDC On Test Case Fail
189    Close All Connections
190