*** Settings ***
Documentation    Stress the system GPUs using the HTX exerciser.

# Test Parameters:
# OPENBMC_HOST        The BMC host name or IP address.
# OS_HOST             The OS host name or IP address.
# OS_USERNAME         The OS login userid (usually "root").
# OS_PASSWORD         The password for the OS login.
# HTX_DURATION        Duration of HTX run, for example, "2h" or "30m".
# HTX_LOOP            The number of times to loop HTX.
# HTX_INTERVAL        The time delay between consecutive checks of HTX
#                     status, for example, "15m".
#                     In summary: Run HTX for $HTX_DURATION, looping
#                     $HTX_LOOP times, checking for errors every
#                     $HTX_INTERVAL.  Then allow extra time for OS
#                     boot, HTX startup, and shutdown.
# HTX_KEEP_RUNNING    If set to 1, HTX is left running after a test
#                     failure; otherwise it is shut down in teardown.
# HTX_MDT_PROFILE     The MDT profile name.  Not defined in this file
#                     (presumably supplied by the resource file or on
#                     the command line - TODO confirm).  When it is
#                     "mdt.bu", a default MDT profile is created first.


Resource         ../syslib/utils_os.robot

Suite Setup      Start SOL Console Logging
Test Setup       Test Setup Execution
Test Teardown    Test Teardown Execution
26
*** Variables ***

# Default test parameters; values given on the command line take precedence.
${HTX_DURATION}      1h
${HTX_LOOP}          ${1}
${HTX_INTERVAL}      30m
${HTX_KEEP_RUNNING}  ${0}
# NOTE(review): not referenced directly in this file (the power-on calls pass
# stack_mode=skip explicitly); presumably read by imported keywords - confirm.
${stack_mode}        skip
34
*** Test Cases ***

GPU Stress Test
    [Documentation]  Stress the GPU using HTX exerciser.
    [Tags]  GPU_Stress_Test

    # Get number of GPUs reported by the BMC.
    ${num_bmc_gpus}=  Count GPUs From BMC
    Rpvars  num_bmc_gpus

    # The BMC and OS should report the same number of GPUs.
    # ${num_os_gpus} is a suite variable set by "Test Setup Execution".
    ${failmsg01}=  Catenate  OS reports ${num_os_gpus} GPUs, but BMC
    ...  reports ${num_bmc_gpus} present and functional GPUs.
    Run Keyword If  '${num_os_gpus}' != '${num_bmc_gpus}'
    ...  Fail  msg=${failmsg01}

    # Show parameters for HTX stress test.
    Rprintn
    Rpvars  HTX_DURATION  HTX_LOOP  HTX_INTERVAL

    # Set the iteration (loop) counter; "Execute GPU Test" increments it.
    Set Suite Variable  ${iteration}  ${0}  children=true

    # Shutdown HTX if it is already running.
    ${status}=  Is HTX Running
    Run Keyword If  '${status}' == 'True'
    ...  Shutdown HTX Exerciser

    # Run the complete stress iteration ${HTX_LOOP} times.
    Repeat Keyword  ${HTX_LOOP} times  Execute GPU Test
65
66
*** Keywords ***

Execute GPU Test
    [Documentation]  Run one GPU stress iteration: power on, run the HTX
    ...  exerciser, verify GPU power, temperature and clock against their
    ...  limits, then shut HTX down and power off.
    # Test Flow:
    #              - Power on
    #              - Collect GPU nvidia status output and GPU limits
    #              - Create HTX mdt profile (only for "mdt.bu")
    #              - Run GPU specific HTX exerciser
    #              - Check HTX status periodically and check dmesg
    #              - Compare GPU power, temperature and clock to limits
    #              - Shutdown HTX, check for error logs, power off

    # Advance and report the suite-level iteration counter.
    Set Suite Variable  ${iteration}  ${iteration + 1}
    ${loop_count}=  Catenate  Starting iteration: ${iteration}
    Rprintn
    Rpvars  loop_count

    REST Power On  stack_mode=skip
    Run Key U  Sleep \ 15s

    # Collect data before the test starts.
    Collect NVIDIA Log File  start

    # Collect NVIDIA maximum limits.
    ${power_max}=  Get GPU Power Limit
    ${temperature_max}=  Get GPU Temperature Limit
    ${clock_max}=  Get GPU Clock Limit

    # ${HTX_MDT_PROFILE} is not defined in this file.
    Run Keyword If  '${HTX_MDT_PROFILE}' == 'mdt.bu'
    ...  Create Default MDT Profile

    Run MDT Profile

    Loop HTX Health Check

    # Post test loop look out for dmesg error logged.
    Check For Errors On OS Dmesg Log

    # Check NVIDIA power, temperature, and clocks.
    ${power}=  Get GPU Max Power
    ${temperature}=  Get GPU Max Temperature
    ${temperature_via_rest}=  Get GPU Temperature Via REST
    ${clock}=  Get GPU Clock
    Rprintn
    Rpvars  power  power_max  temperature  temperature_via_rest
    ...  temperature_max  clock  clock_max

    Run Keyword If  ${power} > ${power_max}  Fail
    ...  msg=GPU Power ${power} exceeds limit of ${power_max}.

    ${err_msg}=  Catenate  GPU temperature of ${temperature} exceeds limit
    ...  of ${temperature_max}.
    Run Keyword If  ${temperature} > ${temperature_max}  Fail  msg=${err_msg}

    Run Keyword If  ${clock} > ${clock_max}  Fail
    ...  msg=GPU clock of ${clock} exceeds limit of ${clock_max}.

    # The nvidia_smi temperature must lie within +/-5 of the REST value.
    ${err_msg}=  Catenate  The GPU temperature reported by REST is not within
    ...  5 degrees of the nvidia_smi reported temperature.
    ${upper_limit}=  Evaluate  ${temperature_via_rest}+5
    ${lower_limit}=  Evaluate  ${temperature_via_rest}-5
    Run Keyword If
    ...  ${temperature} > ${upper_limit} or ${temperature} < ${lower_limit}
    ...  Fail  msg=${err_msg}

    Shutdown HTX Exerciser

    # Collect data after the test ends.
    Collect NVIDIA Log File  end
    Error Logs Should Not Exist
    REST Power Off

    Flush REST Sessions

    Rprint Timen  HTX Test ran for: ${HTX_DURATION}

    ${loop_count}=  Catenate  Ending iteration: ${iteration}
    Rprintn
    Rpvars  loop_count
145
146
Loop HTX Health Check
    [Documentation]  Check the HTX run status repeatedly for HTX_DURATION,
    ...  sleeping HTX_INTERVAL after each check.

    # With a time value, Repeat Keyword re-runs the keyword until the
    # duration has elapsed (at least once).
    Repeat Keyword  ${HTX_DURATION}
    ...  Run Keywords  Check HTX Run Status
    ...  AND  Sleep  ${HTX_INTERVAL}
153
154
Test Setup Execution
    [Documentation]  Do the initial test setup: power on, clear error logs,
    ...  verify the required OS tools exist, and record the OS GPU count.

    REST Power On  stack_mode=skip
    Run Key U  Sleep \ 15s
    Delete All Error Logs

    # Tools this suite depends on.
    Tool Exist  lspci
    Tool Exist  htxcmdline
    Tool Exist  nvidia-smi

    # Get number of GPUs reported by the OS.
    ${cmd}=  Catenate  lspci | grep NVIDIA | wc -l
    ${num_os_gpus}  ${stderr}  ${rc}=  OS Execute Command  ${cmd}
    Rprintn
    Rpvars  num_os_gpus

    # If no GPUs detected, we cannot continue.
    Run Keyword If  '${num_os_gpus}' == '${0}'  Fail
    ...  msg=No GPUs detected so cannot run test.

    # Publish the count as a suite variable for the test case to compare
    # against the BMC count.
    Set Suite Variable  ${num_os_gpus}  children=true
176
177
178
Test Teardown Execution
    [Documentation]  Do the post test teardown.

    # On test failure, shut HTX down unless the user set HTX_KEEP_RUNNING
    # to a non-zero value (e.g. 1), in which case HTX is left running.
    Run Keyword If  '${TEST_STATUS}' == 'FAIL' and ${HTX_KEEP_RUNNING} == ${0}
    ...  Shutdown HTX Exerciser

    # Stop SOL console logging, passing logs/SOL.log under the execution
    # directory as the target file path.
    ${keyword_buf}=  Catenate  Stop SOL Console Logging
    ...  \ targ_file_path=${EXECDIR}${/}logs${/}SOL.log
    Run Key  ${keyword_buf}

    FFDC On Test Case Fail
    Close All Connections
192