xref: /openbmc/openbmc-test-automation/systest/gpu_stress_test.robot (revision c108e429de3727bd196385b88abe78f8c9794b0e)
*** Settings ***
Documentation    Stress the system GPUs using the HTX exerciser.

# Test Parameters:
# OPENBMC_HOST        The BMC host name or IP address.
# OS_HOST             The OS host name or IP address.
# OS_USERNAME         The OS login userid (usually "root").
# OS_PASSWORD         The password for the OS login.
# HTX_DURATION        Duration of HTX run, for example, "2h", or "30m".
# HTX_LOOP            The number of times to loop HTX.
# HTX_INTERVAL        The time delay between consecutive checks of HTX
#                     status, for example, "15m".
#                     In summary: Run HTX for $HTX_DURATION, looping
#                     $HTX_LOOP times checking for errors every
#                     $HTX_INTERVAL.  Then allow extra time for OS
#                     Boot, HTX startup, shutdown.
# HTX_KEEP_RUNNING    If set to 1, this indicates that the HTX is to
#                     continue running after an error was found.

Resource         ../syslib/utils_os.robot

# SOL console logging is started once for the suite and stopped in
# "Test Teardown Execution".
Suite Setup      Run Keyword  Start SOL Console Logging
Test Setup       Test Setup Execution
Test Teardown    Test Teardown Execution

*** Variables ***

# Default values for the HTX test parameters described in the
# "Test Parameters" comment of the Settings section.
${HTX_DURATION}      1h
${HTX_LOOP}          ${1}
${HTX_INTERVAL}      30m
${HTX_KEEP_RUNNING}  ${0}
# NOTE(review): the keywords in this file pass stack_mode=skip literally to
# "REST Power On"; presumably this variable is read by the boot code in an
# imported resource - confirm.
${stack_mode}        skip

*** Test Cases ***

GPU Stress Test
    [Documentation]  Stress the GPU using HTX exerciser.
    [Tags]  GPU_Stress_Test

    # Get number of GPUs reported by the BMC.
    ${num_bmc_gpus}=  Count GPUs From BMC
    Rpvars  num_bmc_gpus

    # The BMC and OS should report the same number of GPUs.
    # ${num_os_gpus} is the suite variable set by "Test Setup Execution".
    ${failmsg01}=  Catenate  OS reports ${num_os_gpus} GPUs, but BMC
    ...  reports ${num_bmc_gpus} present and functional GPUs.
    Run Keyword If  '${num_os_gpus}' != '${num_bmc_gpus}'
    ...  Fail  msg=${failmsg01}

    # Show parameters for HTX stress test.
    Printn
    Rpvars  HTX_DURATION  HTX_LOOP  HTX_INTERVAL

    # Set the iteration (loop) counter.  "Execute GPU Test" increments it
    # at the start of every pass.
    Set Suite Variable  ${iteration}  ${0}  children=true

    # Shutdown HTX if it is already running.
    ${status}=  Is HTX Running
    Run Keyword If  '${status}' == 'True'
    ...  Shutdown HTX Exerciser

    # Run the complete stress cycle HTX_LOOP times.
    Repeat Keyword  ${HTX_LOOP} times  Execute GPU Test

*** Keywords ***

Execute GPU Test
    [Documentation]  Run one HTX GPU stress iteration and check the results.
    # Test Flow:
    #              - Power on
    #              - Collect NVIDIA log file and GPU maximum limits
    #              - Create the default HTX mdt profile (only for mdt.bu)
    #              - Run the HTX mdt profile
    #              - Loop checking HTX status
    #              - Check for errors in the OS dmesg log
    #              - Compare GPU power, temperature and clock to the limits
    #              - Shut down HTX, check error logs, power off

    # Count this pass in the suite-level iteration counter.
    Set Suite Variable  ${iteration}  ${iteration + 1}
    ${loop_count}=  Catenate  Starting iteration: ${iteration}
    Printn
    Rpvars  loop_count

    REST Power On  stack_mode=skip
    Run Key U  Sleep \ 15s

    # Collect data before the test starts.
    Collect NVIDIA Log File  start

    # Collect NVIDIA maximum limits.
    ${power_max}=  Get GPU Power Limit
    ${temperature_max}=  Get GPU Temperature Limit
    ${clock_max}=  Get GPU Clock Limit

    # ${HTX_MDT_PROFILE} is not defined in this file.
    Run Keyword If  '${HTX_MDT_PROFILE}' == 'mdt.bu'
    ...  Create Default MDT Profile

    Run MDT Profile

    Loop HTX Health Check

    # Post test loop look out for dmesg error logged.
    Check For Errors On OS Dmesg Log

    # Check NVIDIA power, temperature, and clocks.
    ${power}=  Get GPU Max Power
    ${temperature}=  Get GPU Max Temperature
    ${temperature_via_rest}=  Get GPU Temperature Via REST
    ${clock}=  Get GPU Clock
    Printn
    Rpvars  power  power_max  temperature  temperature_via_rest
    ...  temperature_max  clock  clock_max

    Run Keyword If  ${power} > ${power_max}  Fail
    ...  msg=GPU Power ${power} exceeds limit of ${power_max}.

    ${err_msg}=  Catenate  GPU temperature of ${temperature} exceeds limit
    ...  of ${temperature_max}.
    Run Keyword If  ${temperature} > ${temperature_max}  Fail  msg=${err_msg}

    Run Keyword If  ${clock} > ${clock_max}  Fail
    ...  msg=GPU clock of ${clock} exceeds limit of ${clock_max}.

    # The REST-reported temperature must be within +/- 5 of the
    # nvidia_smi-reported temperature.
    ${err_msg}=  Catenate  The GPU temperature reported by REST is not within
    ...  5 degrees of the nvidia_smi reported temperature.
    ${upper_limit}=  Evaluate  ${temperature_via_rest}+5
    ${lower_limit}=  Evaluate  ${temperature_via_rest}-5
    Run Keyword If
    ...  ${temperature} > ${upper_limit} or ${temperature} < ${lower_limit}
    ...  Fail  msg=${err_msg}

    Shutdown HTX Exerciser

    # Collect data after the test ends.
    Collect NVIDIA Log File  end
    Error Logs Should Not Exist
    REST Power Off

    Flush REST Sessions

    Print Timen  HTX Test ran for: ${HTX_DURATION}

    ${loop_count}=  Catenate  Ending iteration: ${iteration}
    Printn
    Rpvars  loop_count

Loop HTX Health Check
    [Documentation]  Check HTX run status repeatedly for the HTX duration.

    # Repeat for ${HTX_DURATION}; each pass checks the HTX status and then
    # sleeps for ${HTX_INTERVAL}.
    Repeat Keyword  ${HTX_DURATION}
    ...  Run Keywords  Check HTX Run Status
    ...  AND  Sleep  ${HTX_INTERVAL}

Test Setup Execution
    [Documentation]  Do the initial test setup.

    REST Power On  stack_mode=skip
    Run Key U  Sleep \ 15s
    Delete All Error Logs

    # Check for the tools this test relies on.
    Tool Exist  lspci
    Tool Exist  htxcmdline
    Tool Exist  nvidia-smi

    # Get number of GPUs reported by the OS.
    ${cmd}=  Catenate  lspci | grep NVIDIA | wc -l
    ${num_os_gpus}  ${stderr}  ${rc}=  OS Execute Command  ${cmd}
    Printn
    Rpvars  num_os_gpus

    # If no GPUs detected, we cannot continue.
    Run Keyword If  '${num_os_gpus}' == '${0}'  Fail
    ...  msg=No GPUs detected so cannot run test.

    # Publish the count as a suite variable; the "GPU Stress Test" test
    # case compares it with the BMC count.
    Set Suite Variable  ${num_os_gpus}  children=true

Test Teardown Execution
    [Documentation]  Do the post test teardown.

    # On failure, shut HTX down unless the user set HTX_KEEP_RUNNING to a
    # non-zero value (1 keeps HTX running).
    Run Keyword If  '${TEST_STATUS}' == 'FAIL' and ${HTX_KEEP_RUNNING} == ${0}
    ...  Shutdown HTX Exerciser

    # Stop the SOL console logging started by the Suite Setup, passing
    # logs/SOL.log under ${EXECDIR} as targ_file_path.
    ${keyword_buf}=  Catenate  Stop SOL Console Logging
    ...  \ targ_file_path=${EXECDIR}${/}logs${/}SOL.log
    Run Key  ${keyword_buf}

    FFDC On Test Case Fail
    Close All Connections
192