xref: /openbmc/openbmc-test-automation/systest/gpu_stress_test.robot (revision 6fb70d98f2f1cb9273ba912deaa2cebe3c23ea86)
*** Settings ***
Documentation    Stress the system GPUs using the HTX exerciser.

# Test Parameters:
# OPENBMC_HOST        The BMC host name or IP address.
# OS_HOST             The OS host name or IP Address.
# OS_USERNAME         The OS login userid (usually "root").
# OS_PASSWORD         The password for the OS login.
# HTX_DURATION        Duration of HTX run, for example, "2h", or "30m".
# HTX_LOOP            The number of times to loop HTX.
# HTX_INTERVAL        The time delay between consecutive checks of HTX
#                     status, for example, "15m".
#                     In summary: Run HTX for $HTX_DURATION, looping
#                     $HTX_LOOP times checking for errors every
#                     $HTX_INTERVAL.  Then allow extra time for OS
#                     Boot, HTX startup, shutdown.
# HTX_KEEP_RUNNING    If set to 1, this indicates that the HTX is to
#                     continue running after an error was found.
# HTX_MDT_PROFILE     Not set in this file (presumably supplied by the
#                     resource file or the command line - confirm).  When
#                     it equals "mdt.bu", a default MDT profile is
#                     created before the MDT profile is run.


Resource         ../lib/os_utilities.robot

Suite Setup      Run Keyword  Start SOL Console Logging
Test Setup       Test Setup Execution
Test Teardown    Test Teardown Execution

Test Tags        GPU_Stress

*** Variables ***

# Default values for the test parameters described in the Settings header.
${HTX_DURATION}      1h
${HTX_LOOP}          ${1}
${HTX_INTERVAL}      30m
${HTX_KEEP_RUNNING}  ${0}
${stack_mode}        skip

*** Test Cases ***

GPU Stress Test
    [Documentation]  Stress the GPU using HTX exerciser.
    [Tags]  GPU_Stress_Test

    # Get number of GPU reported by the BMC.
    ${num_bmc_gpus}=  Count GPUs From BMC
    Rpvars  num_bmc_gpus

    # The BMC and OS should report the same number of GPUs.
    # ${num_os_gpus} is set as a suite variable by Test Setup Execution.
    ${failmsg01}=  Catenate  OS reports ${num_os_gpus} GPUs, but BMC
    ...  reports ${num_bmc_gpus} present and functional GPUs.
    IF  '${num_os_gpus}' != '${num_bmc_gpus}'
        Fail  msg=${failmsg01}
    END

    # Show parameters for HTX stress test.
    Printn
    Rpvars  HTX_DURATION  HTX_LOOP  HTX_INTERVAL

    # Set the iteration (loop) counter.  Execute GPU Test increments it
    # at the start of every pass.
    Set Suite Variable  ${iteration}  ${0}  children=true


    # Shutdown HTX if it is already running.
    ${status}=  Is HTX Running
    IF  '${status}' == 'True'
        Shutdown HTX Exerciser
    END

    Repeat Keyword  ${HTX_LOOP} times  Execute GPU Test


*** Keywords ***

Execute GPU Test
    [Documentation]  Run one iteration of the GPU stress test: power on,
    ...  run the HTX exerciser, check GPU limits and error logs, power off.
    # Test Flow:
    #              - Power on
    #              - Establish SSH connection session
    #              - Collect GPU nvidia status output
    #              - Create HTX mdt profile
    #              - Run GPU specific HTX exerciser
    #              - Check for errors
    #              - Compare GPU power, temperature and clock to limits
    #              - Shut down HTX and power off

    # Advance the suite-level iteration counter initialized by the test case.
    Set Suite Variable  ${iteration}  ${iteration + 1}
    ${loop_count}=  Catenate  Starting iteration: ${iteration}
    Printn
    Rpvars  loop_count

    REST Power On  stack_mode=skip
    Run Key U  Sleep \ 15s

    # Collect data before the test starts.
    Collect NVIDIA Log File  start

    # Collect NVIDIA maximum limits.
    ${power_max}=  Get GPU Power Limit
    ${temperature_max}=  Get GPU Temperature Limit
    ${clock_max}=  Get GPU Clock Limit

    # Only the "mdt.bu" profile triggers creation of a default MDT profile.
    IF  '${HTX_MDT_PROFILE}' == 'mdt.bu'
        Create Default MDT Profile
    END

    Run MDT Profile

    Loop HTX Health Check

    # Post test loop look out for dmesg error logged.
    Check For Errors On OS Dmesg Log

    # Check NVIDIA power, temperature, and clocks.
    ${power}=  Get GPU Max Power
    ${temperature}=  Get GPU Max Temperature
    ${temperature_via_rest}=  Get GPU Temperature Via REST
    ${clock}=  Get GPU Clock
    Printn
    Rpvars  power  power_max  temperature  temperature_via_rest
    ...  temperature_max  clock  clock_max

    IF  ${power} > ${power_max}
        Fail  msg=GPU Power ${power} exceeds limit of ${power_max}.
    END

    ${err_msg}=  Catenate  GPU temperature of ${temperature} exceeds limit
    ...  of ${temperature_max}.
    IF  ${temperature} > ${temperature_max}
        Fail  msg=${err_msg}
    END

    IF  ${clock} > ${clock_max}
        Fail  msg=GPU clock of ${clock} exceeds limit of ${clock_max}.
    END

    # The nvidia_smi temperature must be within +/-5 of the REST reading.
    ${err_msg}=  Catenate  The GPU temperature reported by REST is not within
    ...  5 degrees of the nvidia_smi reported temperature.
    ${upper_limit}=  Evaluate  ${temperature_via_rest}+5
    ${lower_limit}=  Evaluate  ${temperature_via_rest}-5
    IF  ${temperature} > ${upper_limit} or ${temperature} < ${lower_limit}
        Fail  msg=${err_msg}
    END

    Shutdown HTX Exerciser

    Collect NVIDIA Log File  end
    Error Logs Should Not Exist
    REST Power Off

    Flush REST Sessions

    Print Timen  HTX Test ran for: ${HTX_DURATION}

    ${loop_count}=  Catenate  Ending iteration: ${iteration}
    Printn
    Rpvars  loop_count


Loop HTX Health Check
    [Documentation]  Check the HTX run status, then sleep for HTX_INTERVAL,
    ...  repeating for HTX_DURATION.

    # Repeat Keyword is given a duration here (not a count), so the
    # check-then-sleep pair is repeated until HTX_DURATION has elapsed.
    Repeat Keyword  ${HTX_DURATION}
    ...  Run Keywords  Check HTX Run Status
    ...  AND  Sleep  ${HTX_INTERVAL}


Test Setup Execution
    [Documentation]  Do the initial test setup.

    REST Power On  stack_mode=skip
    Run Key U  Sleep \ 15s
    Delete All Error Logs

    # The test depends on these tools.
    Tool Exist  lspci
    Tool Exist  htxcmdline
    Tool Exist  nvidia-smi

    # Get number of GPUs reported by the OS.
    ${cmd}=  Catenate  lspci | grep NVIDIA | wc -l
    ${num_os_gpus}  ${stderr}  ${rc}=  OS Execute Command  ${cmd}
    Printn
    Rpvars  num_os_gpus

    # If no GPUs detected, we cannot continue.
    IF  '${num_os_gpus}' == '${0}'
        Fail  msg=No GPUs detected so cannot run test.
    END

    # Publish the count so the test case can compare it to the BMC's count.
    Set Suite Variable  ${num_os_gpus}  children=true



Test Teardown Execution
    [Documentation]  Do the post test teardown.

    # On test failure, shut down HTX unless the user asked to keep it
    # running by setting HTX_KEEP_RUNNING to 1.
    IF  '${TEST_STATUS}' == 'FAIL' and ${HTX_KEEP_RUNNING} == ${0}
        Shutdown HTX Exerciser
    END

    # Stop SOL logging and direct its output to logs/SOL.log under EXECDIR.
    ${keyword_buf}=  Catenate  Stop SOL Console Logging
    ...  \ targ_file_path=${EXECDIR}${/}logs${/}SOL.log
    Run Key  ${keyword_buf}

    FFDC On Test Case Fail
    Close All Connections