1d5a114a6SFelix Kuehling /* 2d5a114a6SFelix Kuehling * Copyright 2014-2018 Advanced Micro Devices, Inc. 3d5a114a6SFelix Kuehling * 4d5a114a6SFelix Kuehling * Permission is hereby granted, free of charge, to any person obtaining a 5d5a114a6SFelix Kuehling * copy of this software and associated documentation files (the "Software"), 6d5a114a6SFelix Kuehling * to deal in the Software without restriction, including without limitation 7d5a114a6SFelix Kuehling * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8d5a114a6SFelix Kuehling * and/or sell copies of the Software, and to permit persons to whom the 9d5a114a6SFelix Kuehling * Software is furnished to do so, subject to the following conditions: 10d5a114a6SFelix Kuehling * 11d5a114a6SFelix Kuehling * The above copyright notice and this permission notice shall be included in 12d5a114a6SFelix Kuehling * all copies or substantial portions of the Software. 13d5a114a6SFelix Kuehling * 14d5a114a6SFelix Kuehling * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15d5a114a6SFelix Kuehling * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16d5a114a6SFelix Kuehling * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17d5a114a6SFelix Kuehling * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18d5a114a6SFelix Kuehling * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19d5a114a6SFelix Kuehling * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20d5a114a6SFelix Kuehling * OTHER DEALINGS IN THE SOFTWARE. 21d5a114a6SFelix Kuehling */ 22d5a114a6SFelix Kuehling #include "amdgpu.h" 23d5a114a6SFelix Kuehling #include "amdgpu_amdkfd.h" 24d5a114a6SFelix Kuehling #include "gc/gc_9_0_offset.h" 25d5a114a6SFelix Kuehling #include "gc/gc_9_0_sh_mask.h" 26d5a114a6SFelix Kuehling #include "vega10_enum.h" 27d5a114a6SFelix Kuehling #include "sdma0/sdma0_4_0_offset.h" 28d5a114a6SFelix Kuehling #include "sdma0/sdma0_4_0_sh_mask.h" 29d5a114a6SFelix Kuehling #include "sdma1/sdma1_4_0_offset.h" 30d5a114a6SFelix Kuehling #include "sdma1/sdma1_4_0_sh_mask.h" 31d5a114a6SFelix Kuehling #include "athub/athub_1_0_offset.h" 32d5a114a6SFelix Kuehling #include "athub/athub_1_0_sh_mask.h" 33d5a114a6SFelix Kuehling #include "oss/osssys_4_0_offset.h" 34d5a114a6SFelix Kuehling #include "oss/osssys_4_0_sh_mask.h" 35d5a114a6SFelix Kuehling #include "soc15_common.h" 36d5a114a6SFelix Kuehling #include "v9_structs.h" 37d5a114a6SFelix Kuehling #include "soc15.h" 38d5a114a6SFelix Kuehling #include "soc15d.h" 3943a4bc82SRamesh Errabolu #include "gfx_v9_0.h" 403ac2bc76SRamesh Errabolu #include "amdgpu_amdkfd_gfx_v9.h" 41d5a114a6SFelix Kuehling 42d5a114a6SFelix Kuehling enum hqd_dequeue_request_type { 43d5a114a6SFelix Kuehling NO_ACTION = 0, 44d5a114a6SFelix Kuehling DRAIN_PIPE, 45d5a114a6SFelix Kuehling RESET_WAVES 46d5a114a6SFelix Kuehling }; 47d5a114a6SFelix Kuehling 48d5a114a6SFelix Kuehling static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) 49d5a114a6SFelix Kuehling { 50d5a114a6SFelix Kuehling return (struct amdgpu_device *)kgd; 51d5a114a6SFelix Kuehling } 52d5a114a6SFelix Kuehling 53d5a114a6SFelix Kuehling static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe, 54d5a114a6SFelix Kuehling uint32_t queue, uint32_t vmid) 55d5a114a6SFelix Kuehling { 56d5a114a6SFelix Kuehling struct amdgpu_device *adev = get_amdgpu_device(kgd); 57d5a114a6SFelix Kuehling 58d5a114a6SFelix Kuehling mutex_lock(&adev->srbm_mutex); 59d5a114a6SFelix Kuehling soc15_grbm_select(adev, mec, pipe, queue, vmid); 60d5a114a6SFelix Kuehling } 61d5a114a6SFelix Kuehling 62d5a114a6SFelix Kuehling static void unlock_srbm(struct kgd_dev *kgd) 63d5a114a6SFelix Kuehling { 64d5a114a6SFelix Kuehling struct amdgpu_device *adev = get_amdgpu_device(kgd); 65d5a114a6SFelix Kuehling 66d5a114a6SFelix Kuehling soc15_grbm_select(adev, 0, 0, 0, 0); 67d5a114a6SFelix Kuehling mutex_unlock(&adev->srbm_mutex); 68d5a114a6SFelix Kuehling } 69d5a114a6SFelix Kuehling 70d5a114a6SFelix Kuehling static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, 71d5a114a6SFelix Kuehling uint32_t queue_id) 72d5a114a6SFelix Kuehling { 73d5a114a6SFelix Kuehling struct amdgpu_device *adev = get_amdgpu_device(kgd); 74d5a114a6SFelix Kuehling 75d5a114a6SFelix Kuehling uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; 76d5a114a6SFelix Kuehling uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); 77d5a114a6SFelix Kuehling 78d5a114a6SFelix Kuehling lock_srbm(kgd, mec, pipe, queue_id, 0); 79d5a114a6SFelix Kuehling } 80d5a114a6SFelix Kuehling 8135cd89d5SAaron Liu static uint64_t get_queue_mask(struct amdgpu_device *adev, 82d5a114a6SFelix Kuehling uint32_t pipe_id, uint32_t queue_id) 83d5a114a6SFelix Kuehling { 8435cd89d5SAaron Liu unsigned int bit = pipe_id * adev->gfx.mec.num_queue_per_pipe + 8535cd89d5SAaron Liu queue_id; 86d5a114a6SFelix Kuehling 8735cd89d5SAaron Liu return 1ull << bit; 88d5a114a6SFelix Kuehling } 89d5a114a6SFelix Kuehling 90d5a114a6SFelix Kuehling static void release_queue(struct kgd_dev *kgd) 91d5a114a6SFelix Kuehling { 92d5a114a6SFelix Kuehling unlock_srbm(kgd); 93d5a114a6SFelix Kuehling } 94d5a114a6SFelix Kuehling 953e205a08SOak Zeng void kgd_gfx_v9_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, 96d5a114a6SFelix Kuehling uint32_t sh_mem_config, 97d5a114a6SFelix Kuehling uint32_t sh_mem_ape1_base, 98d5a114a6SFelix Kuehling uint32_t sh_mem_ape1_limit, 99d5a114a6SFelix Kuehling uint32_t sh_mem_bases) 100d5a114a6SFelix Kuehling { 101d5a114a6SFelix Kuehling struct amdgpu_device *adev = get_amdgpu_device(kgd); 102d5a114a6SFelix Kuehling 103d5a114a6SFelix Kuehling lock_srbm(kgd, 0, 0, 0, vmid); 104d5a114a6SFelix Kuehling 1051bff7f6cSTrigger Huang WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config); 1061bff7f6cSTrigger Huang WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases); 107d5a114a6SFelix Kuehling /* APE1 no longer exists on GFX9 */ 108d5a114a6SFelix Kuehling 109d5a114a6SFelix Kuehling unlock_srbm(kgd); 110d5a114a6SFelix Kuehling } 111d5a114a6SFelix Kuehling 112c7b6bac9SFenghua Yu int kgd_gfx_v9_set_pasid_vmid_mapping(struct kgd_dev *kgd, u32 pasid, 113d5a114a6SFelix Kuehling unsigned int vmid) 114d5a114a6SFelix Kuehling { 115d5a114a6SFelix Kuehling struct amdgpu_device *adev = get_amdgpu_device(kgd); 116d5a114a6SFelix Kuehling 117d5a114a6SFelix Kuehling /* 118d5a114a6SFelix Kuehling * We have to assume that there is no outstanding mapping. 119d5a114a6SFelix Kuehling * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because 120d5a114a6SFelix Kuehling * a mapping is in progress or because a mapping finished 121d5a114a6SFelix Kuehling * and the SW cleared it. 122d5a114a6SFelix Kuehling * So the protocol is to always wait & clear. 123d5a114a6SFelix Kuehling */ 124d5a114a6SFelix Kuehling uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | 125d5a114a6SFelix Kuehling ATC_VMID0_PASID_MAPPING__VALID_MASK; 126d5a114a6SFelix Kuehling 127d5a114a6SFelix Kuehling /* 128d5a114a6SFelix Kuehling * need to do this twice, once for gfx and once for mmhub 129d5a114a6SFelix Kuehling * for ATC add 16 to VMID for mmhub, for IH different registers. 130d5a114a6SFelix Kuehling * ATC_VMID0..15 registers are separate from ATC_VMID16..31. 131d5a114a6SFelix Kuehling */ 132d5a114a6SFelix Kuehling 133d5a114a6SFelix Kuehling WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid, 134d5a114a6SFelix Kuehling pasid_mapping); 135d5a114a6SFelix Kuehling 136d5a114a6SFelix Kuehling while (!(RREG32(SOC15_REG_OFFSET( 137d5a114a6SFelix Kuehling ATHUB, 0, 138d5a114a6SFelix Kuehling mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & 139d5a114a6SFelix Kuehling (1U << vmid))) 140d5a114a6SFelix Kuehling cpu_relax(); 141d5a114a6SFelix Kuehling 142d5a114a6SFelix Kuehling WREG32(SOC15_REG_OFFSET(ATHUB, 0, 143d5a114a6SFelix Kuehling mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), 144d5a114a6SFelix Kuehling 1U << vmid); 145d5a114a6SFelix Kuehling 146d5a114a6SFelix Kuehling /* Mapping vmid to pasid also for IH block */ 147d5a114a6SFelix Kuehling WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid, 148d5a114a6SFelix Kuehling pasid_mapping); 149d5a114a6SFelix Kuehling 150d5a114a6SFelix Kuehling WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid, 151d5a114a6SFelix Kuehling pasid_mapping); 152d5a114a6SFelix Kuehling 153d5a114a6SFelix Kuehling while (!(RREG32(SOC15_REG_OFFSET( 154d5a114a6SFelix Kuehling ATHUB, 0, 155d5a114a6SFelix Kuehling mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & 156d5a114a6SFelix Kuehling (1U << (vmid + 16)))) 157d5a114a6SFelix Kuehling cpu_relax(); 158d5a114a6SFelix Kuehling 159d5a114a6SFelix Kuehling WREG32(SOC15_REG_OFFSET(ATHUB, 0, 160d5a114a6SFelix Kuehling mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), 161d5a114a6SFelix Kuehling 1U << (vmid + 16)); 162d5a114a6SFelix Kuehling 163d5a114a6SFelix Kuehling /* Mapping vmid to pasid also for IH block */ 164d5a114a6SFelix Kuehling WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid, 165d5a114a6SFelix Kuehling pasid_mapping); 166d5a114a6SFelix Kuehling return 0; 167d5a114a6SFelix Kuehling } 168d5a114a6SFelix Kuehling 169d5a114a6SFelix Kuehling /* TODO - RING0 form of field is obsolete, seems to date back to SI 170d5a114a6SFelix Kuehling * but still works 171d5a114a6SFelix Kuehling */ 172d5a114a6SFelix Kuehling 1733e205a08SOak Zeng int kgd_gfx_v9_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) 174d5a114a6SFelix Kuehling { 175d5a114a6SFelix Kuehling struct amdgpu_device *adev = get_amdgpu_device(kgd); 176d5a114a6SFelix Kuehling uint32_t mec; 177d5a114a6SFelix Kuehling uint32_t pipe; 178d5a114a6SFelix Kuehling 179d5a114a6SFelix Kuehling mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; 180d5a114a6SFelix Kuehling pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); 181d5a114a6SFelix Kuehling 182d5a114a6SFelix Kuehling lock_srbm(kgd, mec, pipe, 0, 0); 183d5a114a6SFelix Kuehling 184d5a114a6SFelix Kuehling WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL), 185d5a114a6SFelix Kuehling CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | 186d5a114a6SFelix Kuehling CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); 187d5a114a6SFelix Kuehling 188d5a114a6SFelix Kuehling unlock_srbm(kgd); 189d5a114a6SFelix Kuehling 190d5a114a6SFelix Kuehling return 0; 191d5a114a6SFelix Kuehling } 192d5a114a6SFelix Kuehling 193b55a8b8bSYong Zhao static uint32_t get_sdma_rlc_reg_offset(struct amdgpu_device *adev, 194d5a114a6SFelix Kuehling unsigned int engine_id, 195d5a114a6SFelix Kuehling unsigned int queue_id) 196d5a114a6SFelix Kuehling { 19734174b89SHuang Rui uint32_t sdma_engine_reg_base = 0; 19834174b89SHuang Rui uint32_t sdma_rlc_reg_offset; 19934174b89SHuang Rui 20034174b89SHuang Rui switch (engine_id) { 20134174b89SHuang Rui default: 20234174b89SHuang Rui dev_warn(adev->dev, 20334174b89SHuang Rui "Invalid sdma engine id (%d), using engine id 0\n", 20434174b89SHuang Rui engine_id); 20534174b89SHuang Rui fallthrough; 20634174b89SHuang Rui case 0: 20734174b89SHuang Rui sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA0, 0, 20834174b89SHuang Rui mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL; 20934174b89SHuang Rui break; 21034174b89SHuang Rui case 1: 21134174b89SHuang Rui sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA1, 0, 21234174b89SHuang Rui mmSDMA1_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL; 21334174b89SHuang Rui break; 21434174b89SHuang Rui } 21534174b89SHuang Rui 21634174b89SHuang Rui sdma_rlc_reg_offset = sdma_engine_reg_base 217b55a8b8bSYong Zhao + queue_id * (mmSDMA0_RLC1_RB_CNTL - mmSDMA0_RLC0_RB_CNTL); 218d5a114a6SFelix Kuehling 219b55a8b8bSYong Zhao pr_debug("RLC register offset for SDMA%d RLC%d: 0x%x\n", engine_id, 22034174b89SHuang Rui queue_id, sdma_rlc_reg_offset); 221d5a114a6SFelix Kuehling 22234174b89SHuang Rui return sdma_rlc_reg_offset; 223d5a114a6SFelix Kuehling } 224d5a114a6SFelix Kuehling 225d5a114a6SFelix Kuehling static inline struct v9_mqd *get_mqd(void *mqd) 226d5a114a6SFelix Kuehling { 227d5a114a6SFelix Kuehling return (struct v9_mqd *)mqd; 228d5a114a6SFelix Kuehling } 229d5a114a6SFelix Kuehling 230d5a114a6SFelix Kuehling static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) 231d5a114a6SFelix Kuehling { 232d5a114a6SFelix Kuehling return (struct v9_sdma_mqd *)mqd; 233d5a114a6SFelix Kuehling } 234d5a114a6SFelix Kuehling 2353e205a08SOak Zeng int kgd_gfx_v9_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, 236d5a114a6SFelix Kuehling uint32_t queue_id, uint32_t __user *wptr, 237d5a114a6SFelix Kuehling uint32_t wptr_shift, uint32_t wptr_mask, 238d5a114a6SFelix Kuehling struct mm_struct *mm) 239d5a114a6SFelix Kuehling { 240d5a114a6SFelix Kuehling struct amdgpu_device *adev = get_amdgpu_device(kgd); 241d5a114a6SFelix Kuehling struct v9_mqd *m; 242d5a114a6SFelix Kuehling uint32_t *mqd_hqd; 243d5a114a6SFelix Kuehling uint32_t reg, hqd_base, data; 244d5a114a6SFelix Kuehling 245d5a114a6SFelix Kuehling m = get_mqd(mqd); 246d5a114a6SFelix Kuehling 247d5a114a6SFelix Kuehling acquire_queue(kgd, pipe_id, queue_id); 248d5a114a6SFelix Kuehling 249d5a114a6SFelix Kuehling /* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */ 250d5a114a6SFelix Kuehling mqd_hqd = &m->cp_mqd_base_addr_lo; 251d5a114a6SFelix Kuehling hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); 252d5a114a6SFelix Kuehling 253d5a114a6SFelix Kuehling for (reg = hqd_base; 254d5a114a6SFelix Kuehling reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) 2551bff7f6cSTrigger Huang WREG32_RLC(reg, mqd_hqd[reg - hqd_base]); 256d5a114a6SFelix Kuehling 257d5a114a6SFelix Kuehling 258d5a114a6SFelix Kuehling /* Activate doorbell logic before triggering WPTR poll. */ 259d5a114a6SFelix Kuehling data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control, 260d5a114a6SFelix Kuehling CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); 2611bff7f6cSTrigger Huang WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data); 262d5a114a6SFelix Kuehling 263d5a114a6SFelix Kuehling if (wptr) { 264d5a114a6SFelix Kuehling /* Don't read wptr with get_user because the user 265d5a114a6SFelix Kuehling * context may not be accessible (if this function 266d5a114a6SFelix Kuehling * runs in a work queue). Instead trigger a one-shot 267d5a114a6SFelix Kuehling * polling read from memory in the CP. This assumes 268d5a114a6SFelix Kuehling * that wptr is GPU-accessible in the queue's VMID via 269d5a114a6SFelix Kuehling * ATC or SVM. WPTR==RPTR before starting the poll so 270d5a114a6SFelix Kuehling * the CP starts fetching new commands from the right 271d5a114a6SFelix Kuehling * place. 272d5a114a6SFelix Kuehling * 273d5a114a6SFelix Kuehling * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit 274d5a114a6SFelix Kuehling * tricky. Assume that the queue didn't overflow. The 275d5a114a6SFelix Kuehling * number of valid bits in the 32-bit RPTR depends on 276d5a114a6SFelix Kuehling * the queue size. The remaining bits are taken from 277d5a114a6SFelix Kuehling * the saved 64-bit WPTR. If the WPTR wrapped, add the 278d5a114a6SFelix Kuehling * queue size. 279d5a114a6SFelix Kuehling */ 280d5a114a6SFelix Kuehling uint32_t queue_size = 281d5a114a6SFelix Kuehling 2 << REG_GET_FIELD(m->cp_hqd_pq_control, 282d5a114a6SFelix Kuehling CP_HQD_PQ_CONTROL, QUEUE_SIZE); 283d5a114a6SFelix Kuehling uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1); 284d5a114a6SFelix Kuehling 285d5a114a6SFelix Kuehling if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr) 286d5a114a6SFelix Kuehling guessed_wptr += queue_size; 287d5a114a6SFelix Kuehling guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1); 288d5a114a6SFelix Kuehling guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32; 289d5a114a6SFelix Kuehling 2901bff7f6cSTrigger Huang WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO), 291d5a114a6SFelix Kuehling lower_32_bits(guessed_wptr)); 2921bff7f6cSTrigger Huang WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI), 293d5a114a6SFelix Kuehling upper_32_bits(guessed_wptr)); 2941bff7f6cSTrigger Huang WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR), 295ebe1d22bSArnd Bergmann lower_32_bits((uintptr_t)wptr)); 2961bff7f6cSTrigger Huang WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI), 297ebe1d22bSArnd Bergmann upper_32_bits((uintptr_t)wptr)); 298d5a114a6SFelix Kuehling WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1), 29935cd89d5SAaron Liu (uint32_t)get_queue_mask(adev, pipe_id, queue_id)); 300d5a114a6SFelix Kuehling } 301d5a114a6SFelix Kuehling 302d5a114a6SFelix Kuehling /* Start the EOP fetcher */ 3031bff7f6cSTrigger Huang WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR), 304d5a114a6SFelix Kuehling REG_SET_FIELD(m->cp_hqd_eop_rptr, 305d5a114a6SFelix Kuehling CP_HQD_EOP_RPTR, INIT_FETCHER, 1)); 306d5a114a6SFelix Kuehling 307d5a114a6SFelix Kuehling data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); 3081bff7f6cSTrigger Huang WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data); 309d5a114a6SFelix Kuehling 310d5a114a6SFelix Kuehling release_queue(kgd); 311d5a114a6SFelix Kuehling 312d5a114a6SFelix Kuehling return 0; 313d5a114a6SFelix Kuehling } 314d5a114a6SFelix Kuehling 31535cd89d5SAaron Liu int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd, 31635cd89d5SAaron Liu uint32_t pipe_id, uint32_t queue_id, 31735cd89d5SAaron Liu uint32_t doorbell_off) 31835cd89d5SAaron Liu { 31935cd89d5SAaron Liu struct amdgpu_device *adev = get_amdgpu_device(kgd); 32035cd89d5SAaron Liu struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring; 32135cd89d5SAaron Liu struct v9_mqd *m; 32235cd89d5SAaron Liu uint32_t mec, pipe; 32335cd89d5SAaron Liu int r; 32435cd89d5SAaron Liu 32535cd89d5SAaron Liu m = get_mqd(mqd); 32635cd89d5SAaron Liu 32735cd89d5SAaron Liu acquire_queue(kgd, pipe_id, queue_id); 32835cd89d5SAaron Liu 32935cd89d5SAaron Liu mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; 33035cd89d5SAaron Liu pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); 33135cd89d5SAaron Liu 33235cd89d5SAaron Liu pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n", 33335cd89d5SAaron Liu mec, pipe, queue_id); 33435cd89d5SAaron Liu 33535cd89d5SAaron Liu spin_lock(&adev->gfx.kiq.ring_lock); 33635cd89d5SAaron Liu r = amdgpu_ring_alloc(kiq_ring, 7); 33735cd89d5SAaron Liu if (r) { 33835cd89d5SAaron Liu pr_err("Failed to alloc KIQ (%d).\n", r); 33935cd89d5SAaron Liu goto out_unlock; 34035cd89d5SAaron Liu } 34135cd89d5SAaron Liu 34235cd89d5SAaron Liu amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5)); 34335cd89d5SAaron Liu amdgpu_ring_write(kiq_ring, 34435cd89d5SAaron Liu PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ 34535cd89d5SAaron Liu PACKET3_MAP_QUEUES_VMID(m->cp_hqd_vmid) | /* VMID */ 34635cd89d5SAaron Liu PACKET3_MAP_QUEUES_QUEUE(queue_id) | 34735cd89d5SAaron Liu PACKET3_MAP_QUEUES_PIPE(pipe) | 34835cd89d5SAaron Liu PACKET3_MAP_QUEUES_ME((mec - 1)) | 34935cd89d5SAaron Liu PACKET3_MAP_QUEUES_QUEUE_TYPE(0) | /*queue_type: normal compute queue */ 35035cd89d5SAaron Liu PACKET3_MAP_QUEUES_ALLOC_FORMAT(0) | /* alloc format: all_on_one_pipe */ 35135cd89d5SAaron Liu PACKET3_MAP_QUEUES_ENGINE_SEL(1) | /* engine_sel: hiq */ 35235cd89d5SAaron Liu PACKET3_MAP_QUEUES_NUM_QUEUES(1)); /* num_queues: must be 1 */ 35335cd89d5SAaron Liu amdgpu_ring_write(kiq_ring, 35435cd89d5SAaron Liu PACKET3_MAP_QUEUES_DOORBELL_OFFSET(doorbell_off)); 35535cd89d5SAaron Liu amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_lo); 35635cd89d5SAaron Liu amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_hi); 35735cd89d5SAaron Liu amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_lo); 35835cd89d5SAaron Liu amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi); 35935cd89d5SAaron Liu amdgpu_ring_commit(kiq_ring); 36035cd89d5SAaron Liu 36135cd89d5SAaron Liu out_unlock: 36235cd89d5SAaron Liu spin_unlock(&adev->gfx.kiq.ring_lock); 36335cd89d5SAaron Liu release_queue(kgd); 36435cd89d5SAaron Liu 36535cd89d5SAaron Liu return r; 36635cd89d5SAaron Liu } 36735cd89d5SAaron Liu 3683e205a08SOak Zeng int kgd_gfx_v9_hqd_dump(struct kgd_dev *kgd, 369d5a114a6SFelix Kuehling uint32_t pipe_id, uint32_t queue_id, 370d5a114a6SFelix Kuehling uint32_t (**dump)[2], uint32_t *n_regs) 371d5a114a6SFelix Kuehling { 372d5a114a6SFelix Kuehling struct amdgpu_device *adev = get_amdgpu_device(kgd); 373d5a114a6SFelix Kuehling uint32_t i = 0, reg; 374d5a114a6SFelix Kuehling #define HQD_N_REGS 56 375d5a114a6SFelix Kuehling #define DUMP_REG(addr) do { \ 376d5a114a6SFelix Kuehling if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ 377d5a114a6SFelix Kuehling break; \ 378d5a114a6SFelix Kuehling (*dump)[i][0] = (addr) << 2; \ 379d5a114a6SFelix Kuehling (*dump)[i++][1] = RREG32(addr); \ 380d5a114a6SFelix Kuehling } while (0) 381d5a114a6SFelix Kuehling 3826da2ec56SKees Cook *dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL); 383d5a114a6SFelix Kuehling if (*dump == NULL) 384d5a114a6SFelix Kuehling return -ENOMEM; 385d5a114a6SFelix Kuehling 386d5a114a6SFelix Kuehling acquire_queue(kgd, pipe_id, queue_id); 387d5a114a6SFelix Kuehling 388d5a114a6SFelix Kuehling for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); 389d5a114a6SFelix Kuehling reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) 390d5a114a6SFelix Kuehling DUMP_REG(reg); 391d5a114a6SFelix Kuehling 392d5a114a6SFelix Kuehling release_queue(kgd); 393d5a114a6SFelix Kuehling 394d5a114a6SFelix Kuehling WARN_ON_ONCE(i != HQD_N_REGS); 395d5a114a6SFelix Kuehling *n_regs = i; 396d5a114a6SFelix Kuehling 397d5a114a6SFelix Kuehling return 0; 398d5a114a6SFelix Kuehling } 399d5a114a6SFelix Kuehling 400d5a114a6SFelix Kuehling static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, 401d5a114a6SFelix Kuehling uint32_t __user *wptr, struct mm_struct *mm) 402d5a114a6SFelix Kuehling { 403d5a114a6SFelix Kuehling struct amdgpu_device *adev = get_amdgpu_device(kgd); 404d5a114a6SFelix Kuehling struct v9_sdma_mqd *m; 405b55a8b8bSYong Zhao uint32_t sdma_rlc_reg_offset; 406d5a114a6SFelix Kuehling unsigned long end_jiffies; 407d5a114a6SFelix Kuehling uint32_t data; 408d5a114a6SFelix Kuehling uint64_t data64; 409d5a114a6SFelix Kuehling uint64_t __user *wptr64 = (uint64_t __user *)wptr; 410d5a114a6SFelix Kuehling 411d5a114a6SFelix Kuehling m = get_sdma_mqd(mqd); 412b55a8b8bSYong Zhao sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id, 413d5a114a6SFelix Kuehling m->sdma_queue_id); 414d5a114a6SFelix Kuehling 415b55a8b8bSYong Zhao WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, 416d5a114a6SFelix Kuehling m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); 417d5a114a6SFelix Kuehling 418d5a114a6SFelix Kuehling end_jiffies = msecs_to_jiffies(2000) + jiffies; 419d5a114a6SFelix Kuehling while (true) { 420b55a8b8bSYong Zhao data = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS); 421d5a114a6SFelix Kuehling if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) 422d5a114a6SFelix Kuehling break; 423812330ebSYong Zhao if (time_after(jiffies, end_jiffies)) { 424812330ebSYong Zhao pr_err("SDMA RLC not idle in %s\n", __func__); 425d5a114a6SFelix Kuehling return -ETIME; 426812330ebSYong Zhao } 427d5a114a6SFelix Kuehling usleep_range(500, 1000); 428d5a114a6SFelix Kuehling } 429d5a114a6SFelix Kuehling 430b55a8b8bSYong Zhao WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL_OFFSET, 431d5a114a6SFelix Kuehling m->sdmax_rlcx_doorbell_offset); 432d5a114a6SFelix Kuehling 433d5a114a6SFelix Kuehling data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, 434d5a114a6SFelix Kuehling ENABLE, 1); 435b55a8b8bSYong Zhao WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, data); 436b55a8b8bSYong Zhao WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR, 437b55a8b8bSYong Zhao m->sdmax_rlcx_rb_rptr); 438b55a8b8bSYong Zhao WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI, 439d5a114a6SFelix Kuehling m->sdmax_rlcx_rb_rptr_hi); 440d5a114a6SFelix Kuehling 441b55a8b8bSYong Zhao WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1); 442d5a114a6SFelix Kuehling if (read_user_wptr(mm, wptr64, data64)) { 443b55a8b8bSYong Zhao WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR, 444d5a114a6SFelix Kuehling lower_32_bits(data64)); 445b55a8b8bSYong Zhao WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI, 446d5a114a6SFelix Kuehling upper_32_bits(data64)); 447d5a114a6SFelix Kuehling } else { 448b55a8b8bSYong Zhao WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR, 449d5a114a6SFelix Kuehling m->sdmax_rlcx_rb_rptr); 450b55a8b8bSYong Zhao WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI, 451d5a114a6SFelix Kuehling m->sdmax_rlcx_rb_rptr_hi); 452d5a114a6SFelix Kuehling } 453b55a8b8bSYong Zhao WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0); 454d5a114a6SFelix Kuehling 455b55a8b8bSYong Zhao WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); 456b55a8b8bSYong Zhao WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE_HI, 457d5a114a6SFelix Kuehling m->sdmax_rlcx_rb_base_hi); 458b55a8b8bSYong Zhao WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, 459d5a114a6SFelix Kuehling m->sdmax_rlcx_rb_rptr_addr_lo); 460b55a8b8bSYong Zhao WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, 461d5a114a6SFelix Kuehling m->sdmax_rlcx_rb_rptr_addr_hi); 462d5a114a6SFelix Kuehling 463d5a114a6SFelix Kuehling data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, 464d5a114a6SFelix Kuehling RB_ENABLE, 1); 465b55a8b8bSYong Zhao WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, data); 466d5a114a6SFelix Kuehling 467d5a114a6SFelix Kuehling return 0; 468d5a114a6SFelix Kuehling } 469d5a114a6SFelix Kuehling 470d5a114a6SFelix Kuehling static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, 471d5a114a6SFelix Kuehling uint32_t engine_id, uint32_t queue_id, 472d5a114a6SFelix Kuehling uint32_t (**dump)[2], uint32_t *n_regs) 473d5a114a6SFelix Kuehling { 474d5a114a6SFelix Kuehling struct amdgpu_device *adev = get_amdgpu_device(kgd); 475b55a8b8bSYong Zhao uint32_t sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, 476b55a8b8bSYong Zhao engine_id, queue_id); 477d5a114a6SFelix Kuehling uint32_t i = 0, reg; 478d5a114a6SFelix Kuehling #undef HQD_N_REGS 479d5a114a6SFelix Kuehling #define HQD_N_REGS (19+6+7+10) 480d5a114a6SFelix Kuehling 4816da2ec56SKees Cook *dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL); 482d5a114a6SFelix Kuehling if (*dump == NULL) 483d5a114a6SFelix Kuehling return -ENOMEM; 484d5a114a6SFelix Kuehling 485d5a114a6SFelix Kuehling for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) 486b55a8b8bSYong Zhao DUMP_REG(sdma_rlc_reg_offset + reg); 487d5a114a6SFelix Kuehling for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++) 488b55a8b8bSYong Zhao DUMP_REG(sdma_rlc_reg_offset + reg); 489d5a114a6SFelix Kuehling for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; 490d5a114a6SFelix Kuehling reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++) 491b55a8b8bSYong Zhao DUMP_REG(sdma_rlc_reg_offset + reg); 492d5a114a6SFelix Kuehling for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; 493d5a114a6SFelix Kuehling reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++) 494b55a8b8bSYong Zhao DUMP_REG(sdma_rlc_reg_offset + reg); 495d5a114a6SFelix Kuehling 496d5a114a6SFelix Kuehling WARN_ON_ONCE(i != HQD_N_REGS); 497d5a114a6SFelix Kuehling *n_regs = i; 498d5a114a6SFelix Kuehling 499d5a114a6SFelix Kuehling return 0; 500d5a114a6SFelix Kuehling } 501d5a114a6SFelix Kuehling 5023e205a08SOak Zeng bool kgd_gfx_v9_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, 503d5a114a6SFelix Kuehling uint32_t pipe_id, uint32_t queue_id) 504d5a114a6SFelix Kuehling { 505d5a114a6SFelix Kuehling struct amdgpu_device *adev = get_amdgpu_device(kgd); 506d5a114a6SFelix Kuehling uint32_t act; 507d5a114a6SFelix Kuehling bool retval = false; 508d5a114a6SFelix Kuehling uint32_t low, high; 509d5a114a6SFelix Kuehling 510d5a114a6SFelix Kuehling acquire_queue(kgd, pipe_id, queue_id); 511d5a114a6SFelix Kuehling act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); 512d5a114a6SFelix Kuehling if (act) { 513d5a114a6SFelix Kuehling low = lower_32_bits(queue_address >> 8); 514d5a114a6SFelix Kuehling high = upper_32_bits(queue_address >> 8); 515d5a114a6SFelix Kuehling 516d5a114a6SFelix Kuehling if (low == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE)) && 517d5a114a6SFelix Kuehling high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI))) 518d5a114a6SFelix Kuehling retval = true; 519d5a114a6SFelix Kuehling } 520d5a114a6SFelix Kuehling release_queue(kgd); 521d5a114a6SFelix Kuehling return retval; 522d5a114a6SFelix Kuehling } 523d5a114a6SFelix Kuehling 524d5a114a6SFelix Kuehling static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) 525d5a114a6SFelix Kuehling { 526d5a114a6SFelix Kuehling struct amdgpu_device *adev = get_amdgpu_device(kgd); 527d5a114a6SFelix Kuehling struct v9_sdma_mqd *m; 528b55a8b8bSYong Zhao uint32_t sdma_rlc_reg_offset; 529d5a114a6SFelix Kuehling uint32_t sdma_rlc_rb_cntl; 530d5a114a6SFelix Kuehling 531d5a114a6SFelix Kuehling m = get_sdma_mqd(mqd); 532b55a8b8bSYong Zhao sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id, 533d5a114a6SFelix Kuehling m->sdma_queue_id); 534d5a114a6SFelix Kuehling 535b55a8b8bSYong Zhao sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL); 536d5a114a6SFelix Kuehling 537d5a114a6SFelix Kuehling if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK) 538d5a114a6SFelix Kuehling return true; 539d5a114a6SFelix Kuehling 540d5a114a6SFelix Kuehling return false; 541d5a114a6SFelix Kuehling } 542d5a114a6SFelix Kuehling 5433e205a08SOak Zeng int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd, 544d5a114a6SFelix Kuehling enum kfd_preempt_type reset_type, 545d5a114a6SFelix Kuehling unsigned int utimeout, uint32_t pipe_id, 546d5a114a6SFelix Kuehling uint32_t queue_id) 547d5a114a6SFelix Kuehling { 548d5a114a6SFelix Kuehling struct amdgpu_device *adev = get_amdgpu_device(kgd); 549d5a114a6SFelix Kuehling enum hqd_dequeue_request_type type; 550d5a114a6SFelix Kuehling unsigned long end_jiffies; 551d5a114a6SFelix Kuehling uint32_t temp; 552d5a114a6SFelix Kuehling struct v9_mqd *m = get_mqd(mqd); 553d5a114a6SFelix Kuehling 55453b3f8f4SDennis Li if (amdgpu_in_reset(adev)) 5551b0bfcffSShaoyun Liu return -EIO; 5561b0bfcffSShaoyun Liu 557d5a114a6SFelix Kuehling acquire_queue(kgd, pipe_id, queue_id); 558d5a114a6SFelix Kuehling 559d5a114a6SFelix Kuehling if (m->cp_hqd_vmid == 0) 5601bff7f6cSTrigger Huang WREG32_FIELD15_RLC(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0); 561d5a114a6SFelix Kuehling 562d5a114a6SFelix Kuehling switch (reset_type) { 563d5a114a6SFelix Kuehling case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: 564d5a114a6SFelix Kuehling type = DRAIN_PIPE; 565d5a114a6SFelix Kuehling break; 566d5a114a6SFelix Kuehling case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: 567d5a114a6SFelix Kuehling type = RESET_WAVES; 568d5a114a6SFelix Kuehling break; 569d5a114a6SFelix Kuehling default: 570d5a114a6SFelix Kuehling type = DRAIN_PIPE; 571d5a114a6SFelix Kuehling break; 572d5a114a6SFelix Kuehling } 573d5a114a6SFelix Kuehling 5741bff7f6cSTrigger Huang WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type); 575d5a114a6SFelix Kuehling 576d5a114a6SFelix Kuehling end_jiffies = (utimeout * HZ / 1000) + jiffies; 577d5a114a6SFelix Kuehling while (true) { 578d5a114a6SFelix Kuehling temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); 579d5a114a6SFelix Kuehling if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) 580d5a114a6SFelix Kuehling break; 581d5a114a6SFelix Kuehling if (time_after(jiffies, end_jiffies)) { 582d5a114a6SFelix Kuehling pr_err("cp queue preemption time out.\n"); 583d5a114a6SFelix Kuehling release_queue(kgd); 584d5a114a6SFelix Kuehling return -ETIME; 585d5a114a6SFelix Kuehling } 586d5a114a6SFelix Kuehling usleep_range(500, 1000); 587d5a114a6SFelix Kuehling } 588d5a114a6SFelix Kuehling 589d5a114a6SFelix Kuehling release_queue(kgd); 590d5a114a6SFelix Kuehling return 0; 591d5a114a6SFelix Kuehling } 592d5a114a6SFelix Kuehling 593d5a114a6SFelix Kuehling static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, 594d5a114a6SFelix Kuehling unsigned int utimeout) 595d5a114a6SFelix Kuehling { 596d5a114a6SFelix Kuehling struct amdgpu_device *adev = get_amdgpu_device(kgd); 597d5a114a6SFelix Kuehling struct v9_sdma_mqd *m; 598b55a8b8bSYong Zhao uint32_t sdma_rlc_reg_offset; 599d5a114a6SFelix Kuehling uint32_t temp; 600d5a114a6SFelix Kuehling unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; 601d5a114a6SFelix Kuehling 602d5a114a6SFelix Kuehling m = get_sdma_mqd(mqd); 603b55a8b8bSYong Zhao sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id, 604d5a114a6SFelix Kuehling m->sdma_queue_id); 605d5a114a6SFelix Kuehling 606b55a8b8bSYong Zhao temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL); 607d5a114a6SFelix Kuehling temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK; 608b55a8b8bSYong Zhao WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, temp); 609d5a114a6SFelix Kuehling 610d5a114a6SFelix Kuehling while (true) { 611b55a8b8bSYong Zhao temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS); 612d5a114a6SFelix Kuehling if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) 613d5a114a6SFelix Kuehling break; 614812330ebSYong Zhao if (time_after(jiffies, end_jiffies)) { 615812330ebSYong Zhao pr_err("SDMA RLC not idle in %s\n", __func__); 616d5a114a6SFelix Kuehling return -ETIME; 617812330ebSYong Zhao } 618d5a114a6SFelix Kuehling usleep_range(500, 1000); 619d5a114a6SFelix Kuehling } 620d5a114a6SFelix Kuehling 621b55a8b8bSYong Zhao WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, 0); 622b55a8b8bSYong Zhao WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, 623b55a8b8bSYong Zhao RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL) | 624d5a114a6SFelix Kuehling SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); 625d5a114a6SFelix Kuehling 626b55a8b8bSYong Zhao m->sdmax_rlcx_rb_rptr = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR); 627d5a114a6SFelix Kuehling m->sdmax_rlcx_rb_rptr_hi = 628b55a8b8bSYong Zhao RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI); 629d5a114a6SFelix Kuehling 630d5a114a6SFelix Kuehling return 0; 631d5a114a6SFelix Kuehling } 632d5a114a6SFelix Kuehling 63356fc40abSYong Zhao bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct kgd_dev *kgd, 63456fc40abSYong Zhao uint8_t vmid, uint16_t *p_pasid) 635d5a114a6SFelix Kuehling { 63656fc40abSYong Zhao uint32_t value; 637d5a114a6SFelix Kuehling struct amdgpu_device *adev = (struct amdgpu_device *) kgd; 638d5a114a6SFelix Kuehling 63956fc40abSYong Zhao value = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) 640d5a114a6SFelix Kuehling + vmid); 64156fc40abSYong Zhao *p_pasid = value & ATC_VMID0_PASID_MAPPING__PASID_MASK; 642d5a114a6SFelix Kuehling 64356fc40abSYong Zhao return !!(value & ATC_VMID0_PASID_MAPPING__VALID_MASK); 644d5a114a6SFelix Kuehling } 645d5a114a6SFelix Kuehling 6463e205a08SOak Zeng int kgd_gfx_v9_address_watch_disable(struct kgd_dev *kgd) 647d5a114a6SFelix Kuehling { 648d5a114a6SFelix Kuehling return 0; 649d5a114a6SFelix Kuehling } 650d5a114a6SFelix Kuehling 6513e205a08SOak Zeng int kgd_gfx_v9_address_watch_execute(struct kgd_dev *kgd, 652d5a114a6SFelix Kuehling unsigned int watch_point_id, 653d5a114a6SFelix Kuehling uint32_t cntl_val, 654d5a114a6SFelix Kuehling uint32_t addr_hi, 655d5a114a6SFelix Kuehling uint32_t addr_lo) 656d5a114a6SFelix Kuehling { 657d5a114a6SFelix Kuehling return 0; 658d5a114a6SFelix Kuehling } 659d5a114a6SFelix Kuehling 6603e205a08SOak Zeng int kgd_gfx_v9_wave_control_execute(struct kgd_dev *kgd, 661d5a114a6SFelix Kuehling uint32_t gfx_index_val, 662d5a114a6SFelix Kuehling uint32_t sq_cmd) 663d5a114a6SFelix Kuehling { 664d5a114a6SFelix Kuehling struct amdgpu_device *adev = get_amdgpu_device(kgd); 665d5a114a6SFelix Kuehling uint32_t data = 0; 666d5a114a6SFelix Kuehling 667d5a114a6SFelix Kuehling mutex_lock(&adev->grbm_idx_mutex); 668d5a114a6SFelix Kuehling 6691bff7f6cSTrigger Huang WREG32_SOC15_RLC_SHADOW(GC, 0, mmGRBM_GFX_INDEX, gfx_index_val); 670d5a114a6SFelix Kuehling WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd); 671d5a114a6SFelix Kuehling 672d5a114a6SFelix Kuehling data = REG_SET_FIELD(data, GRBM_GFX_INDEX, 673d5a114a6SFelix Kuehling INSTANCE_BROADCAST_WRITES, 1); 674d5a114a6SFelix Kuehling data = REG_SET_FIELD(data, GRBM_GFX_INDEX, 675d5a114a6SFelix Kuehling SH_BROADCAST_WRITES, 1); 676d5a114a6SFelix Kuehling data = REG_SET_FIELD(data, GRBM_GFX_INDEX, 677d5a114a6SFelix Kuehling SE_BROADCAST_WRITES, 1); 678d5a114a6SFelix Kuehling 6791bff7f6cSTrigger Huang WREG32_SOC15_RLC_SHADOW(GC, 0, mmGRBM_GFX_INDEX, data); 680d5a114a6SFelix Kuehling mutex_unlock(&adev->grbm_idx_mutex); 681d5a114a6SFelix Kuehling 682d5a114a6SFelix Kuehling return 0; 683d5a114a6SFelix Kuehling } 684d5a114a6SFelix Kuehling 6853e205a08SOak Zeng uint32_t kgd_gfx_v9_address_watch_get_offset(struct kgd_dev *kgd, 686d5a114a6SFelix Kuehling unsigned int watch_point_id, 687d5a114a6SFelix Kuehling unsigned int reg_offset) 688d5a114a6SFelix Kuehling { 689d5a114a6SFelix Kuehling return 0; 690d5a114a6SFelix Kuehling } 691d5a114a6SFelix Kuehling 6929fb1506eSOak Zeng void kgd_gfx_v9_set_vm_context_page_table_base(struct kgd_dev *kgd, 693ad5901dfSYong Zhao uint32_t vmid, uint64_t page_table_base) 694d5a114a6SFelix Kuehling { 695d5a114a6SFelix Kuehling struct amdgpu_device *adev = get_amdgpu_device(kgd); 696d5a114a6SFelix Kuehling 697d5a114a6SFelix Kuehling if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) { 698d5a114a6SFelix Kuehling pr_err("trying to set page table base for wrong VMID %u\n", 699d5a114a6SFelix Kuehling vmid); 700d5a114a6SFelix Kuehling return; 701d5a114a6SFelix Kuehling } 702d5a114a6SFelix Kuehling 7039fb1506eSOak Zeng adev->mmhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base); 704d5a114a6SFelix Kuehling 7058ffff9b4SOak Zeng adev->gfxhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base); 706d5a114a6SFelix Kuehling } 7073e205a08SOak Zeng 70843a4bc82SRamesh Errabolu static void lock_spi_csq_mutexes(struct amdgpu_device *adev) 70943a4bc82SRamesh Errabolu { 71043a4bc82SRamesh Errabolu mutex_lock(&adev->srbm_mutex); 71143a4bc82SRamesh Errabolu mutex_lock(&adev->grbm_idx_mutex); 71243a4bc82SRamesh Errabolu 71343a4bc82SRamesh Errabolu } 71443a4bc82SRamesh Errabolu 71543a4bc82SRamesh Errabolu static void unlock_spi_csq_mutexes(struct amdgpu_device *adev) 71643a4bc82SRamesh Errabolu { 71743a4bc82SRamesh Errabolu mutex_unlock(&adev->grbm_idx_mutex); 71843a4bc82SRamesh Errabolu mutex_unlock(&adev->srbm_mutex); 71943a4bc82SRamesh Errabolu } 72043a4bc82SRamesh Errabolu 72143a4bc82SRamesh Errabolu /** 722*1fdbbc12SFabio M. De Francesco * get_wave_count: Read device registers to get number of waves in flight for 72343a4bc82SRamesh Errabolu * a particular queue. The method also returns the VMID associated with the 72443a4bc82SRamesh Errabolu * queue. 72543a4bc82SRamesh Errabolu * 72643a4bc82SRamesh Errabolu * @adev: Handle of device whose registers are to be read 72743a4bc82SRamesh Errabolu * @queue_idx: Index of queue in the queue-map bit-field 72843a4bc82SRamesh Errabolu * @wave_cnt: Output parameter updated with number of waves in flight 72943a4bc82SRamesh Errabolu * @vmid: Output parameter updated with VMID of queue whose wave count 73043a4bc82SRamesh Errabolu * is being collected 73143a4bc82SRamesh Errabolu */ 73243a4bc82SRamesh Errabolu static void get_wave_count(struct amdgpu_device *adev, int queue_idx, 73343a4bc82SRamesh Errabolu int *wave_cnt, int *vmid) 73443a4bc82SRamesh Errabolu { 73543a4bc82SRamesh Errabolu int pipe_idx; 73643a4bc82SRamesh Errabolu int queue_slot; 73743a4bc82SRamesh Errabolu unsigned int reg_val; 73843a4bc82SRamesh Errabolu 73943a4bc82SRamesh Errabolu /* 74043a4bc82SRamesh Errabolu * Program GRBM with appropriate MEID, PIPEID, QUEUEID and VMID 74143a4bc82SRamesh Errabolu * parameters to read out waves in flight. Get VMID if there are 74243a4bc82SRamesh Errabolu * non-zero waves in flight. 74343a4bc82SRamesh Errabolu */ 74443a4bc82SRamesh Errabolu *vmid = 0xFF; 74543a4bc82SRamesh Errabolu *wave_cnt = 0; 74643a4bc82SRamesh Errabolu pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe; 74743a4bc82SRamesh Errabolu queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe; 74843a4bc82SRamesh Errabolu soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0); 74943a4bc82SRamesh Errabolu reg_val = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_CSQ_WF_ACTIVE_COUNT_0) + 75043a4bc82SRamesh Errabolu queue_slot); 75143a4bc82SRamesh Errabolu *wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK; 75243a4bc82SRamesh Errabolu if (*wave_cnt != 0) 75343a4bc82SRamesh Errabolu *vmid = (RREG32_SOC15(GC, 0, mmCP_HQD_VMID) & 75443a4bc82SRamesh Errabolu CP_HQD_VMID__VMID_MASK) >> CP_HQD_VMID__VMID__SHIFT; 75543a4bc82SRamesh Errabolu } 75643a4bc82SRamesh Errabolu 75743a4bc82SRamesh Errabolu /** 758*1fdbbc12SFabio M. De Francesco * kgd_gfx_v9_get_cu_occupancy: Reads relevant registers associated with each 75943a4bc82SRamesh Errabolu * shader engine and aggregates the number of waves that are in flight for the 76043a4bc82SRamesh Errabolu * process whose pasid is provided as a parameter. The process could have ZERO 76143a4bc82SRamesh Errabolu * or more queues running and submitting waves to compute units. 76243a4bc82SRamesh Errabolu * 76343a4bc82SRamesh Errabolu * @kgd: Handle of device from which to get number of waves in flight 76443a4bc82SRamesh Errabolu * @pasid: Identifies the process for which this query call is invoked 765*1fdbbc12SFabio M. De Francesco * @pasid_wave_cnt: Output parameter updated with number of waves in flight that 76643a4bc82SRamesh Errabolu * belong to process with given pasid 76743a4bc82SRamesh Errabolu * @max_waves_per_cu: Output parameter updated with maximum number of waves 76843a4bc82SRamesh Errabolu * possible per Compute Unit 76943a4bc82SRamesh Errabolu * 770*1fdbbc12SFabio M. De Francesco * Note: It's possible that the device has too many queues (oversubscription) 77143a4bc82SRamesh Errabolu * in which case a VMID could be remapped to a different PASID. This could lead 77243a4bc82SRamesh Errabolu * to an iaccurate wave count. Following is a high-level sequence: 77343a4bc82SRamesh Errabolu * Time T1: vmid = getVmid(); vmid is associated with Pasid P1 77443a4bc82SRamesh Errabolu * Time T2: passId = getPasId(vmid); vmid is associated with Pasid P2 77543a4bc82SRamesh Errabolu * In the sequence above wave count obtained from time T1 will be incorrectly 77643a4bc82SRamesh Errabolu * lost or added to total wave count. 77743a4bc82SRamesh Errabolu * 77843a4bc82SRamesh Errabolu * The registers that provide the waves in flight are: 77943a4bc82SRamesh Errabolu * 78043a4bc82SRamesh Errabolu * SPI_CSQ_WF_ACTIVE_STATUS - bit-map of queues per pipe. The bit is ON if a 78143a4bc82SRamesh Errabolu * queue is slotted, OFF if there is no queue. A process could have ZERO or 78243a4bc82SRamesh Errabolu * more queues slotted and submitting waves to be run on compute units. Even 78343a4bc82SRamesh Errabolu * when there is a queue it is possible there could be zero wave fronts, this 78443a4bc82SRamesh Errabolu * can happen when queue is waiting on top-of-pipe events - e.g. waitRegMem 78543a4bc82SRamesh Errabolu * command 78643a4bc82SRamesh Errabolu * 78743a4bc82SRamesh Errabolu * For each bit that is ON from above: 78843a4bc82SRamesh Errabolu * 78943a4bc82SRamesh Errabolu * Read (SPI_CSQ_WF_ACTIVE_COUNT_0 + queue_idx) register. It provides the 79043a4bc82SRamesh Errabolu * number of waves that are in flight for the queue at specified index. The 79143a4bc82SRamesh Errabolu * index ranges from 0 to 7. 79243a4bc82SRamesh Errabolu * 79343a4bc82SRamesh Errabolu * If non-zero waves are in flight, read CP_HQD_VMID register to obtain VMID 79443a4bc82SRamesh Errabolu * of the wave(s). 79543a4bc82SRamesh Errabolu * 79643a4bc82SRamesh Errabolu * Determine if VMID from above step maps to pasid provided as parameter. If 79743a4bc82SRamesh Errabolu * it matches agrregate the wave count. That the VMID will not match pasid is 79843a4bc82SRamesh Errabolu * a normal condition i.e. a device is expected to support multiple queues 79943a4bc82SRamesh Errabolu * from multiple proceses. 80043a4bc82SRamesh Errabolu * 80143a4bc82SRamesh Errabolu * Reading registers referenced above involves programming GRBM appropriately 80243a4bc82SRamesh Errabolu */ 803aeee2a48SRamesh Errabolu void kgd_gfx_v9_get_cu_occupancy(struct kgd_dev *kgd, int pasid, 80443a4bc82SRamesh Errabolu int *pasid_wave_cnt, int *max_waves_per_cu) 80543a4bc82SRamesh Errabolu { 80643a4bc82SRamesh Errabolu int qidx; 80743a4bc82SRamesh Errabolu int vmid; 80843a4bc82SRamesh Errabolu int se_idx; 80943a4bc82SRamesh Errabolu int sh_idx; 81043a4bc82SRamesh Errabolu int se_cnt; 81143a4bc82SRamesh Errabolu int sh_cnt; 81243a4bc82SRamesh Errabolu int wave_cnt; 81343a4bc82SRamesh Errabolu int queue_map; 81443a4bc82SRamesh Errabolu int pasid_tmp; 81543a4bc82SRamesh Errabolu int max_queue_cnt; 81643a4bc82SRamesh Errabolu int vmid_wave_cnt = 0; 81743a4bc82SRamesh Errabolu struct amdgpu_device *adev; 81843a4bc82SRamesh Errabolu DECLARE_BITMAP(cp_queue_bitmap, KGD_MAX_QUEUES); 81943a4bc82SRamesh Errabolu 82043a4bc82SRamesh Errabolu adev = get_amdgpu_device(kgd); 82143a4bc82SRamesh Errabolu lock_spi_csq_mutexes(adev); 82243a4bc82SRamesh Errabolu soc15_grbm_select(adev, 1, 0, 0, 0); 82343a4bc82SRamesh Errabolu 82443a4bc82SRamesh Errabolu /* 82543a4bc82SRamesh Errabolu * Iterate through the shader engines and arrays of the device 82643a4bc82SRamesh Errabolu * to get number of waves in flight 82743a4bc82SRamesh Errabolu */ 82843a4bc82SRamesh Errabolu bitmap_complement(cp_queue_bitmap, adev->gfx.mec.queue_bitmap, 82943a4bc82SRamesh Errabolu KGD_MAX_QUEUES); 83043a4bc82SRamesh Errabolu max_queue_cnt = adev->gfx.mec.num_pipe_per_mec * 83143a4bc82SRamesh Errabolu adev->gfx.mec.num_queue_per_pipe; 83243a4bc82SRamesh Errabolu sh_cnt = adev->gfx.config.max_sh_per_se; 83343a4bc82SRamesh Errabolu se_cnt = adev->gfx.config.max_shader_engines; 83443a4bc82SRamesh Errabolu for (se_idx = 0; se_idx < se_cnt; se_idx++) { 83543a4bc82SRamesh Errabolu for (sh_idx = 0; sh_idx < sh_cnt; sh_idx++) { 83643a4bc82SRamesh Errabolu 83743a4bc82SRamesh Errabolu gfx_v9_0_select_se_sh(adev, se_idx, sh_idx, 0xffffffff); 83843a4bc82SRamesh Errabolu queue_map = RREG32(SOC15_REG_OFFSET(GC, 0, 83943a4bc82SRamesh Errabolu mmSPI_CSQ_WF_ACTIVE_STATUS)); 84043a4bc82SRamesh Errabolu 84143a4bc82SRamesh Errabolu /* 84243a4bc82SRamesh Errabolu * Assumption: queue map encodes following schema: four 84343a4bc82SRamesh Errabolu * pipes per each micro-engine, with each pipe mapping 84443a4bc82SRamesh Errabolu * eight queues. This schema is true for GFX9 devices 84543a4bc82SRamesh Errabolu * and must be verified for newer device families 84643a4bc82SRamesh Errabolu */ 84743a4bc82SRamesh Errabolu for (qidx = 0; qidx < max_queue_cnt; qidx++) { 84843a4bc82SRamesh Errabolu 84943a4bc82SRamesh Errabolu /* Skip qeueus that are not associated with 85043a4bc82SRamesh Errabolu * compute functions 85143a4bc82SRamesh Errabolu */ 85243a4bc82SRamesh Errabolu if (!test_bit(qidx, cp_queue_bitmap)) 85343a4bc82SRamesh Errabolu continue; 85443a4bc82SRamesh Errabolu 85543a4bc82SRamesh Errabolu if (!(queue_map & (1 << qidx))) 85643a4bc82SRamesh Errabolu continue; 85743a4bc82SRamesh Errabolu 85843a4bc82SRamesh Errabolu /* Get number of waves in flight and aggregate them */ 85943a4bc82SRamesh Errabolu get_wave_count(adev, qidx, &wave_cnt, &vmid); 86043a4bc82SRamesh Errabolu if (wave_cnt != 0) { 86143a4bc82SRamesh Errabolu pasid_tmp = 86243a4bc82SRamesh Errabolu RREG32(SOC15_REG_OFFSET(OSSSYS, 0, 86343a4bc82SRamesh Errabolu mmIH_VMID_0_LUT) + vmid); 86443a4bc82SRamesh Errabolu if (pasid_tmp == pasid) 86543a4bc82SRamesh Errabolu vmid_wave_cnt += wave_cnt; 86643a4bc82SRamesh Errabolu } 86743a4bc82SRamesh Errabolu } 86843a4bc82SRamesh Errabolu } 86943a4bc82SRamesh Errabolu } 87043a4bc82SRamesh Errabolu 87143a4bc82SRamesh Errabolu gfx_v9_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff); 87243a4bc82SRamesh Errabolu soc15_grbm_select(adev, 0, 0, 0, 0); 87343a4bc82SRamesh Errabolu unlock_spi_csq_mutexes(adev); 87443a4bc82SRamesh Errabolu 87543a4bc82SRamesh Errabolu /* Update the output parameters and return */ 87643a4bc82SRamesh Errabolu *pasid_wave_cnt = vmid_wave_cnt; 87743a4bc82SRamesh Errabolu *max_waves_per_cu = adev->gfx.cu_info.simd_per_cu * 87843a4bc82SRamesh Errabolu adev->gfx.cu_info.max_waves_per_simd; 879d5a114a6SFelix Kuehling } 8803e205a08SOak Zeng 881e392c887SYong Zhao const struct kfd2kgd_calls gfx_v9_kfd2kgd = { 8823e205a08SOak Zeng .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings, 8833e205a08SOak Zeng .set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping, 8843e205a08SOak Zeng .init_interrupts = kgd_gfx_v9_init_interrupts, 8853e205a08SOak Zeng .hqd_load = kgd_gfx_v9_hqd_load, 88635cd89d5SAaron Liu .hiq_mqd_load = kgd_gfx_v9_hiq_mqd_load, 8873e205a08SOak Zeng .hqd_sdma_load = kgd_hqd_sdma_load, 8883e205a08SOak Zeng .hqd_dump = kgd_gfx_v9_hqd_dump, 8893e205a08SOak Zeng .hqd_sdma_dump = kgd_hqd_sdma_dump, 8903e205a08SOak Zeng .hqd_is_occupied = kgd_gfx_v9_hqd_is_occupied, 8913e205a08SOak Zeng .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, 8923e205a08SOak Zeng .hqd_destroy = kgd_gfx_v9_hqd_destroy, 8933e205a08SOak Zeng .hqd_sdma_destroy = kgd_hqd_sdma_destroy, 8943e205a08SOak Zeng .address_watch_disable = kgd_gfx_v9_address_watch_disable, 8953e205a08SOak Zeng .address_watch_execute = kgd_gfx_v9_address_watch_execute, 8963e205a08SOak Zeng .wave_control_execute = kgd_gfx_v9_wave_control_execute, 8973e205a08SOak Zeng .address_watch_get_offset = kgd_gfx_v9_address_watch_get_offset, 89856fc40abSYong Zhao .get_atc_vmid_pasid_mapping_info = 89956fc40abSYong Zhao kgd_gfx_v9_get_atc_vmid_pasid_mapping_info, 9003e205a08SOak Zeng .set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base, 90143a4bc82SRamesh Errabolu .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy, 9023e205a08SOak Zeng }; 903