/*
 * Copyright 2019 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#undef pr_fmt
#define pr_fmt(fmt) "kfd2kgd: " fmt

#include <linux/mmu_context.h>
#include "amdgpu.h"
#include "amdgpu_amdkfd.h"
#include "gc/gc_10_1_0_offset.h"
#include "gc/gc_10_1_0_sh_mask.h"
#include "navi10_enum.h"
#include "athub/athub_2_0_0_offset.h"
#include "athub/athub_2_0_0_sh_mask.h"
#include "oss/osssys_5_0_0_offset.h"
#include "oss/osssys_5_0_0_sh_mask.h"
#include "soc15_common.h"
#include "v10_structs.h"
#include "nv.h"
#include "nvd.h"
#include "gfxhub_v2_0.h"

enum hqd_dequeue_request_type {
	NO_ACTION = 0,
	DRAIN_PIPE,
	RESET_WAVES,
	SAVE_WAVES
};

/* Because of REG_GET_FIELD() being used, we put this function in the
 * asic specific file.
 */
static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd,
		struct tile_config *config)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;

	config->gb_addr_config = adev->gfx.config.gb_addr_config;
#if 0
	/* TODO - confirm REG_GET_FIELD x2, should be OK as is... but
	 * MC_ARB_RAMCFG register doesn't exist on Vega10 - initial amdgpu
	 * changes commented out related code, doing the same here for now but
	 * need to sync with Ken et al
	 */
	config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
					MC_ARB_RAMCFG, NOOFBANK);
	config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
					MC_ARB_RAMCFG, NOOFRANKS);
#endif

	config->tile_config_ptr = adev->gfx.config.tile_mode_array;
	config->num_tile_configs =
			ARRAY_SIZE(adev->gfx.config.tile_mode_array);
	config->macro_tile_config_ptr =
			adev->gfx.config.macrotile_mode_array;
	config->num_macro_tile_configs =
			ARRAY_SIZE(adev->gfx.config.macrotile_mode_array);

	return 0;
}
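
/* For reference: REG_GET_FIELD()/REG_SET_FIELD() are the generic amdgpu
 * register-field helpers. Roughly (an illustrative sketch, not the literal
 * macro text), they expand to mask-and-shift operations built from the
 * per-ASIC <REG>__<FIELD>_MASK and <REG>__<FIELD>__SHIFT definitions:
 *
 *   REG_GET_FIELD(v, MC_ARB_RAMCFG, NOOFBANK)
 *     == (v & MC_ARB_RAMCFG__NOOFBANK_MASK) >> MC_ARB_RAMCFG__NOOFBANK__SHIFT
 *
 * Those symbols come from the ASIC-specific sh_mask headers included above,
 * which is why this helper lives in an ASIC-specific file.
 */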

static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
{
	return (struct amdgpu_device *)kgd;
}

static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe,
			uint32_t queue, uint32_t vmid)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	mutex_lock(&adev->srbm_mutex);
	nv_grbm_select(adev, mec, pipe, queue, vmid);
}

static void unlock_srbm(struct kgd_dev *kgd)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	nv_grbm_select(adev, 0, 0, 0, 0);
	mutex_unlock(&adev->srbm_mutex);
}

static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id,
				uint32_t queue_id)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	lock_srbm(kgd, mec, pipe, queue_id, 0);
}

static uint32_t get_queue_mask(struct amdgpu_device *adev,
			       uint32_t pipe_id, uint32_t queue_id)
{
	unsigned int bit = (pipe_id * adev->gfx.mec.num_queue_per_pipe +
			    queue_id) & 31;

	return ((uint32_t)1) << bit;
}

static void release_queue(struct kgd_dev *kgd)
{
	unlock_srbm(kgd);
}
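
/* Worked example for get_queue_mask() above (illustrative numbers only):
 * assuming adev->gfx.mec.num_queue_per_pipe == 8, pipe_id == 2 and
 * queue_id == 3 select bit (2 * 8 + 3) & 31 == 19, i.e. a mask of
 * 0x00080000. This is the per-queue bit that kgd_hqd_load() later writes
 * to CP_PQ_WPTR_POLL_CNTL1.
 */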

static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
					uint32_t sh_mem_config,
					uint32_t sh_mem_ape1_base,
					uint32_t sh_mem_ape1_limit,
					uint32_t sh_mem_bases)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	lock_srbm(kgd, 0, 0, 0, vmid);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases);
	/* APE1 no longer exists on GFX9+ */

	unlock_srbm(kgd);
}

static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
					unsigned int vmid)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	/*
	 * We have to assume that there is no outstanding mapping.
	 * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because
	 * a mapping is in progress or because a mapping finished
	 * and the SW cleared it.
	 * So the protocol is to always wait & clear.
	 */
	uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid |
			ATC_VMID0_PASID_MAPPING__VALID_MASK;

	pr_debug("pasid 0x%x vmid %d, reg value %x\n", pasid, vmid,
		 pasid_mapping);

	pr_debug("ATHUB, reg %x\n",
		 SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid);
	WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid,
	       pasid_mapping);

#if 0
	/* TODO: uncomment this code when the hardware support is ready. */
	while (!(RREG32(SOC15_REG_OFFSET(
			ATHUB, 0,
			mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
		 (1U << vmid)))
		cpu_relax();

	pr_debug("ATHUB mapping update finished\n");
	WREG32(SOC15_REG_OFFSET(ATHUB, 0,
				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS),
	       1U << vmid);
#endif

	/* Mapping vmid to pasid also for IH block */
	pr_debug("update mapping for IH block and mmhub\n");
	WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid,
	       pasid_mapping);

	return 0;
}

/* TODO - RING0 form of field is obsolete, seems to date back to SI
 * but still works
 */

static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t mec;
	uint32_t pipe;

	mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	lock_srbm(kgd, mec, pipe, 0, 0);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL),
		CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK |
		CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK);

	unlock_srbm(kgd);

	return 0;
}

static uint32_t get_sdma_rlc_reg_offset(struct amdgpu_device *adev,
				unsigned int engine_id,
				unsigned int queue_id)
{
	uint32_t sdma_engine_reg_base[2] = {
		SOC15_REG_OFFSET(SDMA0, 0,
				 mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL,
		/* On gfx10, mmSDMA1_xxx registers are defined NOT based
		 * on SDMA1 base address (dw 0x1860) but based on SDMA0
		 * base address (dw 0x1260). Therefore use mmSDMA0_RLC0_RB_CNTL
		 * instead of mmSDMA1_RLC0_RB_CNTL for the base address calc
		 * below.
		 */
		SOC15_REG_OFFSET(SDMA1, 0,
				 mmSDMA1_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL
	};

	uint32_t retval = sdma_engine_reg_base[engine_id]
		+ queue_id * (mmSDMA0_RLC1_RB_CNTL - mmSDMA0_RLC0_RB_CNTL);

	pr_debug("RLC register offset for SDMA%d RLC%d: 0x%x\n", engine_id,
		 queue_id, retval);

	return retval;
}
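
/* Worked example for get_sdma_rlc_reg_offset() (symbolic, no constants
 * beyond those used above): for engine_id == 1 and queue_id == 2 it returns
 *
 *   (SOC15_REG_OFFSET(SDMA1, 0, mmSDMA1_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL)
 *       + 2 * (mmSDMA0_RLC1_RB_CNTL - mmSDMA0_RLC0_RB_CNTL)
 *
 * i.e. the absolute offset of SDMA1's RLC0 register block plus two per-queue
 * strides. Callers then address any per-queue register as
 * "offset + mmSDMA0_RLC0_<reg>", as the SDMA load/dump/destroy helpers below
 * do.
 */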

#if 0
static uint32_t get_watch_base_addr(struct amdgpu_device *adev)
{
	uint32_t retval = SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) -
			mmTCP_WATCH0_ADDR_H;

	pr_debug("kfd: reg watch base address: 0x%x\n", retval);

	return retval;
}
#endif

static inline struct v10_compute_mqd *get_mqd(void *mqd)
{
	return (struct v10_compute_mqd *)mqd;
}

static inline struct v10_sdma_mqd *get_sdma_mqd(void *mqd)
{
	return (struct v10_sdma_mqd *)mqd;
}

static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
			uint32_t queue_id, uint32_t __user *wptr,
			uint32_t wptr_shift, uint32_t wptr_mask,
			struct mm_struct *mm)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v10_compute_mqd *m;
	uint32_t *mqd_hqd;
	uint32_t reg, hqd_base, data;

	m = get_mqd(mqd);

	pr_debug("Load hqd of pipe %d queue %d\n", pipe_id, queue_id);
	acquire_queue(kgd, pipe_id, queue_id);

	/* HIQ is set during driver init period with vmid set to 0 */
	if (m->cp_hqd_vmid == 0) {
		uint32_t value, mec, pipe;

		mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
		pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

		pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
			mec, pipe, queue_id);
		value = RREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS));
		value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1,
			((mec << 5) | (pipe << 3) | queue_id | 0x80));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS), value);
	}

	/* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM;
	 * restore them up to CP_HQD_PQ_WPTR_HI here, the WPTR itself is
	 * programmed separately below.
	 */
	mqd_hqd = &m->cp_mqd_base_addr_lo;
	hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);

	for (reg = hqd_base;
	     reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
		WREG32(reg, mqd_hqd[reg - hqd_base]);

	/* Activate doorbell logic before triggering WPTR poll. */
	data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control,
			     CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data);

	if (wptr) {
		/* Don't read wptr with get_user because the user
		 * context may not be accessible (if this function
		 * runs in a work queue). Instead trigger a one-shot
		 * polling read from memory in the CP. This assumes
		 * that wptr is GPU-accessible in the queue's VMID via
		 * ATC or SVM. WPTR==RPTR before starting the poll so
		 * the CP starts fetching new commands from the right
		 * place.
		 *
		 * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit
		 * tricky. Assume that the queue didn't overflow. The
		 * number of valid bits in the 32-bit RPTR depends on
		 * the queue size. The remaining bits are taken from
		 * the saved 64-bit WPTR. If the WPTR wrapped, add the
		 * queue size.
		 */
		uint32_t queue_size =
			2 << REG_GET_FIELD(m->cp_hqd_pq_control,
					   CP_HQD_PQ_CONTROL, QUEUE_SIZE);
		uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1);

		if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr)
			guessed_wptr += queue_size;
		guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1);
		guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32;

		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO),
		       lower_32_bits(guessed_wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI),
		       upper_32_bits(guessed_wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR),
		       lower_32_bits((uint64_t)wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI),
		       upper_32_bits((uint64_t)wptr));
		pr_debug("%s setting CP_PQ_WPTR_POLL_CNTL1 to %x\n", __func__,
			 get_queue_mask(adev, pipe_id, queue_id));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1),
		       get_queue_mask(adev, pipe_id, queue_id));
	}

	/* Start the EOP fetcher */
	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR),
	       REG_SET_FIELD(m->cp_hqd_eop_rptr,
			     CP_HQD_EOP_RPTR, INIT_FETCHER, 1));

	data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data);

	release_queue(kgd);

	return 0;
}
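
/* Worked example for the WPTR guess in kgd_hqd_load() (illustrative values
 * only): assume queue_size == 0x10000 dwords, a saved WPTR of 0x25000
 * (cp_hqd_pq_wptr_hi == 0) and a saved RPTR of 0x4100. Then:
 *
 *   guessed_wptr  = 0x4100                      (RPTR low bits)
 *   0x25000 & 0xffff == 0x5000 >= 0x4100        -> no wrap correction
 *   guessed_wptr += 0x25000 & ~0xffff           -> 0x24100
 *
 * Had the saved WPTR been 0x23000, its low bits (0x3000) would be below the
 * RPTR low bits, so queue_size is added first and the guess becomes 0x34100.
 */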

static int kgd_hqd_dump(struct kgd_dev *kgd,
			uint32_t pipe_id, uint32_t queue_id,
			uint32_t (**dump)[2], uint32_t *n_regs)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t i = 0, reg;
#define HQD_N_REGS 56
#define DUMP_REG(addr) do {				\
		if (WARN_ON_ONCE(i >= HQD_N_REGS))	\
			break;				\
		(*dump)[i][0] = (addr) << 2;		\
		(*dump)[i++][1] = RREG32(addr);		\
	} while (0)

	*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
	if (*dump == NULL)
		return -ENOMEM;

	acquire_queue(kgd, pipe_id, queue_id);

	for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);
	     reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
		DUMP_REG(reg);

	release_queue(kgd);

	WARN_ON_ONCE(i != HQD_N_REGS);
	*n_regs = i;

	return 0;
}

static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
			     uint32_t __user *wptr, struct mm_struct *mm)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v10_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	unsigned long end_jiffies;
	uint32_t data;
	uint64_t data64;
	uint64_t __user *wptr64 = (uint64_t __user *)wptr;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
						      m->sdma_queue_id);

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL,
		m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));

	end_jiffies = msecs_to_jiffies(2000) + jiffies;
	while (true) {
		data = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS);
		if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("SDMA RLC not idle in %s\n", __func__);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL_OFFSET,
	       m->sdmax_rlcx_doorbell_offset);

	data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL,
			     ENABLE, 1);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, data);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR,
	       m->sdmax_rlcx_rb_rptr);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI,
	       m->sdmax_rlcx_rb_rptr_hi);

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1);
	if (read_user_wptr(mm, wptr64, data64)) {
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR,
		       lower_32_bits(data64));
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI,
		       upper_32_bits(data64));
	} else {
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR,
		       m->sdmax_rlcx_rb_rptr);
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI,
		       m->sdmax_rlcx_rb_rptr_hi);
	}
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0);

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE_HI,
	       m->sdmax_rlcx_rb_base_hi);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_LO,
	       m->sdmax_rlcx_rb_rptr_addr_lo);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
	       m->sdmax_rlcx_rb_rptr_addr_hi);

	data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL,
			     RB_ENABLE, 1);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, data);

	return 0;
}

static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
			     uint32_t engine_id, uint32_t queue_id,
			     uint32_t (**dump)[2], uint32_t *n_regs)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev,
			engine_id, queue_id);
	uint32_t i = 0, reg;
#undef HQD_N_REGS
#define HQD_N_REGS (19+6+7+10)

	*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
	if (*dump == NULL)
		return -ENOMEM;

	for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN;
	     reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = mmSDMA0_RLC0_MIDCMD_DATA0;
	     reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);

	WARN_ON_ONCE(i != HQD_N_REGS);
	*n_regs = i;

	return 0;
}
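
/* Note on the dump helpers above: every entry emitted through DUMP_REG() is
 * a { byte address, value } pair ((addr) << 2 turns a dword register offset
 * into a byte address), and HQD_N_REGS bounds the kmalloc'ed array. The SDMA
 * variant's (19+6+7+10) count mirrors the four register ranges it walks.
 */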

static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
				uint32_t pipe_id, uint32_t queue_id)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t act;
	bool retval = false;
	uint32_t low, high;

	acquire_queue(kgd, pipe_id, queue_id);
	act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE));
	if (act) {
		low = lower_32_bits(queue_address >> 8);
		high = upper_32_bits(queue_address >> 8);

		if (low == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE)) &&
		    high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI)))
			retval = true;
	}
	release_queue(kgd);
	return retval;
}

static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v10_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	uint32_t sdma_rlc_rb_cntl;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
						      m->sdma_queue_id);

	sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL);

	if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)
		return true;

	return false;
}

static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
				enum kfd_preempt_type reset_type,
				unsigned int utimeout, uint32_t pipe_id,
				uint32_t queue_id)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	enum hqd_dequeue_request_type type;
	unsigned long end_jiffies;
	uint32_t temp;
	struct v10_compute_mqd *m = get_mqd(mqd);

#if 0
	unsigned long flags;
	int retry;
#endif

	acquire_queue(kgd, pipe_id, queue_id);

	if (m->cp_hqd_vmid == 0)
		WREG32_FIELD15(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0);

	switch (reset_type) {
	case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
		type = DRAIN_PIPE;
		break;
	case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
		type = RESET_WAVES;
		break;
	default:
		type = DRAIN_PIPE;
		break;
	}

#if 0 /* Is this still needed? */
	/* Workaround: If IQ timer is active and the wait time is close to or
	 * equal to 0, dequeueing is not safe. Wait until either the wait time
	 * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is
	 * cleared before continuing. Also, ensure wait times are set to at
	 * least 0x3.
	 */
	local_irq_save(flags);
	preempt_disable();
	retry = 5000; /* wait for 500 usecs at maximum */
	while (true) {
		temp = RREG32(mmCP_HQD_IQ_TIMER);
		if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) {
			pr_debug("HW is processing IQ\n");
			goto loop;
		}
		if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) {
			if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE)
					== 3) /* SEM-rearm is safe */
				break;
			/* Wait time 3 is safe for CP, but our MMIO read/write
			 * time is close to 1 microsecond, so check for 10 to
			 * leave more buffer room
			 */
			if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME)
					>= 10)
				break;
			pr_debug("IQ timer is active\n");
		} else
			break;
loop:
		if (!retry) {
			pr_err("CP HQD IQ timer status time out\n");
			break;
		}
		ndelay(100);
		--retry;
	}
	retry = 1000;
	while (true) {
		temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST);
		if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK))
			break;
		pr_debug("Dequeue request is pending\n");

		if (!retry) {
			pr_err("CP HQD dequeue request time out\n");
			break;
		}
		ndelay(100);
		--retry;
	}
	local_irq_restore(flags);
	preempt_enable();
#endif

	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type);

	end_jiffies = (utimeout * HZ / 1000) + jiffies;
	while (true) {
		temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE));
		if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("cp queue preemption time out.\n");
			release_queue(kgd);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	release_queue(kgd);
	return 0;
}

static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
				unsigned int utimeout)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v10_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	uint32_t temp;
	unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
						      m->sdma_queue_id);

	temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL);
	temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK;
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, temp);

	while (true) {
		temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS);
		if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("SDMA RLC not idle in %s\n", __func__);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, 0);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL,
		RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL) |
		SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);

	m->sdmax_rlcx_rb_rptr = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR);
	m->sdmax_rlcx_rb_rptr_hi =
		RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI);

	return 0;
}

static bool get_atc_vmid_pasid_mapping_info(struct kgd_dev *kgd,
					uint8_t vmid, uint16_t *p_pasid)
{
	uint32_t value;
	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;

	value = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING)
		     + vmid);
	*p_pasid = value & ATC_VMID0_PASID_MAPPING__PASID_MASK;

	return !!(value & ATC_VMID0_PASID_MAPPING__VALID_MASK);
}

static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid)
{
	signed long r;
	uint32_t seq;
	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;

	spin_lock(&adev->gfx.kiq.ring_lock);
	amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package */
	amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0));
	amdgpu_ring_write(ring,
			PACKET3_INVALIDATE_TLBS_DST_SEL(1) |
			PACKET3_INVALIDATE_TLBS_PASID(pasid));
	amdgpu_fence_emit_polling(ring, &seq);
	amdgpu_ring_commit(ring);
	spin_unlock(&adev->gfx.kiq.ring_lock);

	r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
	if (r < 1) {
		DRM_ERROR("wait for kiq fence error: %ld.\n", r);
		return -ETIME;
	}

	return 0;
}

static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid)
{
	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
	int vmid;
	uint16_t queried_pasid;
	bool ret;
	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;

	if (amdgpu_emu_mode == 0 && ring->sched.ready)
		return invalidate_tlbs_with_kiq(adev, pasid);

	for (vmid = 0; vmid < 16; vmid++) {
		if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid))
			continue;

		ret = get_atc_vmid_pasid_mapping_info(kgd, vmid,
				&queried_pasid);
		if (ret && queried_pasid == pasid) {
			amdgpu_gmc_flush_gpu_tlb(adev, vmid,
					AMDGPU_GFXHUB_0, 0);
			break;
		}
	}

	return 0;
}

static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid)
{
	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;

	if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) {
		pr_err("non kfd vmid %d\n", vmid);
		return 0;
	}

	amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0);
	return 0;
}
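
/* Note on the TLB helpers above: when the KIQ ring is available (and we are
 * not running in emulation), invalidate_tlbs() hands the flush to the CP via
 * a PACKET3_INVALIDATE_TLBS packet keyed by PASID. Otherwise it scans the 16
 * VMIDs, uses get_atc_vmid_pasid_mapping_info() to find the VMID currently
 * backing that PASID, and flushes it directly through
 * amdgpu_gmc_flush_gpu_tlb() on the GFX hub - which is also what
 * invalidate_tlbs_vmid() does for an already-known VMID.
 */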

static int kgd_address_watch_disable(struct kgd_dev *kgd)
{
	return 0;
}

static int kgd_address_watch_execute(struct kgd_dev *kgd,
					unsigned int watch_point_id,
					uint32_t cntl_val,
					uint32_t addr_hi,
					uint32_t addr_lo)
{
	return 0;
}

static int kgd_wave_control_execute(struct kgd_dev *kgd,
					uint32_t gfx_index_val,
					uint32_t sq_cmd)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t data = 0;

	mutex_lock(&adev->grbm_idx_mutex);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), gfx_index_val);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd);

	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		INSTANCE_BROADCAST_WRITES, 1);
	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		SA_BROADCAST_WRITES, 1);
	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		SE_BROADCAST_WRITES, 1);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data);
	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd,
					unsigned int watch_point_id,
					unsigned int reg_offset)
{
	return 0;
}

static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
		uint64_t page_table_base)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) {
		pr_err("trying to set page table base for wrong VMID %u\n",
		       vmid);
		return;
	}

	/* SDMA is on gfxhub as well for Navi1* series */
	gfxhub_v2_0_setup_vm_pt_regs(adev, vmid, page_table_base);
}

const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
	.program_sh_mem_settings = kgd_program_sh_mem_settings,
	.set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
	.init_interrupts = kgd_init_interrupts,
	.hqd_load = kgd_hqd_load,
	.hqd_sdma_load = kgd_hqd_sdma_load,
	.hqd_dump = kgd_hqd_dump,
	.hqd_sdma_dump = kgd_hqd_sdma_dump,
	.hqd_is_occupied = kgd_hqd_is_occupied,
	.hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
	.hqd_destroy = kgd_hqd_destroy,
	.hqd_sdma_destroy = kgd_hqd_sdma_destroy,
	.address_watch_disable = kgd_address_watch_disable,
	.address_watch_execute = kgd_address_watch_execute,
	.wave_control_execute = kgd_wave_control_execute,
	.address_watch_get_offset = kgd_address_watch_get_offset,
	.get_atc_vmid_pasid_mapping_info =
			get_atc_vmid_pasid_mapping_info,
	.get_tile_config = amdgpu_amdkfd_get_tile_config,
	.set_vm_context_page_table_base = set_vm_context_page_table_base,
	.invalidate_tlbs = invalidate_tlbs,
	.invalidate_tlbs_vmid = invalidate_tlbs_vmid,
	.get_hive_id = amdgpu_amdkfd_get_hive_id,
};
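
/* gfx_v10_kfd2kgd is the per-ASIC dispatch table handed to the KFD driver by
 * the amdgpu/amdkfd glue code: KFD programs compute HQDs, SDMA queues,
 * PASID<->VMID mappings and TLB flushes on Navi1x parts exclusively through
 * these callbacks rather than by touching registers itself.
 */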