1 /* 2 * Copyright 2014-2018 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 */ 22 23 #define pr_fmt(fmt) "kfd2kgd: " fmt 24 25 #include <linux/module.h> 26 #include <linux/fdtable.h> 27 #include <linux/uaccess.h> 28 #include <linux/mmu_context.h> 29 #include <drm/drmP.h> 30 #include "amdgpu.h" 31 #include "amdgpu_amdkfd.h" 32 #include "soc15_hw_ip.h" 33 #include "gc/gc_9_0_offset.h" 34 #include "gc/gc_9_0_sh_mask.h" 35 #include "vega10_enum.h" 36 #include "sdma0/sdma0_4_0_offset.h" 37 #include "sdma0/sdma0_4_0_sh_mask.h" 38 #include "sdma1/sdma1_4_0_offset.h" 39 #include "sdma1/sdma1_4_0_sh_mask.h" 40 #include "athub/athub_1_0_offset.h" 41 #include "athub/athub_1_0_sh_mask.h" 42 #include "oss/osssys_4_0_offset.h" 43 #include "oss/osssys_4_0_sh_mask.h" 44 #include "soc15_common.h" 45 #include "v9_structs.h" 46 #include "soc15.h" 47 #include "soc15d.h" 48 #include "mmhub_v1_0.h" 49 #include "gfxhub_v1_0.h" 50 51 52 #define V9_PIPE_PER_MEC (4) 53 #define V9_QUEUES_PER_PIPE_MEC (8) 54 55 enum hqd_dequeue_request_type { 56 NO_ACTION = 0, 57 DRAIN_PIPE, 58 RESET_WAVES 59 }; 60 61 /* 62 * Register access functions 63 */ 64 65 static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, 66 uint32_t sh_mem_config, 67 uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit, 68 uint32_t sh_mem_bases); 69 static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, 70 unsigned int vmid); 71 static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); 72 static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, 73 uint32_t queue_id, uint32_t __user *wptr, 74 uint32_t wptr_shift, uint32_t wptr_mask, 75 struct mm_struct *mm); 76 static int kgd_hqd_dump(struct kgd_dev *kgd, 77 uint32_t pipe_id, uint32_t queue_id, 78 uint32_t (**dump)[2], uint32_t *n_regs); 79 static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, 80 uint32_t __user *wptr, struct mm_struct *mm); 81 static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, 82 uint32_t engine_id, uint32_t queue_id, 83 uint32_t (**dump)[2], uint32_t *n_regs); 84 static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, 85 uint32_t pipe_id, uint32_t queue_id); 86 static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); 87 static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, 88 enum kfd_preempt_type reset_type, 89 unsigned int utimeout, uint32_t pipe_id, 90 uint32_t queue_id); 91 static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, 92 unsigned int utimeout); 93 static int kgd_address_watch_disable(struct kgd_dev *kgd); 94 static int kgd_address_watch_execute(struct kgd_dev *kgd, 95 unsigned int watch_point_id, 96 uint32_t cntl_val, 97 uint32_t addr_hi, 98 uint32_t addr_lo); 99 static int kgd_wave_control_execute(struct kgd_dev *kgd, 100 uint32_t gfx_index_val, 101 uint32_t sq_cmd); 102 static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, 103 unsigned int watch_point_id, 104 unsigned int reg_offset); 105 106 static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, 107 uint8_t vmid); 108 static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, 109 uint8_t vmid); 110 static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, 111 uint64_t page_table_base); 112 static void set_scratch_backing_va(struct kgd_dev *kgd, 113 uint64_t va, uint32_t vmid); 114 static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); 115 static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid); 116 117 /* Because of REG_GET_FIELD() being used, we put this function in the 118 * asic specific file. 119 */ 120 static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, 121 struct tile_config *config) 122 { 123 struct amdgpu_device *adev = (struct amdgpu_device *)kgd; 124 125 config->gb_addr_config = adev->gfx.config.gb_addr_config; 126 127 config->tile_config_ptr = adev->gfx.config.tile_mode_array; 128 config->num_tile_configs = 129 ARRAY_SIZE(adev->gfx.config.tile_mode_array); 130 config->macro_tile_config_ptr = 131 adev->gfx.config.macrotile_mode_array; 132 config->num_macro_tile_configs = 133 ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); 134 135 return 0; 136 } 137 138 static const struct kfd2kgd_calls kfd2kgd = { 139 .program_sh_mem_settings = kgd_program_sh_mem_settings, 140 .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, 141 .init_interrupts = kgd_init_interrupts, 142 .hqd_load = kgd_hqd_load, 143 .hqd_sdma_load = kgd_hqd_sdma_load, 144 .hqd_dump = kgd_hqd_dump, 145 .hqd_sdma_dump = kgd_hqd_sdma_dump, 146 .hqd_is_occupied = kgd_hqd_is_occupied, 147 .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, 148 .hqd_destroy = kgd_hqd_destroy, 149 .hqd_sdma_destroy = kgd_hqd_sdma_destroy, 150 .address_watch_disable = kgd_address_watch_disable, 151 .address_watch_execute = kgd_address_watch_execute, 152 .wave_control_execute = kgd_wave_control_execute, 153 .address_watch_get_offset = kgd_address_watch_get_offset, 154 .get_atc_vmid_pasid_mapping_pasid = 155 get_atc_vmid_pasid_mapping_pasid, 156 .get_atc_vmid_pasid_mapping_valid = 157 get_atc_vmid_pasid_mapping_valid, 158 .set_scratch_backing_va = set_scratch_backing_va, 159 .get_tile_config = amdgpu_amdkfd_get_tile_config, 160 .set_vm_context_page_table_base = set_vm_context_page_table_base, 161 .invalidate_tlbs = invalidate_tlbs, 162 .invalidate_tlbs_vmid = invalidate_tlbs_vmid, 163 .get_hive_id = amdgpu_amdkfd_get_hive_id, 164 }; 165 166 struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void) 167 { 168 return (struct kfd2kgd_calls *)&kfd2kgd; 169 } 170 171 static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) 172 { 173 return (struct amdgpu_device *)kgd; 174 } 175 176 static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe, 177 uint32_t queue, uint32_t vmid) 178 { 179 struct amdgpu_device *adev = get_amdgpu_device(kgd); 180 181 mutex_lock(&adev->srbm_mutex); 182 soc15_grbm_select(adev, mec, pipe, queue, vmid); 183 } 184 185 static void unlock_srbm(struct kgd_dev *kgd) 186 { 187 struct amdgpu_device *adev = get_amdgpu_device(kgd); 188 189 soc15_grbm_select(adev, 0, 0, 0, 0); 190 mutex_unlock(&adev->srbm_mutex); 191 } 192 193 static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, 194 uint32_t queue_id) 195 { 196 struct amdgpu_device *adev = get_amdgpu_device(kgd); 197 198 uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; 199 uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); 200 201 lock_srbm(kgd, mec, pipe, queue_id, 0); 202 } 203 204 static uint32_t get_queue_mask(struct amdgpu_device *adev, 205 uint32_t pipe_id, uint32_t queue_id) 206 { 207 unsigned int bit = (pipe_id * adev->gfx.mec.num_queue_per_pipe + 208 queue_id) & 31; 209 210 return ((uint32_t)1) << bit; 211 } 212 213 static void release_queue(struct kgd_dev *kgd) 214 { 215 unlock_srbm(kgd); 216 } 217 218 static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, 219 uint32_t sh_mem_config, 220 uint32_t sh_mem_ape1_base, 221 uint32_t sh_mem_ape1_limit, 222 uint32_t sh_mem_bases) 223 { 224 struct amdgpu_device *adev = get_amdgpu_device(kgd); 225 226 lock_srbm(kgd, 0, 0, 0, vmid); 227 228 WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config); 229 WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases); 230 /* APE1 no longer exists on GFX9 */ 231 232 unlock_srbm(kgd); 233 } 234 235 static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, 236 unsigned int vmid) 237 { 238 struct amdgpu_device *adev = get_amdgpu_device(kgd); 239 240 /* 241 * We have to assume that there is no outstanding mapping. 242 * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because 243 * a mapping is in progress or because a mapping finished 244 * and the SW cleared it. 245 * So the protocol is to always wait & clear. 246 */ 247 uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | 248 ATC_VMID0_PASID_MAPPING__VALID_MASK; 249 250 /* 251 * need to do this twice, once for gfx and once for mmhub 252 * for ATC add 16 to VMID for mmhub, for IH different registers. 253 * ATC_VMID0..15 registers are separate from ATC_VMID16..31. 254 */ 255 256 WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid, 257 pasid_mapping); 258 259 while (!(RREG32(SOC15_REG_OFFSET( 260 ATHUB, 0, 261 mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & 262 (1U << vmid))) 263 cpu_relax(); 264 265 WREG32(SOC15_REG_OFFSET(ATHUB, 0, 266 mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), 267 1U << vmid); 268 269 /* Mapping vmid to pasid also for IH block */ 270 WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid, 271 pasid_mapping); 272 273 WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid, 274 pasid_mapping); 275 276 while (!(RREG32(SOC15_REG_OFFSET( 277 ATHUB, 0, 278 mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & 279 (1U << (vmid + 16)))) 280 cpu_relax(); 281 282 WREG32(SOC15_REG_OFFSET(ATHUB, 0, 283 mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), 284 1U << (vmid + 16)); 285 286 /* Mapping vmid to pasid also for IH block */ 287 WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid, 288 pasid_mapping); 289 return 0; 290 } 291 292 /* TODO - RING0 form of field is obsolete, seems to date back to SI 293 * but still works 294 */ 295 296 static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) 297 { 298 struct amdgpu_device *adev = get_amdgpu_device(kgd); 299 uint32_t mec; 300 uint32_t pipe; 301 302 mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; 303 pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); 304 305 lock_srbm(kgd, mec, pipe, 0, 0); 306 307 WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL), 308 CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | 309 CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); 310 311 unlock_srbm(kgd); 312 313 return 0; 314 } 315 316 static uint32_t get_sdma_base_addr(struct amdgpu_device *adev, 317 unsigned int engine_id, 318 unsigned int queue_id) 319 { 320 uint32_t base[2] = { 321 SOC15_REG_OFFSET(SDMA0, 0, 322 mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL, 323 SOC15_REG_OFFSET(SDMA1, 0, 324 mmSDMA1_RLC0_RB_CNTL) - mmSDMA1_RLC0_RB_CNTL 325 }; 326 uint32_t retval; 327 328 retval = base[engine_id] + queue_id * (mmSDMA0_RLC1_RB_CNTL - 329 mmSDMA0_RLC0_RB_CNTL); 330 331 pr_debug("sdma base address: 0x%x\n", retval); 332 333 return retval; 334 } 335 336 static inline struct v9_mqd *get_mqd(void *mqd) 337 { 338 return (struct v9_mqd *)mqd; 339 } 340 341 static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) 342 { 343 return (struct v9_sdma_mqd *)mqd; 344 } 345 346 static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, 347 uint32_t queue_id, uint32_t __user *wptr, 348 uint32_t wptr_shift, uint32_t wptr_mask, 349 struct mm_struct *mm) 350 { 351 struct amdgpu_device *adev = get_amdgpu_device(kgd); 352 struct v9_mqd *m; 353 uint32_t *mqd_hqd; 354 uint32_t reg, hqd_base, data; 355 356 m = get_mqd(mqd); 357 358 acquire_queue(kgd, pipe_id, queue_id); 359 360 /* HIQ is set during driver init period with vmid set to 0*/ 361 if (m->cp_hqd_vmid == 0) { 362 uint32_t value, mec, pipe; 363 364 mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; 365 pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); 366 367 pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n", 368 mec, pipe, queue_id); 369 value = RREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS)); 370 value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1, 371 ((mec << 5) | (pipe << 3) | queue_id | 0x80)); 372 WREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS), value); 373 } 374 375 /* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */ 376 mqd_hqd = &m->cp_mqd_base_addr_lo; 377 hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); 378 379 for (reg = hqd_base; 380 reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) 381 WREG32(reg, mqd_hqd[reg - hqd_base]); 382 383 384 /* Activate doorbell logic before triggering WPTR poll. */ 385 data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control, 386 CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); 387 WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data); 388 389 if (wptr) { 390 /* Don't read wptr with get_user because the user 391 * context may not be accessible (if this function 392 * runs in a work queue). Instead trigger a one-shot 393 * polling read from memory in the CP. This assumes 394 * that wptr is GPU-accessible in the queue's VMID via 395 * ATC or SVM. WPTR==RPTR before starting the poll so 396 * the CP starts fetching new commands from the right 397 * place. 398 * 399 * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit 400 * tricky. Assume that the queue didn't overflow. The 401 * number of valid bits in the 32-bit RPTR depends on 402 * the queue size. The remaining bits are taken from 403 * the saved 64-bit WPTR. If the WPTR wrapped, add the 404 * queue size. 405 */ 406 uint32_t queue_size = 407 2 << REG_GET_FIELD(m->cp_hqd_pq_control, 408 CP_HQD_PQ_CONTROL, QUEUE_SIZE); 409 uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1); 410 411 if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr) 412 guessed_wptr += queue_size; 413 guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1); 414 guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32; 415 416 WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO), 417 lower_32_bits(guessed_wptr)); 418 WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI), 419 upper_32_bits(guessed_wptr)); 420 WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR), 421 lower_32_bits((uintptr_t)wptr)); 422 WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI), 423 upper_32_bits((uintptr_t)wptr)); 424 WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1), 425 get_queue_mask(adev, pipe_id, queue_id)); 426 } 427 428 /* Start the EOP fetcher */ 429 WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR), 430 REG_SET_FIELD(m->cp_hqd_eop_rptr, 431 CP_HQD_EOP_RPTR, INIT_FETCHER, 1)); 432 433 data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); 434 WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data); 435 436 release_queue(kgd); 437 438 return 0; 439 } 440 441 static int kgd_hqd_dump(struct kgd_dev *kgd, 442 uint32_t pipe_id, uint32_t queue_id, 443 uint32_t (**dump)[2], uint32_t *n_regs) 444 { 445 struct amdgpu_device *adev = get_amdgpu_device(kgd); 446 uint32_t i = 0, reg; 447 #define HQD_N_REGS 56 448 #define DUMP_REG(addr) do { \ 449 if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ 450 break; \ 451 (*dump)[i][0] = (addr) << 2; \ 452 (*dump)[i++][1] = RREG32(addr); \ 453 } while (0) 454 455 *dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL); 456 if (*dump == NULL) 457 return -ENOMEM; 458 459 acquire_queue(kgd, pipe_id, queue_id); 460 461 for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); 462 reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) 463 DUMP_REG(reg); 464 465 release_queue(kgd); 466 467 WARN_ON_ONCE(i != HQD_N_REGS); 468 *n_regs = i; 469 470 return 0; 471 } 472 473 static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, 474 uint32_t __user *wptr, struct mm_struct *mm) 475 { 476 struct amdgpu_device *adev = get_amdgpu_device(kgd); 477 struct v9_sdma_mqd *m; 478 uint32_t sdma_base_addr, sdmax_gfx_context_cntl; 479 unsigned long end_jiffies; 480 uint32_t data; 481 uint64_t data64; 482 uint64_t __user *wptr64 = (uint64_t __user *)wptr; 483 484 m = get_sdma_mqd(mqd); 485 sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id, 486 m->sdma_queue_id); 487 sdmax_gfx_context_cntl = m->sdma_engine_id ? 488 SOC15_REG_OFFSET(SDMA1, 0, mmSDMA1_GFX_CONTEXT_CNTL) : 489 SOC15_REG_OFFSET(SDMA0, 0, mmSDMA0_GFX_CONTEXT_CNTL); 490 491 WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, 492 m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); 493 494 end_jiffies = msecs_to_jiffies(2000) + jiffies; 495 while (true) { 496 data = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); 497 if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) 498 break; 499 if (time_after(jiffies, end_jiffies)) 500 return -ETIME; 501 usleep_range(500, 1000); 502 } 503 data = RREG32(sdmax_gfx_context_cntl); 504 data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, 505 RESUME_CTX, 0); 506 WREG32(sdmax_gfx_context_cntl, data); 507 508 WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL_OFFSET, 509 m->sdmax_rlcx_doorbell_offset); 510 511 data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, 512 ENABLE, 1); 513 WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); 514 WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr); 515 WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI, 516 m->sdmax_rlcx_rb_rptr_hi); 517 518 WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1); 519 if (read_user_wptr(mm, wptr64, data64)) { 520 WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 521 lower_32_bits(data64)); 522 WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, 523 upper_32_bits(data64)); 524 } else { 525 WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 526 m->sdmax_rlcx_rb_rptr); 527 WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, 528 m->sdmax_rlcx_rb_rptr_hi); 529 } 530 WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0); 531 532 WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); 533 WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, 534 m->sdmax_rlcx_rb_base_hi); 535 WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, 536 m->sdmax_rlcx_rb_rptr_addr_lo); 537 WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, 538 m->sdmax_rlcx_rb_rptr_addr_hi); 539 540 data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, 541 RB_ENABLE, 1); 542 WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); 543 544 return 0; 545 } 546 547 static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, 548 uint32_t engine_id, uint32_t queue_id, 549 uint32_t (**dump)[2], uint32_t *n_regs) 550 { 551 struct amdgpu_device *adev = get_amdgpu_device(kgd); 552 uint32_t sdma_base_addr = get_sdma_base_addr(adev, engine_id, queue_id); 553 uint32_t i = 0, reg; 554 #undef HQD_N_REGS 555 #define HQD_N_REGS (19+6+7+10) 556 557 *dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL); 558 if (*dump == NULL) 559 return -ENOMEM; 560 561 for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) 562 DUMP_REG(sdma_base_addr + reg); 563 for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++) 564 DUMP_REG(sdma_base_addr + reg); 565 for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; 566 reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++) 567 DUMP_REG(sdma_base_addr + reg); 568 for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; 569 reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++) 570 DUMP_REG(sdma_base_addr + reg); 571 572 WARN_ON_ONCE(i != HQD_N_REGS); 573 *n_regs = i; 574 575 return 0; 576 } 577 578 static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, 579 uint32_t pipe_id, uint32_t queue_id) 580 { 581 struct amdgpu_device *adev = get_amdgpu_device(kgd); 582 uint32_t act; 583 bool retval = false; 584 uint32_t low, high; 585 586 acquire_queue(kgd, pipe_id, queue_id); 587 act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); 588 if (act) { 589 low = lower_32_bits(queue_address >> 8); 590 high = upper_32_bits(queue_address >> 8); 591 592 if (low == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE)) && 593 high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI))) 594 retval = true; 595 } 596 release_queue(kgd); 597 return retval; 598 } 599 600 static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) 601 { 602 struct amdgpu_device *adev = get_amdgpu_device(kgd); 603 struct v9_sdma_mqd *m; 604 uint32_t sdma_base_addr; 605 uint32_t sdma_rlc_rb_cntl; 606 607 m = get_sdma_mqd(mqd); 608 sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id, 609 m->sdma_queue_id); 610 611 sdma_rlc_rb_cntl = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); 612 613 if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK) 614 return true; 615 616 return false; 617 } 618 619 static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, 620 enum kfd_preempt_type reset_type, 621 unsigned int utimeout, uint32_t pipe_id, 622 uint32_t queue_id) 623 { 624 struct amdgpu_device *adev = get_amdgpu_device(kgd); 625 enum hqd_dequeue_request_type type; 626 unsigned long end_jiffies; 627 uint32_t temp; 628 struct v9_mqd *m = get_mqd(mqd); 629 630 if (adev->in_gpu_reset) 631 return -EIO; 632 633 acquire_queue(kgd, pipe_id, queue_id); 634 635 if (m->cp_hqd_vmid == 0) 636 WREG32_FIELD15(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0); 637 638 switch (reset_type) { 639 case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: 640 type = DRAIN_PIPE; 641 break; 642 case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: 643 type = RESET_WAVES; 644 break; 645 default: 646 type = DRAIN_PIPE; 647 break; 648 } 649 650 WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type); 651 652 end_jiffies = (utimeout * HZ / 1000) + jiffies; 653 while (true) { 654 temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); 655 if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) 656 break; 657 if (time_after(jiffies, end_jiffies)) { 658 pr_err("cp queue preemption time out.\n"); 659 release_queue(kgd); 660 return -ETIME; 661 } 662 usleep_range(500, 1000); 663 } 664 665 release_queue(kgd); 666 return 0; 667 } 668 669 static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, 670 unsigned int utimeout) 671 { 672 struct amdgpu_device *adev = get_amdgpu_device(kgd); 673 struct v9_sdma_mqd *m; 674 uint32_t sdma_base_addr; 675 uint32_t temp; 676 unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; 677 678 m = get_sdma_mqd(mqd); 679 sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id, 680 m->sdma_queue_id); 681 682 temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); 683 temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK; 684 WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, temp); 685 686 while (true) { 687 temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); 688 if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) 689 break; 690 if (time_after(jiffies, end_jiffies)) 691 return -ETIME; 692 usleep_range(500, 1000); 693 } 694 695 WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); 696 WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, 697 RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | 698 SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); 699 700 m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); 701 m->sdmax_rlcx_rb_rptr_hi = 702 RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI); 703 704 return 0; 705 } 706 707 static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, 708 uint8_t vmid) 709 { 710 uint32_t reg; 711 struct amdgpu_device *adev = (struct amdgpu_device *) kgd; 712 713 reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) 714 + vmid); 715 return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; 716 } 717 718 static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, 719 uint8_t vmid) 720 { 721 uint32_t reg; 722 struct amdgpu_device *adev = (struct amdgpu_device *) kgd; 723 724 reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) 725 + vmid); 726 return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; 727 } 728 729 static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) 730 { 731 struct amdgpu_device *adev = (struct amdgpu_device *) kgd; 732 733 /* Use legacy mode tlb invalidation. 734 * 735 * Currently on Raven the code below is broken for anything but 736 * legacy mode due to a MMHUB power gating problem. A workaround 737 * is for MMHUB to wait until the condition PER_VMID_INVALIDATE_REQ 738 * == PER_VMID_INVALIDATE_ACK instead of simply waiting for the ack 739 * bit. 740 * 741 * TODO 1: agree on the right set of invalidation registers for 742 * KFD use. Use the last one for now. Invalidate both GC and 743 * MMHUB. 744 * 745 * TODO 2: support range-based invalidation, requires kfg2kgd 746 * interface change 747 */ 748 amdgpu_gmc_flush_gpu_tlb(adev, vmid, 0); 749 } 750 751 static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid) 752 { 753 signed long r; 754 uint32_t seq; 755 struct amdgpu_ring *ring = &adev->gfx.kiq.ring; 756 757 spin_lock(&adev->gfx.kiq.ring_lock); 758 amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package*/ 759 amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0)); 760 amdgpu_ring_write(ring, 761 PACKET3_INVALIDATE_TLBS_DST_SEL(1) | 762 PACKET3_INVALIDATE_TLBS_ALL_HUB(1) | 763 PACKET3_INVALIDATE_TLBS_PASID(pasid) | 764 PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(0)); /* legacy */ 765 amdgpu_fence_emit_polling(ring, &seq); 766 amdgpu_ring_commit(ring); 767 spin_unlock(&adev->gfx.kiq.ring_lock); 768 769 r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout); 770 if (r < 1) { 771 DRM_ERROR("wait for kiq fence error: %ld.\n", r); 772 return -ETIME; 773 } 774 775 return 0; 776 } 777 778 static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid) 779 { 780 struct amdgpu_device *adev = (struct amdgpu_device *) kgd; 781 int vmid; 782 struct amdgpu_ring *ring = &adev->gfx.kiq.ring; 783 784 if (adev->in_gpu_reset) 785 return -EIO; 786 787 if (ring->sched.ready) 788 return invalidate_tlbs_with_kiq(adev, pasid); 789 790 for (vmid = 0; vmid < 16; vmid++) { 791 if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) 792 continue; 793 if (get_atc_vmid_pasid_mapping_valid(kgd, vmid)) { 794 if (get_atc_vmid_pasid_mapping_pasid(kgd, vmid) 795 == pasid) { 796 write_vmid_invalidate_request(kgd, vmid); 797 break; 798 } 799 } 800 } 801 802 return 0; 803 } 804 805 static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid) 806 { 807 struct amdgpu_device *adev = (struct amdgpu_device *) kgd; 808 809 if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) { 810 pr_err("non kfd vmid %d\n", vmid); 811 return 0; 812 } 813 814 write_vmid_invalidate_request(kgd, vmid); 815 return 0; 816 } 817 818 static int kgd_address_watch_disable(struct kgd_dev *kgd) 819 { 820 return 0; 821 } 822 823 static int kgd_address_watch_execute(struct kgd_dev *kgd, 824 unsigned int watch_point_id, 825 uint32_t cntl_val, 826 uint32_t addr_hi, 827 uint32_t addr_lo) 828 { 829 return 0; 830 } 831 832 static int kgd_wave_control_execute(struct kgd_dev *kgd, 833 uint32_t gfx_index_val, 834 uint32_t sq_cmd) 835 { 836 struct amdgpu_device *adev = get_amdgpu_device(kgd); 837 uint32_t data = 0; 838 839 mutex_lock(&adev->grbm_idx_mutex); 840 841 WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), gfx_index_val); 842 WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd); 843 844 data = REG_SET_FIELD(data, GRBM_GFX_INDEX, 845 INSTANCE_BROADCAST_WRITES, 1); 846 data = REG_SET_FIELD(data, GRBM_GFX_INDEX, 847 SH_BROADCAST_WRITES, 1); 848 data = REG_SET_FIELD(data, GRBM_GFX_INDEX, 849 SE_BROADCAST_WRITES, 1); 850 851 WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data); 852 mutex_unlock(&adev->grbm_idx_mutex); 853 854 return 0; 855 } 856 857 static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, 858 unsigned int watch_point_id, 859 unsigned int reg_offset) 860 { 861 return 0; 862 } 863 864 static void set_scratch_backing_va(struct kgd_dev *kgd, 865 uint64_t va, uint32_t vmid) 866 { 867 /* No longer needed on GFXv9. The scratch base address is 868 * passed to the shader by the CP. It's the user mode driver's 869 * responsibility. 870 */ 871 } 872 873 static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, 874 uint64_t page_table_base) 875 { 876 struct amdgpu_device *adev = get_amdgpu_device(kgd); 877 878 if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) { 879 pr_err("trying to set page table base for wrong VMID %u\n", 880 vmid); 881 return; 882 } 883 884 /* TODO: take advantage of per-process address space size. For 885 * now, all processes share the same address space size, like 886 * on GFX8 and older. 887 */ 888 mmhub_v1_0_setup_vm_pt_regs(adev, vmid, page_table_base); 889 890 gfxhub_v1_0_setup_vm_pt_regs(adev, vmid, page_table_base); 891 } 892