/*
 * Copyright 2014-2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#define pr_fmt(fmt) "kfd2kgd: " fmt

#include <linux/module.h>
#include <linux/fdtable.h>
#include <linux/uaccess.h>
#include <linux/firmware.h>
#include <drm/drmP.h>
#include "amdgpu.h"
#include "amdgpu_amdkfd.h"
#include "amdgpu_ucode.h"
#include "soc15_hw_ip.h"
#include "gc/gc_9_0_offset.h"
#include "gc/gc_9_0_sh_mask.h"
#include "vega10_enum.h"
#include "sdma0/sdma0_4_0_offset.h"
#include "sdma0/sdma0_4_0_sh_mask.h"
#include "sdma1/sdma1_4_0_offset.h"
#include "sdma1/sdma1_4_0_sh_mask.h"
#include "athub/athub_1_0_offset.h"
#include "athub/athub_1_0_sh_mask.h"
#include "oss/osssys_4_0_offset.h"
#include "oss/osssys_4_0_sh_mask.h"
#include "soc15_common.h"
#include "v9_structs.h"
#include "soc15.h"
#include "soc15d.h"
#include "mmhub_v1_0.h"
#include "gfxhub_v1_0.h"


#define V9_PIPE_PER_MEC		(4)
#define V9_QUEUES_PER_PIPE_MEC	(8)

enum hqd_dequeue_request_type {
	NO_ACTION = 0,
	DRAIN_PIPE,
	RESET_WAVES
};

/*
 * Register access functions
 */

static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
		uint32_t sh_mem_config,
		uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit,
		uint32_t sh_mem_bases);
static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
		unsigned int vmid);
static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
			uint32_t queue_id, uint32_t __user *wptr,
			uint32_t wptr_shift, uint32_t wptr_mask,
			struct mm_struct *mm);
static int kgd_hqd_dump(struct kgd_dev *kgd,
			uint32_t pipe_id, uint32_t queue_id,
			uint32_t (**dump)[2], uint32_t *n_regs);
static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
			     uint32_t __user *wptr, struct mm_struct *mm);
static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
			     uint32_t engine_id, uint32_t queue_id,
			     uint32_t (**dump)[2], uint32_t *n_regs);
static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
		uint32_t pipe_id, uint32_t queue_id);
static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd);
static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
				enum kfd_preempt_type reset_type,
				unsigned int utimeout, uint32_t pipe_id,
				uint32_t queue_id);
static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
				unsigned int utimeout);
static int kgd_address_watch_disable(struct kgd_dev *kgd);
static int kgd_address_watch_execute(struct kgd_dev *kgd,
					unsigned int watch_point_id,
					uint32_t cntl_val,
					uint32_t addr_hi,
					uint32_t addr_lo);
static int kgd_wave_control_execute(struct kgd_dev *kgd,
					uint32_t gfx_index_val,
					uint32_t sq_cmd);
static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd,
					unsigned int watch_point_id,
					unsigned int reg_offset);

static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd,
		uint8_t vmid);
static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
		uint8_t vmid);
static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
		uint64_t page_table_base);
static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type);
static void set_scratch_backing_va(struct kgd_dev *kgd,
					uint64_t va, uint32_t vmid);
static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid);
static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid);

/* Because REG_GET_FIELD() is used, this function is kept in the
 * ASIC-specific file.
 */
static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd,
		struct tile_config *config)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;

	config->gb_addr_config = adev->gfx.config.gb_addr_config;

	config->tile_config_ptr = adev->gfx.config.tile_mode_array;
	config->num_tile_configs =
			ARRAY_SIZE(adev->gfx.config.tile_mode_array);
	config->macro_tile_config_ptr =
			adev->gfx.config.macrotile_mode_array;
	config->num_macro_tile_configs =
			ARRAY_SIZE(adev->gfx.config.macrotile_mode_array);

	return 0;
}

static const struct kfd2kgd_calls kfd2kgd = {
	.program_sh_mem_settings = kgd_program_sh_mem_settings,
	.set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
	.init_interrupts = kgd_init_interrupts,
	.hqd_load = kgd_hqd_load,
	.hqd_sdma_load = kgd_hqd_sdma_load,
	.hqd_dump = kgd_hqd_dump,
	.hqd_sdma_dump = kgd_hqd_sdma_dump,
	.hqd_is_occupied = kgd_hqd_is_occupied,
	.hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
	.hqd_destroy = kgd_hqd_destroy,
	.hqd_sdma_destroy = kgd_hqd_sdma_destroy,
	.address_watch_disable = kgd_address_watch_disable,
	.address_watch_execute = kgd_address_watch_execute,
	.wave_control_execute = kgd_wave_control_execute,
	.address_watch_get_offset = kgd_address_watch_get_offset,
	.get_atc_vmid_pasid_mapping_pasid =
			get_atc_vmid_pasid_mapping_pasid,
	.get_atc_vmid_pasid_mapping_valid =
			get_atc_vmid_pasid_mapping_valid,
	.get_fw_version = get_fw_version,
	.set_scratch_backing_va = set_scratch_backing_va,
	.get_tile_config = amdgpu_amdkfd_get_tile_config,
	.set_vm_context_page_table_base = set_vm_context_page_table_base,
	.invalidate_tlbs = invalidate_tlbs,
	.invalidate_tlbs_vmid = invalidate_tlbs_vmid,
	.get_hive_id = amdgpu_amdkfd_get_hive_id,
};

struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void)
{
	return (struct kfd2kgd_calls *)&kfd2kgd;
}

static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
{
	return (struct amdgpu_device *)kgd;
}

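/* lock_srbm()/unlock_srbm() bracket access to queue-indexed registers:
 * they take adev->srbm_mutex and use GRBM to select the given
 * MEC/pipe/queue/VMID until the matching unlock_srbm() call.
 */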
static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe,
			uint32_t queue, uint32_t vmid)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	mutex_lock(&adev->srbm_mutex);
	soc15_grbm_select(adev, mec, pipe, queue, vmid);
}

static void unlock_srbm(struct kgd_dev *kgd)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	soc15_grbm_select(adev, 0, 0, 0, 0);
	mutex_unlock(&adev->srbm_mutex);
}

static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id,
				uint32_t queue_id)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	lock_srbm(kgd, mec, pipe, queue_id, 0);
}

static uint32_t get_queue_mask(struct amdgpu_device *adev,
			       uint32_t pipe_id, uint32_t queue_id)
{
	unsigned int bit = (pipe_id * adev->gfx.mec.num_queue_per_pipe +
			    queue_id) & 31;

	return ((uint32_t)1) << bit;
}

static void release_queue(struct kgd_dev *kgd)
{
	unlock_srbm(kgd);
}

static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
					uint32_t sh_mem_config,
					uint32_t sh_mem_ape1_base,
					uint32_t sh_mem_ape1_limit,
					uint32_t sh_mem_bases)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	lock_srbm(kgd, 0, 0, 0, vmid);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases);
	/* APE1 no longer exists on GFX9 */

	unlock_srbm(kgd);
}

static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
					unsigned int vmid)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	/*
	 * We have to assume that there is no outstanding mapping.
	 * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because
	 * a mapping is in progress or because a mapping finished
	 * and the SW cleared it.
	 * So the protocol is to always wait & clear.
	 */
	uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid |
			ATC_VMID0_PASID_MAPPING__VALID_MASK;

	/*
	 * This has to be done twice, once for GFX and once for MMHUB:
	 * for the ATC, the MMHUB copy uses VMID + 16; the IH block uses
	 * different registers for the two. ATC_VMID0..15 registers are
	 * separate from ATC_VMID16..31.
	 */

	WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid,
	       pasid_mapping);

	while (!(RREG32(SOC15_REG_OFFSET(
				ATHUB, 0,
				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
		 (1U << vmid)))
		cpu_relax();

	WREG32(SOC15_REG_OFFSET(ATHUB, 0,
				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS),
	       1U << vmid);

	/* Mapping vmid to pasid also for IH block */
	WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid,
	       pasid_mapping);

	WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid,
	       pasid_mapping);

	while (!(RREG32(SOC15_REG_OFFSET(
				ATHUB, 0,
				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
		 (1U << (vmid + 16))))
		cpu_relax();

	WREG32(SOC15_REG_OFFSET(ATHUB, 0,
				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS),
	       1U << (vmid + 16));

	/* Mapping vmid to pasid also for IH block */
	WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid,
	       pasid_mapping);
	return 0;
}

/* TODO - RING0 form of field is obsolete, seems to date back to SI
 * but still works
 */

static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t mec;
	uint32_t pipe;

	mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	lock_srbm(kgd, mec, pipe, 0, 0);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL),
		CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK |
		CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK);

	unlock_srbm(kgd);

	return 0;
}

static uint32_t get_sdma_base_addr(struct amdgpu_device *adev,
				unsigned int engine_id,
				unsigned int queue_id)
{
	uint32_t base[2] = {
		SOC15_REG_OFFSET(SDMA0, 0,
				 mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL,
		SOC15_REG_OFFSET(SDMA1, 0,
				 mmSDMA1_RLC0_RB_CNTL) - mmSDMA1_RLC0_RB_CNTL
	};
	uint32_t retval;

	retval = base[engine_id] + queue_id * (mmSDMA0_RLC1_RB_CNTL -
					       mmSDMA0_RLC0_RB_CNTL);

	pr_debug("sdma base address: 0x%x\n", retval);

	return retval;
}

static inline struct v9_mqd *get_mqd(void *mqd)
{
	return (struct v9_mqd *)mqd;
}

static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd)
{
	return (struct v9_sdma_mqd *)mqd;
}

static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
			uint32_t queue_id, uint32_t __user *wptr,
			uint32_t wptr_shift, uint32_t wptr_mask,
			struct mm_struct *mm)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v9_mqd *m;
	uint32_t *mqd_hqd;
	uint32_t reg, hqd_base, data;

	m = get_mqd(mqd);

	acquire_queue(kgd, pipe_id, queue_id);

	/* HIQ is set during driver init period with vmid set to 0 */
	if (m->cp_hqd_vmid == 0) {
		uint32_t value, mec, pipe;

		mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
		pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

		pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
			mec, pipe, queue_id);
		value = RREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS));
		value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1,
			((mec << 5) | (pipe << 3) | queue_id | 0x80));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS), value);
	}

	/* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */
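	/* The v9_mqd fields starting at cp_mqd_base_addr_lo are laid out in
	 * the same order as that register range, so the loop below can copy
	 * them into the HQD one register at a time.
	 */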
	mqd_hqd = &m->cp_mqd_base_addr_lo;
	hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);

	for (reg = hqd_base;
	     reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
		WREG32(reg, mqd_hqd[reg - hqd_base]);


	/* Activate doorbell logic before triggering WPTR poll. */
	data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control,
			     CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data);

	if (wptr) {
		/* Don't read wptr with get_user because the user
		 * context may not be accessible (if this function
		 * runs in a work queue). Instead trigger a one-shot
		 * polling read from memory in the CP. This assumes
		 * that wptr is GPU-accessible in the queue's VMID via
		 * ATC or SVM. WPTR==RPTR before starting the poll so
		 * the CP starts fetching new commands from the right
		 * place.
		 *
		 * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit
		 * tricky. Assume that the queue didn't overflow. The
		 * number of valid bits in the 32-bit RPTR depends on
		 * the queue size. The remaining bits are taken from
		 * the saved 64-bit WPTR. If the WPTR wrapped, add the
		 * queue size.
		 */
		uint32_t queue_size =
			2 << REG_GET_FIELD(m->cp_hqd_pq_control,
					   CP_HQD_PQ_CONTROL, QUEUE_SIZE);
		uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1);

		if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr)
			guessed_wptr += queue_size;
		guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1);
		guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32;

		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO),
		       lower_32_bits(guessed_wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI),
		       upper_32_bits(guessed_wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR),
		       lower_32_bits((uintptr_t)wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI),
		       upper_32_bits((uintptr_t)wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1),
		       get_queue_mask(adev, pipe_id, queue_id));
	}

	/* Start the EOP fetcher */
	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR),
	       REG_SET_FIELD(m->cp_hqd_eop_rptr,
			     CP_HQD_EOP_RPTR, INIT_FETCHER, 1));

	data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data);

	release_queue(kgd);

	return 0;
}

static int kgd_hqd_dump(struct kgd_dev *kgd,
			uint32_t pipe_id, uint32_t queue_id,
			uint32_t (**dump)[2], uint32_t *n_regs)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t i = 0, reg;
#define HQD_N_REGS 56
#define DUMP_REG(addr) do {				\
		if (WARN_ON_ONCE(i >= HQD_N_REGS))	\
			break;				\
		(*dump)[i][0] = (addr) << 2;		\
		(*dump)[i++][1] = RREG32(addr);		\
	} while (0)

	*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
	if (*dump == NULL)
		return -ENOMEM;

	acquire_queue(kgd, pipe_id, queue_id);

	for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);
	     reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
		DUMP_REG(reg);

	release_queue(kgd);

	WARN_ON_ONCE(i != HQD_N_REGS);
	*n_regs = i;

	return 0;
}

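/* Load an SDMA RLC queue from its MQD: disable the ring buffer, wait for
 * the engine context to go idle, program the doorbell and ring pointers
 * from the MQD (and the user-space write pointer, if it can be read),
 * then re-enable the ring buffer.
 */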
static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
			     uint32_t __user *wptr, struct mm_struct *mm)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v9_sdma_mqd *m;
	uint32_t sdma_base_addr, sdmax_gfx_context_cntl;
	unsigned long end_jiffies;
	uint32_t data;
	uint64_t data64;
	uint64_t __user *wptr64 = (uint64_t __user *)wptr;

	m = get_sdma_mqd(mqd);
	sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id,
					    m->sdma_queue_id);
	sdmax_gfx_context_cntl = m->sdma_engine_id ?
		SOC15_REG_OFFSET(SDMA1, 0, mmSDMA1_GFX_CONTEXT_CNTL) :
		SOC15_REG_OFFSET(SDMA0, 0, mmSDMA0_GFX_CONTEXT_CNTL);

	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
		m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));

	end_jiffies = msecs_to_jiffies(2000) + jiffies;
	while (true) {
		data = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
		if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies))
			return -ETIME;
		usleep_range(500, 1000);
	}
	data = RREG32(sdmax_gfx_context_cntl);
	data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL,
			     RESUME_CTX, 0);
	WREG32(sdmax_gfx_context_cntl, data);

	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL_OFFSET,
	       m->sdmax_rlcx_doorbell_offset);

	data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL,
			     ENABLE, 1);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI,
				m->sdmax_rlcx_rb_rptr_hi);

	WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1);
	if (read_user_wptr(mm, wptr64, data64)) {
		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
		       lower_32_bits(data64));
		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI,
		       upper_32_bits(data64));
	} else {
		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
		       m->sdmax_rlcx_rb_rptr);
		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI,
		       m->sdmax_rlcx_rb_rptr_hi);
	}
	WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0);

	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI,
			m->sdmax_rlcx_rb_base_hi);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO,
			m->sdmax_rlcx_rb_rptr_addr_lo);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
			m->sdmax_rlcx_rb_rptr_addr_hi);

	data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL,
			     RB_ENABLE, 1);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data);

	return 0;
}

static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
			     uint32_t engine_id, uint32_t queue_id,
			     uint32_t (**dump)[2], uint32_t *n_regs)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t sdma_base_addr = get_sdma_base_addr(adev, engine_id, queue_id);
	uint32_t i = 0, reg;
#undef HQD_N_REGS
#define HQD_N_REGS (19+6+7+10)

	*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
	if (*dump == NULL)
		return -ENOMEM;

	for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
		DUMP_REG(sdma_base_addr + reg);
	for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++)
		DUMP_REG(sdma_base_addr + reg);
	for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN;
	     reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++)
		DUMP_REG(sdma_base_addr + reg);
	for (reg = mmSDMA0_RLC0_MIDCMD_DATA0;
	     reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++)
		DUMP_REG(sdma_base_addr + reg);

	WARN_ON_ONCE(i != HQD_N_REGS);
	*n_regs = i;

	return 0;
}

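/* A compute queue is considered occupied if the HQD is active and its
 * CP_HQD_PQ_BASE(_HI) registers match the given queue address (>> 8).
 */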
static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
				uint32_t pipe_id, uint32_t queue_id)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t act;
	bool retval = false;
	uint32_t low, high;

	acquire_queue(kgd, pipe_id, queue_id);
	act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE));
	if (act) {
		low = lower_32_bits(queue_address >> 8);
		high = upper_32_bits(queue_address >> 8);

		if (low == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE)) &&
		    high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI)))
			retval = true;
	}
	release_queue(kgd);
	return retval;
}

static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v9_sdma_mqd *m;
	uint32_t sdma_base_addr;
	uint32_t sdma_rlc_rb_cntl;

	m = get_sdma_mqd(mqd);
	sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id,
					    m->sdma_queue_id);

	sdma_rlc_rb_cntl = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL);

	if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)
		return true;

	return false;
}

static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
				enum kfd_preempt_type reset_type,
				unsigned int utimeout, uint32_t pipe_id,
				uint32_t queue_id)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	enum hqd_dequeue_request_type type;
	unsigned long end_jiffies;
	uint32_t temp;
	struct v9_mqd *m = get_mqd(mqd);

	if (adev->in_gpu_reset)
		return -EIO;

	acquire_queue(kgd, pipe_id, queue_id);

	if (m->cp_hqd_vmid == 0)
		WREG32_FIELD15(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0);

	switch (reset_type) {
	case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
		type = DRAIN_PIPE;
		break;
	case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
		type = RESET_WAVES;
		break;
	default:
		type = DRAIN_PIPE;
		break;
	}

	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type);

	end_jiffies = (utimeout * HZ / 1000) + jiffies;
	while (true) {
		temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE));
		if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("cp queue preemption time out.\n");
			release_queue(kgd);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	release_queue(kgd);
	return 0;
}

static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
				unsigned int utimeout)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v9_sdma_mqd *m;
	uint32_t sdma_base_addr;
	uint32_t temp;
	unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;

	m = get_sdma_mqd(mqd);
	sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id,
					    m->sdma_queue_id);

	temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL);
	temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK;
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, temp);

	while (true) {
		temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
		if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies))
			return -ETIME;
		usleep_range(500, 1000);
	}

	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
		RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) |
		SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);

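	/* Save the current read pointer back into the MQD so the saved
	 * queue state reflects where the engine stopped.
	 */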
	m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR);
	m->sdmax_rlcx_rb_rptr_hi =
		RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI);

	return 0;
}

static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd,
							uint8_t vmid)
{
	uint32_t reg;
	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;

	reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING)
		     + vmid);
	return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK;
}

static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
								uint8_t vmid)
{
	uint32_t reg;
	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;

	reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING)
		     + vmid);
	return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK;
}

static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid)
{
	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;

	/* Use legacy mode tlb invalidation.
	 *
	 * Currently on Raven the code below is broken for anything but
	 * legacy mode due to an MMHUB power gating problem. A workaround
	 * is for MMHUB to wait until the condition PER_VMID_INVALIDATE_REQ
	 * == PER_VMID_INVALIDATE_ACK instead of simply waiting for the ack
	 * bit.
	 *
	 * TODO 1: agree on the right set of invalidation registers for
	 * KFD use. Use the last one for now. Invalidate both GC and
	 * MMHUB.
	 *
	 * TODO 2: support range-based invalidation, requires kfd2kgd
	 * interface change
	 */
	amdgpu_gmc_flush_gpu_tlb(adev, vmid, 0);
}

static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid)
{
	signed long r;
	uint32_t seq;
	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;

	spin_lock(&adev->gfx.kiq.ring_lock);
	amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package */
	amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0));
	amdgpu_ring_write(ring,
			PACKET3_INVALIDATE_TLBS_DST_SEL(1) |
			PACKET3_INVALIDATE_TLBS_ALL_HUB(1) |
			PACKET3_INVALIDATE_TLBS_PASID(pasid) |
			PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(0)); /* legacy */
	amdgpu_fence_emit_polling(ring, &seq);
	amdgpu_ring_commit(ring);
	spin_unlock(&adev->gfx.kiq.ring_lock);

	r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
	if (r < 1) {
		DRM_ERROR("wait for kiq fence error: %ld.\n", r);
		return -ETIME;
	}

	return 0;
}

static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid)
{
	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
	int vmid;
	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;

	if (adev->in_gpu_reset)
		return -EIO;

	if (ring->sched.ready)
		return invalidate_tlbs_with_kiq(adev, pasid);

	for (vmid = 0; vmid < 16; vmid++) {
		if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid))
			continue;
		if (get_atc_vmid_pasid_mapping_valid(kgd, vmid)) {
			if (get_atc_vmid_pasid_mapping_pasid(kgd, vmid)
				== pasid) {
				write_vmid_invalidate_request(kgd, vmid);
				break;
			}
		}
	}

	return 0;
}

static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid)
{
	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;

	if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) {
		pr_err("non kfd vmid %d\n", vmid);
		return 0;
	}

	write_vmid_invalidate_request(kgd, vmid);
	return 0;
}

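/* The address watch callbacks (disable/execute/get_offset) are not
 * implemented for GFX v9 in this file; they are stubs that simply
 * return 0.
 */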
static int kgd_address_watch_disable(struct kgd_dev *kgd)
{
	return 0;
}

static int kgd_address_watch_execute(struct kgd_dev *kgd,
					unsigned int watch_point_id,
					uint32_t cntl_val,
					uint32_t addr_hi,
					uint32_t addr_lo)
{
	return 0;
}

static int kgd_wave_control_execute(struct kgd_dev *kgd,
					uint32_t gfx_index_val,
					uint32_t sq_cmd)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t data = 0;

	mutex_lock(&adev->grbm_idx_mutex);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), gfx_index_val);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd);

	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		INSTANCE_BROADCAST_WRITES, 1);
	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		SH_BROADCAST_WRITES, 1);
	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		SE_BROADCAST_WRITES, 1);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data);
	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd,
					unsigned int watch_point_id,
					unsigned int reg_offset)
{
	return 0;
}

static void set_scratch_backing_va(struct kgd_dev *kgd,
					uint64_t va, uint32_t vmid)
{
	/* No longer needed on GFXv9. The scratch base address is
	 * passed to the shader by the CP. It's the user mode driver's
	 * responsibility.
	 */
}

/* FIXME: Does this need to be ASIC-specific code? */
static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type)
{
	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
	const union amdgpu_firmware_header *hdr;

	switch (type) {
	case KGD_ENGINE_PFP:
		hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data;
		break;

	case KGD_ENGINE_ME:
		hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data;
		break;

	case KGD_ENGINE_CE:
		hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data;
		break;

	case KGD_ENGINE_MEC1:
		hdr = (const union amdgpu_firmware_header *)adev->gfx.mec_fw->data;
		break;

	case KGD_ENGINE_MEC2:
		hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data;
		break;

	case KGD_ENGINE_RLC:
		hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data;
		break;

	case KGD_ENGINE_SDMA1:
		hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data;
		break;

	case KGD_ENGINE_SDMA2:
		hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data;
		break;

	default:
		return 0;
	}

	if (hdr == NULL)
		return 0;

	/* Only 12 bits in use */
	return hdr->common.ucode_version;
}

static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
		uint64_t page_table_base)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) {
		pr_err("trying to set page table base for wrong VMID %u\n",
		       vmid);
		return;
	}

	/* TODO: take advantage of per-process address space size. For
	 * now, all processes share the same address space size, like
	 * on GFX8 and older.
	 */
	mmhub_v1_0_setup_vm_pt_regs(adev, vmid, page_table_base);

	gfxhub_v1_0_setup_vm_pt_regs(adev, vmid, page_table_base);
}