/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * DOC: Shader validator for VC4.
 *
 * Since the VC4 has no IOMMU between it and system memory, a user
 * with access to execute shaders could escalate privilege by
 * overwriting system memory (using the VPM write address register in
 * the general-purpose DMA mode) or reading system memory it shouldn't
 * (reading it as a texture, uniform data, or direct-addressed TMU
 * lookup).
 *
 * The shader validator walks over a shader's BO, ensuring that its
 * accesses are appropriately bounded, and recording where texture
 * accesses are made so that we can do relocations for them in the
 * uniform stream.
 *
 * Shader BOs are immutable for their lifetimes (enforced by not
 * allowing mmaps, GEM prime export, or rendering to them from a CL),
 * so this validation is only performed at BO creation time.
 */

#include "vc4_drv.h"
#include "vc4_qpu_defines.h"

/* 32 A-file registers + 32 B-file registers + accumulators r0-r3. */
#define LIVE_REG_COUNT (32 + 32 + 4)

struct vc4_shader_validation_state {
        /* Current IP being validated. */
        uint32_t ip;

        /* IP at the end of the BO, do not read shader[max_ip] */
        uint32_t max_ip;

        uint64_t *shader;

        struct vc4_texture_sample_info tmu_setup[2];
        int tmu_write_count[2];

        /* For registers that were last written to by a MIN instruction with
         * one argument being a uniform, the address of the uniform.
         * Otherwise, ~0.
         *
         * This is used for the validation of direct address memory reads.
         */
        uint32_t live_min_clamp_offsets[LIVE_REG_COUNT];
        bool live_max_clamp_regs[LIVE_REG_COUNT];
        uint32_t live_immediates[LIVE_REG_COUNT];

        /* Bitfield of which IPs are used as branch targets.
         *
         * Used for validation that the uniform stream is updated at the right
         * points and clearing the texturing/clamping state.
         */
        unsigned long *branch_targets;

        /* Set when entering a basic block, and cleared when the uniform
         * address update is found.  This is used to make sure that we don't
         * read uniforms when the address is undefined.
         */
        bool needs_uniform_address_update;

        /* Set when we find a backwards branch.  If the branch is backwards,
         * the target is probably doing an address reset to read uniforms,
         * and so we need to be sure that a uniforms address is present in the
         * stream, even if the shader didn't need to read uniforms in later
         * basic blocks.
         */
        bool needs_uniform_address_for_loop;

        /* Set when we find an instruction writing the top half of the
         * register files.  If we allowed writing the unusable regs in
         * a threaded shader, then the clamp validation for the other
         * shader running on our QPU would be invalid.
         */
        bool all_registers_used;
};

/* Map a write address to an index into the live_* tracking arrays, or ~0 if
 * the destination isn't a tracked register.
 */
static uint32_t
waddr_to_live_reg_index(uint32_t waddr, bool is_b)
{
        if (waddr < 32) {
                if (is_b)
                        return 32 + waddr;
                else
                        return waddr;
        } else if (waddr <= QPU_W_ACC3) {
                return 64 + waddr - QPU_W_ACC0;
        } else {
                return ~0;
        }
}

/* Map the register feeding the ADD op's A argument back to a live_* tracking
 * index, or ~0.
 */
static uint32_t
raddr_add_a_to_live_reg_index(uint64_t inst)
{
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
        uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);

        if (add_a == QPU_MUX_A)
                return raddr_a;
        else if (add_a == QPU_MUX_B && sig != QPU_SIG_SMALL_IMM)
                return 32 + raddr_b;
        else if (add_a <= QPU_MUX_R3)
                return 64 + add_a;
        else
                return ~0;
}

static bool
live_reg_is_upper_half(uint32_t lri)
{
        return (lri >= 16 && lri < 32) ||
               (lri >= 32 + 16 && lri < 32 + 32);
}

static bool
is_tmu_submit(uint32_t waddr)
{
        return (waddr == QPU_W_TMU0_S ||
                waddr == QPU_W_TMU1_S);
}

static bool
is_tmu_write(uint32_t waddr)
{
        return (waddr >= QPU_W_TMU0_S &&
                waddr <= QPU_W_TMU1_B);
}

static bool
record_texture_sample(struct vc4_validated_shader_info *validated_shader,
                      struct vc4_shader_validation_state *validation_state,
                      int tmu)
{
        uint32_t s = validated_shader->num_texture_samples;
        int i;
        struct vc4_texture_sample_info *temp_samples;

        temp_samples = krealloc(validated_shader->texture_samples,
                                (s + 1) * sizeof(*temp_samples),
                                GFP_KERNEL);
        if (!temp_samples)
                return false;

        memcpy(&temp_samples[s],
               &validation_state->tmu_setup[tmu],
               sizeof(*temp_samples));

        validated_shader->num_texture_samples = s + 1;
        validated_shader->texture_samples = temp_samples;

        for (i = 0; i < 4; i++)
                validation_state->tmu_setup[tmu].p_offset[i] = ~0;

        return true;
}

static bool
check_tmu_write(struct vc4_validated_shader_info *validated_shader,
                struct vc4_shader_validation_state *validation_state,
                bool is_mul)
{
        uint64_t inst = validation_state->shader[validation_state->ip];
        uint32_t waddr = (is_mul ?
                          QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
                          QPU_GET_FIELD(inst, QPU_WADDR_ADD));
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        int tmu = waddr > QPU_W_TMU0_B;
        bool submit = is_tmu_submit(waddr);
        bool is_direct = submit && validation_state->tmu_write_count[tmu] == 0;
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        if (is_direct) {
                uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
                uint32_t clamp_reg, clamp_offset;

                if (sig == QPU_SIG_SMALL_IMM) {
                        DRM_DEBUG("direct TMU read used small immediate\n");
                        return false;
                }

                /* Make sure that this texture load is an add of the base
                 * address of the UBO to a clamped offset within the UBO.
                 */
                if (is_mul ||
                    QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
                        DRM_DEBUG("direct TMU load wasn't an add\n");
                        return false;
                }

                /* We assert that the clamped address is the first
                 * argument, and the UBO base address is the second argument.
                 * This is arbitrary, but simpler than supporting flipping the
                 * two either way.
                 */
                clamp_reg = raddr_add_a_to_live_reg_index(inst);
                if (clamp_reg == ~0) {
                        DRM_DEBUG("direct TMU load wasn't clamped\n");
                        return false;
                }

                clamp_offset = validation_state->live_min_clamp_offsets[clamp_reg];
                if (clamp_offset == ~0) {
                        DRM_DEBUG("direct TMU load wasn't clamped\n");
                        return false;
                }

                /* Store the clamp value's offset in p1 (see reloc_tex() in
                 * vc4_validate.c).
                 */
                validation_state->tmu_setup[tmu].p_offset[1] =
                        clamp_offset;

                if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
                    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
                        DRM_DEBUG("direct TMU load didn't add to a uniform\n");
                        return false;
                }

                validation_state->tmu_setup[tmu].is_direct = true;
        } else {
                if (raddr_a == QPU_R_UNIF || (sig != QPU_SIG_SMALL_IMM &&
                                              raddr_b == QPU_R_UNIF)) {
                        DRM_DEBUG("uniform read in the same instruction as "
                                  "texture setup.\n");
                        return false;
                }
        }

        if (validation_state->tmu_write_count[tmu] >= 4) {
                DRM_DEBUG("TMU%d got too many parameters before dispatch\n",
                          tmu);
                return false;
        }
        validation_state->tmu_setup[tmu].p_offset[validation_state->tmu_write_count[tmu]] =
                validated_shader->uniforms_size;
        validation_state->tmu_write_count[tmu]++;
        /* Since direct uses a RADDR uniform reference, it will get counted in
         * check_instruction_reads()
         */
        if (!is_direct) {
                if (validation_state->needs_uniform_address_update) {
                        DRM_DEBUG("Texturing with undefined uniform address\n");
                        return false;
                }

                validated_shader->uniforms_size += 4;
        }

        if (submit) {
                if (!record_texture_sample(validated_shader,
                                           validation_state, tmu)) {
                        return false;
                }

                validation_state->tmu_write_count[tmu] = 0;
        }

        return true;
}

static bool require_uniform_address_uniform(struct vc4_validated_shader_info *validated_shader)
{
        uint32_t o = validated_shader->num_uniform_addr_offsets;
        uint32_t num_uniforms = validated_shader->uniforms_size / 4;

        validated_shader->uniform_addr_offsets =
                krealloc(validated_shader->uniform_addr_offsets,
                         (o + 1) *
                         sizeof(*validated_shader->uniform_addr_offsets),
                         GFP_KERNEL);
        if (!validated_shader->uniform_addr_offsets)
                return false;

        validated_shader->uniform_addr_offsets[o] = num_uniforms;
        validated_shader->num_uniform_addr_offsets++;

        return true;
}

static bool
validate_uniform_address_write(struct vc4_validated_shader_info *validated_shader,
                               struct vc4_shader_validation_state *validation_state,
                               bool is_mul)
{
        uint64_t inst = validation_state->shader[validation_state->ip];
        u32 add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
        u32 raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        u32 raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        u32 add_lri = raddr_add_a_to_live_reg_index(inst);
        /* We want our reset to be pointing at whatever uniform follows the
         * uniforms base address.
         */
        u32 expected_offset = validated_shader->uniforms_size + 4;

        /* We only support absolute uniform address changes, and we
         * require that they be in the current basic block before any
         * of its uniform reads.
         *
         * One could potentially emit more efficient QPU code, by
         * noticing that (say) an if statement does uniform control
         * flow for all threads and that the if reads the same number
         * of uniforms on each side.  However, this scheme is easy to
         * validate so it's all we allow for now.
         */
        switch (QPU_GET_FIELD(inst, QPU_SIG)) {
        case QPU_SIG_NONE:
        case QPU_SIG_SCOREBOARD_UNLOCK:
        case QPU_SIG_COLOR_LOAD:
        case QPU_SIG_LOAD_TMU0:
        case QPU_SIG_LOAD_TMU1:
                break;
        default:
                DRM_DEBUG("uniforms address change must be "
                          "normal math\n");
                return false;
        }

        if (is_mul || QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
                DRM_DEBUG("Uniform address reset must be an ADD.\n");
                return false;
        }

        if (QPU_GET_FIELD(inst, QPU_COND_ADD) != QPU_COND_ALWAYS) {
                DRM_DEBUG("Uniform address reset must be unconditional.\n");
                return false;
        }

        if (QPU_GET_FIELD(inst, QPU_PACK) != QPU_PACK_A_NOP &&
            !(inst & QPU_PM)) {
                DRM_DEBUG("No packing allowed on uniforms reset\n");
                return false;
        }

        if (add_lri == -1) {
                DRM_DEBUG("First argument of uniform address write must be "
                          "an immediate value.\n");
                return false;
        }

        if (validation_state->live_immediates[add_lri] != expected_offset) {
                DRM_DEBUG("Resetting uniforms with offset %db instead of %db\n",
                          validation_state->live_immediates[add_lri],
                          expected_offset);
                return false;
        }

        if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
            !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
                DRM_DEBUG("Second argument of uniform address write must be "
                          "a uniform.\n");
                return false;
        }

        validation_state->needs_uniform_address_update = false;
        validation_state->needs_uniform_address_for_loop = false;
        return require_uniform_address_uniform(validated_shader);
}

static bool
check_reg_write(struct vc4_validated_shader_info *validated_shader,
                struct vc4_shader_validation_state *validation_state,
                bool is_mul)
{
        uint64_t inst = validation_state->shader[validation_state->ip];
        uint32_t waddr = (is_mul ?
                          QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
                          QPU_GET_FIELD(inst, QPU_WADDR_ADD));
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
        bool ws = inst & QPU_WS;
        bool is_b = is_mul ^ ws;
        u32 lri = waddr_to_live_reg_index(waddr, is_b);

        if (lri != -1) {
                uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
                uint32_t cond_mul = QPU_GET_FIELD(inst, QPU_COND_MUL);

                if (sig == QPU_SIG_LOAD_IMM &&
                    QPU_GET_FIELD(inst, QPU_PACK) == QPU_PACK_A_NOP &&
                    ((is_mul && cond_mul == QPU_COND_ALWAYS) ||
                     (!is_mul && cond_add == QPU_COND_ALWAYS))) {
                        validation_state->live_immediates[lri] =
                                QPU_GET_FIELD(inst, QPU_LOAD_IMM);
                } else {
                        validation_state->live_immediates[lri] = ~0;
                }

                if (live_reg_is_upper_half(lri))
                        validation_state->all_registers_used = true;
        }

        switch (waddr) {
        case QPU_W_UNIFORMS_ADDRESS:
                if (is_b) {
                        DRM_DEBUG("relative uniforms address change "
                                  "unsupported\n");
                        return false;
                }

                return validate_uniform_address_write(validated_shader,
                                                      validation_state,
                                                      is_mul);

        case QPU_W_TLB_COLOR_MS:
        case QPU_W_TLB_COLOR_ALL:
        case QPU_W_TLB_Z:
                /* These only interact with the tile buffer, not main memory,
                 * so they're safe.
                 */
                return true;

        case QPU_W_TMU0_S:
        case QPU_W_TMU0_T:
        case QPU_W_TMU0_R:
        case QPU_W_TMU0_B:
        case QPU_W_TMU1_S:
        case QPU_W_TMU1_T:
        case QPU_W_TMU1_R:
        case QPU_W_TMU1_B:
                return check_tmu_write(validated_shader, validation_state,
                                       is_mul);

        case QPU_W_HOST_INT:
        case QPU_W_TMU_NOSWAP:
        case QPU_W_TLB_ALPHA_MASK:
        case QPU_W_MUTEX_RELEASE:
                /* XXX: I haven't thought about these, so don't support them
                 * for now.
                 */
                DRM_DEBUG("Unsupported waddr %d\n", waddr);
                return false;

        case QPU_W_VPM_ADDR:
                DRM_DEBUG("General VPM DMA unsupported\n");
                return false;

        case QPU_W_VPM:
        case QPU_W_VPMVCD_SETUP:
                /* We allow VPM setup in general, even including VPM DMA
                 * configuration setup, because the (unsafe) DMA can only be
                 * triggered by QPU_W_VPM_ADDR writes.
                 */
                return true;

        case QPU_W_TLB_STENCIL_SETUP:
                return true;
        }

        return true;
}

static void
track_live_clamps(struct vc4_validated_shader_info *validated_shader,
                  struct vc4_shader_validation_state *validation_state)
{
        uint64_t inst = validation_state->shader[validation_state->ip];
        uint32_t op_add = QPU_GET_FIELD(inst, QPU_OP_ADD);
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
        uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
        uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
        bool ws = inst & QPU_WS;
        uint32_t lri_add_a, lri_add, lri_mul;
        bool add_a_is_min_0;

        /* Check whether OP_ADD's A argument comes from a live MAX(x, 0),
         * before we clear previous live state.
         */
        lri_add_a = raddr_add_a_to_live_reg_index(inst);
        add_a_is_min_0 = (lri_add_a != ~0 &&
                          validation_state->live_max_clamp_regs[lri_add_a]);

        /* Clear live state for registers written by our instruction. */
        lri_add = waddr_to_live_reg_index(waddr_add, ws);
        lri_mul = waddr_to_live_reg_index(waddr_mul, !ws);
        if (lri_mul != ~0) {
                validation_state->live_max_clamp_regs[lri_mul] = false;
                validation_state->live_min_clamp_offsets[lri_mul] = ~0;
        }
        if (lri_add != ~0) {
                validation_state->live_max_clamp_regs[lri_add] = false;
                validation_state->live_min_clamp_offsets[lri_add] = ~0;
        } else {
                /* Nothing further to do for live tracking, since only ADDs
                 * generate new live clamp registers.
                 */
                return;
        }

        /* Now, handle remaining live clamp tracking for the ADD operation. */

        if (cond_add != QPU_COND_ALWAYS)
                return;

        if (op_add == QPU_A_MAX) {
                /* Track live clamps of a value to a minimum of 0 (in either
                 * arg).
                 */
                if (sig != QPU_SIG_SMALL_IMM || raddr_b != 0 ||
                    (add_a != QPU_MUX_B && add_b != QPU_MUX_B)) {
                        return;
                }

                validation_state->live_max_clamp_regs[lri_add] = true;
        } else if (op_add == QPU_A_MIN) {
                /* Track live clamps of a value clamped to a minimum of 0 and
                 * a maximum of some uniform's offset.
                 */
                if (!add_a_is_min_0)
                        return;

                if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
                    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF &&
                      sig != QPU_SIG_SMALL_IMM)) {
                        return;
                }

                validation_state->live_min_clamp_offsets[lri_add] =
                        validated_shader->uniforms_size;
        }
}

static bool
check_instruction_writes(struct vc4_validated_shader_info *validated_shader,
                         struct vc4_shader_validation_state *validation_state)
{
        uint64_t inst = validation_state->shader[validation_state->ip];
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        bool ok;

        if (is_tmu_write(waddr_add) && is_tmu_write(waddr_mul)) {
                DRM_DEBUG("ADD and MUL both set up textures\n");
                return false;
        }

        ok = (check_reg_write(validated_shader, validation_state, false) &&
              check_reg_write(validated_shader, validation_state, true));

        track_live_clamps(validated_shader, validation_state);

        return ok;
}

static bool
check_branch(uint64_t inst,
             struct vc4_validated_shader_info *validated_shader,
             struct vc4_shader_validation_state *validation_state,
             int ip)
{
        int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);

        if ((int)branch_imm < 0)
                validation_state->needs_uniform_address_for_loop = true;

        /* We don't want to have to worry about validation of this, and
         * there's no need for it.
         */
        if (waddr_add != QPU_W_NOP || waddr_mul != QPU_W_NOP) {
                DRM_DEBUG("branch instruction at %d wrote a register.\n",
                          validation_state->ip);
                return false;
        }

        return true;
}

static bool
check_instruction_reads(struct vc4_validated_shader_info *validated_shader,
                        struct vc4_shader_validation_state *validation_state)
{
        uint64_t inst = validation_state->shader[validation_state->ip];
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        if (raddr_a == QPU_R_UNIF ||
            (raddr_b == QPU_R_UNIF && sig != QPU_SIG_SMALL_IMM)) {
                /* This can't overflow the uint32_t, because we're reading 8
                 * bytes of instruction to increment by 4 here, so we'd
                 * already be OOM.
                 */
                validated_shader->uniforms_size += 4;

                if (validation_state->needs_uniform_address_update) {
                        DRM_DEBUG("Uniform read with undefined uniform "
                                  "address\n");
                        return false;
                }
        }

        if ((raddr_a >= 16 && raddr_a < 32) ||
            (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) {
                validation_state->all_registers_used = true;
        }

        return true;
}

/* Make sure that all branches are absolute and point within the shader, and
 * note their targets for later.
 */
static bool
vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
{
        uint32_t max_branch_target = 0;
        int ip;
        int last_branch = -2;

        for (ip = 0; ip < validation_state->max_ip; ip++) {
                uint64_t inst = validation_state->shader[ip];
                int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
                uint32_t after_delay_ip = ip + 4;
                uint32_t branch_target_ip;

                if (sig == QPU_SIG_PROG_END) {
                        /* There are two delay slots after program end is
                         * signaled that are still executed, then we're
                         * finished.  validation_state->max_ip is the
                         * instruction after the last valid instruction in the
                         * program.
                         */
                        validation_state->max_ip = ip + 3;
                        continue;
                }

                if (sig != QPU_SIG_BRANCH)
                        continue;

                if (ip - last_branch < 4) {
                        DRM_DEBUG("Branch at %d during delay slots\n", ip);
                        return false;
                }
                last_branch = ip;

                if (inst & QPU_BRANCH_REG) {
                        DRM_DEBUG("branching from register relative "
                                  "not supported\n");
                        return false;
                }

                if (!(inst & QPU_BRANCH_REL)) {
                        DRM_DEBUG("relative branching required\n");
                        return false;
                }

                /* The actual branch target is the instruction after the delay
                 * slots, plus whatever byte offset is in the low 32 bits of
                 * the instruction.  Make sure we're not branching beyond the
                 * end of the shader object.
                 */
                if (branch_imm % sizeof(inst) != 0) {
                        DRM_DEBUG("branch target not aligned\n");
                        return false;
                }

                branch_target_ip = after_delay_ip + (branch_imm >> 3);
                if (branch_target_ip >= validation_state->max_ip) {
                        DRM_DEBUG("Branch at %d outside of shader (ip %d/%d)\n",
                                  ip, branch_target_ip,
                                  validation_state->max_ip);
                        return false;
                }
                set_bit(branch_target_ip, validation_state->branch_targets);

                /* Make sure that the non-branching path is also not outside
                 * the shader.
                 */
                if (after_delay_ip >= validation_state->max_ip) {
                        DRM_DEBUG("Branch at %d continues past shader end "
                                  "(%d/%d)\n",
                                  ip, after_delay_ip, validation_state->max_ip);
                        return false;
                }
                set_bit(after_delay_ip, validation_state->branch_targets);
                max_branch_target = max(max_branch_target, after_delay_ip);
        }

        if (max_branch_target > validation_state->max_ip - 3) {
                DRM_DEBUG("Branch landed after QPU_SIG_PROG_END");
                return false;
        }

        return true;
}

/* Resets any known state for the shader, used when we may be branched to from
 * multiple locations in the program (or at shader start).
 */
static void
reset_validation_state(struct vc4_shader_validation_state *validation_state)
{
        int i;

        for (i = 0; i < 8; i++)
                validation_state->tmu_setup[i / 4].p_offset[i % 4] = ~0;

        for (i = 0; i < LIVE_REG_COUNT; i++) {
                validation_state->live_min_clamp_offsets[i] = ~0;
                validation_state->live_max_clamp_regs[i] = false;
                validation_state->live_immediates[i] = ~0;
        }
}

static bool
texturing_in_progress(struct vc4_shader_validation_state *validation_state)
{
        return (validation_state->tmu_write_count[0] != 0 ||
                validation_state->tmu_write_count[1] != 0);
}

static bool
vc4_handle_branch_target(struct vc4_shader_validation_state *validation_state)
{
        uint32_t ip = validation_state->ip;

        if (!test_bit(ip, validation_state->branch_targets))
                return true;

        if (texturing_in_progress(validation_state)) {
                DRM_DEBUG("Branch target landed during TMU setup\n");
                return false;
        }

        /* Reset our live values tracking, since this instruction may have
         * multiple predecessors.
         *
         * One could potentially do analysis to determine that, for
         * example, all predecessors have a live max clamp in the same
         * register, but we don't bother with that.
         */
        reset_validation_state(validation_state);

        /* Since we've entered a basic block from potentially multiple
         * predecessors, we need the uniforms address to be updated before any
         * uniforms are read.  We require that after any branch point, the
         * next uniform to be loaded is a uniform address offset.  That
         * uniform's offset will be marked by the uniform address register
         * write validation, or by a one-off check at the end of the program.
         */
        validation_state->needs_uniform_address_update = true;

        return true;
}

struct vc4_validated_shader_info *
vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
{
        struct vc4_dev *vc4 = to_vc4_dev(shader_obj->base.dev);
        bool found_shader_end = false;
        int shader_end_ip = 0;
        uint32_t last_thread_switch_ip = -3;
        uint32_t ip;
        struct vc4_validated_shader_info *validated_shader = NULL;
        struct vc4_shader_validation_state validation_state;

        if (WARN_ON_ONCE(vc4->is_vc5))
                return NULL;

        memset(&validation_state, 0, sizeof(validation_state));
        validation_state.shader = shader_obj->vaddr;
        validation_state.max_ip = shader_obj->base.size / sizeof(uint64_t);

        reset_validation_state(&validation_state);

        validation_state.branch_targets =
                kcalloc(BITS_TO_LONGS(validation_state.max_ip),
                        sizeof(unsigned long), GFP_KERNEL);
        if (!validation_state.branch_targets)
                goto fail;

        validated_shader = kcalloc(1, sizeof(*validated_shader), GFP_KERNEL);
        if (!validated_shader)
                goto fail;

        if (!vc4_validate_branches(&validation_state))
                goto fail;

        for (ip = 0; ip < validation_state.max_ip; ip++) {
                uint64_t inst = validation_state.shader[ip];
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                validation_state.ip = ip;

                if (!vc4_handle_branch_target(&validation_state))
                        goto fail;

                if (ip == last_thread_switch_ip + 3) {
                        /* Reset r0-r3 live clamp data */
                        int i;

                        for (i = 64; i < LIVE_REG_COUNT; i++) {
                                validation_state.live_min_clamp_offsets[i] = ~0;
                                validation_state.live_max_clamp_regs[i] = false;
                                validation_state.live_immediates[i] = ~0;
                        }
                }

                switch (sig) {
                case QPU_SIG_NONE:
                case QPU_SIG_WAIT_FOR_SCOREBOARD:
                case QPU_SIG_SCOREBOARD_UNLOCK:
                case QPU_SIG_COLOR_LOAD:
                case QPU_SIG_LOAD_TMU0:
                case QPU_SIG_LOAD_TMU1:
                case QPU_SIG_PROG_END:
                case QPU_SIG_SMALL_IMM:
                case QPU_SIG_THREAD_SWITCH:
                case QPU_SIG_LAST_THREAD_SWITCH:
                        if (!check_instruction_writes(validated_shader,
                                                      &validation_state)) {
                                DRM_DEBUG("Bad write at ip %d\n", ip);
                                goto fail;
                        }

                        if (!check_instruction_reads(validated_shader,
                                                     &validation_state))
                                goto fail;

                        if (sig == QPU_SIG_PROG_END) {
                                found_shader_end = true;
                                shader_end_ip = ip;
                        }

                        if (sig == QPU_SIG_THREAD_SWITCH ||
                            sig == QPU_SIG_LAST_THREAD_SWITCH) {
                                validated_shader->is_threaded = true;

                                if (ip < last_thread_switch_ip + 3) {
                                        DRM_DEBUG("Thread switch too soon after "
                                                  "last switch at ip %d\n", ip);
                                        goto fail;
                                }
                                last_thread_switch_ip = ip;
                        }

                        break;

                case QPU_SIG_LOAD_IMM:
                        if (!check_instruction_writes(validated_shader,
                                                      &validation_state)) {
                                DRM_DEBUG("Bad LOAD_IMM write at ip %d\n", ip);
                                goto fail;
                        }
                        break;

                case QPU_SIG_BRANCH:
                        if (!check_branch(inst, validated_shader,
                                          &validation_state, ip))
                                goto fail;

                        if (ip < last_thread_switch_ip + 3) {
                                DRM_DEBUG("Branch in thread switch at ip %d",
                                          ip);
                                goto fail;
                        }

                        break;
                default:
                        DRM_DEBUG("Unsupported QPU signal %d at "
                                  "instruction %d\n", sig, ip);
                        goto fail;
                }

                /* There are two delay slots after program end is signaled
                 * that are still executed, then we're finished.
                 */
                if (found_shader_end && ip == shader_end_ip + 2)
                        break;
        }

        if (ip == validation_state.max_ip) {
                DRM_DEBUG("shader failed to terminate before "
                          "shader BO end at %zd\n",
                          shader_obj->base.size);
                goto fail;
        }

        /* Might corrupt other thread */
        if (validated_shader->is_threaded &&
            validation_state.all_registers_used) {
                DRM_DEBUG("Shader uses threading, but uses the upper "
                          "half of the registers, too\n");
                goto fail;
        }

        /* If we did a backwards branch and we haven't emitted a uniforms
         * reset since then, we still need the uniforms stream to have the
         * uniforms address available so that the backwards branch can do its
         * uniforms reset.
         *
         * We could potentially prove that the backwards branch doesn't
         * contain any uses of uniforms until program exit, but that doesn't
         * seem to be worth the trouble.
         */
        if (validation_state.needs_uniform_address_for_loop) {
                if (!require_uniform_address_uniform(validated_shader))
                        goto fail;
                validated_shader->uniforms_size += 4;
        }

        /* Again, no chance of integer overflow here because the worst case
         * scenario is 8 bytes of uniforms plus handles per 8-byte
         * instruction.
         */
        validated_shader->uniforms_src_size =
                (validated_shader->uniforms_size +
                 4 * validated_shader->num_texture_samples);

        kfree(validation_state.branch_targets);

        return validated_shader;

fail:
        kfree(validation_state.branch_targets);
        if (validated_shader) {
                kfree(validated_shader->uniform_addr_offsets);
                kfree(validated_shader->texture_samples);
                kfree(validated_shader);
        }
        return NULL;
}