/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * DOC: Shader validator for VC4.
 *
 * The VC4 has no IOMMU between it and system memory, so a user with
 * access to execute shaders could escalate privilege by overwriting
 * system memory (using the VPM write address register in the
 * general-purpose DMA mode) or reading system memory it shouldn't
 * (reading it as a texture, or uniform data, or vertex data).
 *
 * This walks over a shader BO, ensuring that its accesses are
 * appropriately bounded, and recording how many texture accesses are
 * made and where so that we can do relocations for them in the
 * uniform stream.
 */

#include "vc4_drv.h"
#include "vc4_qpu_defines.h"

#define LIVE_REG_COUNT (32 + 32 + 4)

struct vc4_shader_validation_state {
	/* Current IP being validated. */
	uint32_t ip;

	/* IP at the end of the BO, do not read shader[max_ip] */
	uint32_t max_ip;

	uint64_t *shader;

	struct vc4_texture_sample_info tmu_setup[2];
	int tmu_write_count[2];

	/* For registers that were last written to by a MIN instruction with
	 * one argument being a uniform, the address of the uniform.
	 * Otherwise, ~0.
	 *
	 * This is used for the validation of direct address memory reads.
	 */
	uint32_t live_min_clamp_offsets[LIVE_REG_COUNT];
	bool live_max_clamp_regs[LIVE_REG_COUNT];
	uint32_t live_immediates[LIVE_REG_COUNT];

	/* Bitfield of which IPs are used as branch targets.
	 *
	 * Used for validation that the uniform stream is updated at the right
	 * points and clearing the texturing/clamping state.
	 */
	unsigned long *branch_targets;

	/* Set when entering a basic block, and cleared when the uniform
	 * address update is found. This is used to make sure that we don't
	 * read uniforms when the address is undefined.
	 */
	bool needs_uniform_address_update;

	/* Set when we find a backwards branch. If the branch is backwards,
	 * the target is probably doing an address reset to read uniforms,
	 * and so we need to be sure that a uniforms address is present in the
	 * stream, even if the shader didn't need to read uniforms in later
	 * basic blocks.
	 */
	bool needs_uniform_address_for_loop;

	/* Set when we find an instruction writing the top half of the
	 * register files. If we allowed writing the unusable regs in
	 * a threaded shader, then the other shader running on our
	 * QPU's clamp validation would be invalid.
	 */
	bool all_registers_used;
};

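/* Live register tracking uses a flat index space: 0-31 for register file A,
 * 32-63 for register file B, and 64-67 for the accumulators r0-r3 (matching
 * LIVE_REG_COUNT above). ~0 means "not a trackable register".
 */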
static uint32_t
waddr_to_live_reg_index(uint32_t waddr, bool is_b)
{
	if (waddr < 32) {
		if (is_b)
			return 32 + waddr;
		else
			return waddr;
	} else if (waddr <= QPU_W_ACC3) {
		return 64 + waddr - QPU_W_ACC0;
	} else {
		return ~0;
	}
}

static uint32_t
raddr_add_a_to_live_reg_index(uint64_t inst)
{
	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
	uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);

	if (add_a == QPU_MUX_A)
		return raddr_a;
	else if (add_a == QPU_MUX_B && sig != QPU_SIG_SMALL_IMM)
		return 32 + raddr_b;
	else if (add_a <= QPU_MUX_R3)
		return 64 + add_a;
	else
		return ~0;
}

static bool
live_reg_is_upper_half(uint32_t lri)
{
	return (lri >= 16 && lri < 32) ||
	       (lri >= 32 + 16 && lri < 32 + 32);
}

static bool
is_tmu_submit(uint32_t waddr)
{
	return (waddr == QPU_W_TMU0_S ||
		waddr == QPU_W_TMU1_S);
}

static bool
is_tmu_write(uint32_t waddr)
{
	return (waddr >= QPU_W_TMU0_S &&
		waddr <= QPU_W_TMU1_B);
}

static bool
record_texture_sample(struct vc4_validated_shader_info *validated_shader,
		      struct vc4_shader_validation_state *validation_state,
		      int tmu)
{
	uint32_t s = validated_shader->num_texture_samples;
	int i;
	struct vc4_texture_sample_info *temp_samples;

	temp_samples = krealloc(validated_shader->texture_samples,
				(s + 1) * sizeof(*temp_samples),
				GFP_KERNEL);
	if (!temp_samples)
		return false;

	memcpy(&temp_samples[s],
	       &validation_state->tmu_setup[tmu],
	       sizeof(*temp_samples));

	validated_shader->num_texture_samples = s + 1;
	validated_shader->texture_samples = temp_samples;

	for (i = 0; i < 4; i++)
		validation_state->tmu_setup[tmu].p_offset[i] = ~0;

	return true;
}

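/* Validates a write to one of the TMU setup registers (S/T/R/B for TMU0 or
 * TMU1). Each parameter write records the uniform stream offset that will be
 * relocated later (see reloc_tex() in vc4_validate.c); a write to the S
 * register submits the lookup. For "direct" (general memory) lookups, the
 * address must be an add of a uniform (the relocated base address) and a
 * register that was previously clamped against a uniform offset.
 */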
static bool
check_tmu_write(struct vc4_validated_shader_info *validated_shader,
		struct vc4_shader_validation_state *validation_state,
		bool is_mul)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	uint32_t waddr = (is_mul ?
			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
	int tmu = waddr > QPU_W_TMU0_B;
	bool submit = is_tmu_submit(waddr);
	bool is_direct = submit && validation_state->tmu_write_count[tmu] == 0;
	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

	if (is_direct) {
		uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
		uint32_t clamp_reg, clamp_offset;

		if (sig == QPU_SIG_SMALL_IMM) {
			DRM_ERROR("direct TMU read used small immediate\n");
			return false;
		}

		/* Make sure that this texture load is an add of the base
		 * address of the UBO to a clamped offset within the UBO.
		 */
		if (is_mul ||
		    QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
			DRM_ERROR("direct TMU load wasn't an add\n");
			return false;
		}

		/* We assert that the clamped address is the first
		 * argument, and the UBO base address is the second argument.
		 * This is arbitrary, but simpler than supporting flipping the
		 * two either way.
		 */
		clamp_reg = raddr_add_a_to_live_reg_index(inst);
		if (clamp_reg == ~0) {
			DRM_ERROR("direct TMU load wasn't clamped\n");
			return false;
		}

		clamp_offset = validation_state->live_min_clamp_offsets[clamp_reg];
		if (clamp_offset == ~0) {
			DRM_ERROR("direct TMU load wasn't clamped\n");
			return false;
		}

		/* Store the clamp value's offset in p1 (see reloc_tex() in
		 * vc4_validate.c).
		 */
		validation_state->tmu_setup[tmu].p_offset[1] =
			clamp_offset;

		if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
		    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
			DRM_ERROR("direct TMU load didn't add to a uniform\n");
			return false;
		}

		validation_state->tmu_setup[tmu].is_direct = true;
	} else {
		if (raddr_a == QPU_R_UNIF || (sig != QPU_SIG_SMALL_IMM &&
					      raddr_b == QPU_R_UNIF)) {
			DRM_ERROR("uniform read in the same instruction as "
				  "texture setup.\n");
			return false;
		}
	}

	if (validation_state->tmu_write_count[tmu] >= 4) {
		DRM_ERROR("TMU%d got too many parameters before dispatch\n",
			  tmu);
		return false;
	}
	validation_state->tmu_setup[tmu].p_offset[validation_state->tmu_write_count[tmu]] =
		validated_shader->uniforms_size;
	validation_state->tmu_write_count[tmu]++;
	/* Since direct uses a RADDR uniform reference, it will get counted in
	 * check_instruction_reads()
	 */
	if (!is_direct) {
		if (validation_state->needs_uniform_address_update) {
			DRM_ERROR("Texturing with undefined uniform address\n");
			return false;
		}

		validated_shader->uniforms_size += 4;
	}

	if (submit) {
		if (!record_texture_sample(validated_shader,
					   validation_state, tmu)) {
			return false;
		}

		validation_state->tmu_write_count[tmu] = 0;
	}

	return true;
}

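/* Records the index of the next uniform in the stream. The uniform at that
 * index is expected to hold the shader's uniforms base address, so that a
 * QPU_W_UNIFORMS_ADDRESS reset (or the end-of-shader loop case) points back
 * at validated memory.
 */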
static bool require_uniform_address_uniform(struct vc4_validated_shader_info *validated_shader)
{
	uint32_t o = validated_shader->num_uniform_addr_offsets;
	uint32_t num_uniforms = validated_shader->uniforms_size / 4;

	validated_shader->uniform_addr_offsets =
		krealloc(validated_shader->uniform_addr_offsets,
			 (o + 1) *
			 sizeof(*validated_shader->uniform_addr_offsets),
			 GFP_KERNEL);
	if (!validated_shader->uniform_addr_offsets)
		return false;

	validated_shader->uniform_addr_offsets[o] = num_uniforms;
	validated_shader->num_uniform_addr_offsets++;

	return true;
}

static bool
validate_uniform_address_write(struct vc4_validated_shader_info *validated_shader,
			       struct vc4_shader_validation_state *validation_state,
			       bool is_mul)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	u32 add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
	u32 raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
	u32 raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
	u32 add_lri = raddr_add_a_to_live_reg_index(inst);
	/* We want our reset to be pointing at whatever uniform follows the
	 * uniforms base address.
	 */
	u32 expected_offset = validated_shader->uniforms_size + 4;

	/* We only support absolute uniform address changes, and we
	 * require that they be in the current basic block before any
	 * of its uniform reads.
	 *
	 * One could potentially emit more efficient QPU code, by
	 * noticing that (say) an if statement does uniform control
	 * flow for all threads and that the if reads the same number
	 * of uniforms on each side. However, this scheme is easy to
	 * validate so it's all we allow for now.
	 */
	switch (QPU_GET_FIELD(inst, QPU_SIG)) {
	case QPU_SIG_NONE:
	case QPU_SIG_SCOREBOARD_UNLOCK:
	case QPU_SIG_COLOR_LOAD:
	case QPU_SIG_LOAD_TMU0:
	case QPU_SIG_LOAD_TMU1:
		break;
	default:
		DRM_ERROR("uniforms address change must be "
			  "normal math\n");
		return false;
	}

	if (is_mul || QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
		DRM_ERROR("Uniform address reset must be an ADD.\n");
		return false;
	}

	if (QPU_GET_FIELD(inst, QPU_COND_ADD) != QPU_COND_ALWAYS) {
		DRM_ERROR("Uniform address reset must be unconditional.\n");
		return false;
	}

	if (QPU_GET_FIELD(inst, QPU_PACK) != QPU_PACK_A_NOP &&
	    !(inst & QPU_PM)) {
		DRM_ERROR("No packing allowed on uniforms reset\n");
		return false;
	}

	if (add_lri == -1) {
		DRM_ERROR("First argument of uniform address write must be "
			  "an immediate value.\n");
		return false;
	}

	if (validation_state->live_immediates[add_lri] != expected_offset) {
		DRM_ERROR("Resetting uniforms with offset %db instead of %db\n",
			  validation_state->live_immediates[add_lri],
			  expected_offset);
		return false;
	}

	if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
	    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
		DRM_ERROR("Second argument of uniform address write must be "
			  "a uniform.\n");
		return false;
	}

	validation_state->needs_uniform_address_update = false;
	validation_state->needs_uniform_address_for_loop = false;
	return require_uniform_address_uniform(validated_shader);
}

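/* Validates the register written by either the ADD or MUL half of an
 * instruction. Tracks live immediates for the uniforms-address check, flags
 * writes to the upper half of the register files (not allowed together with
 * threading), rejects writes to hardware registers that could touch system
 * memory (general-purpose VPM DMA), and hands TMU setup and uniforms-address
 * writes to their dedicated validators.
 */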
static bool
check_reg_write(struct vc4_validated_shader_info *validated_shader,
		struct vc4_shader_validation_state *validation_state,
		bool is_mul)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	uint32_t waddr = (is_mul ?
			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
	bool ws = inst & QPU_WS;
	bool is_b = is_mul ^ ws;
	u32 lri = waddr_to_live_reg_index(waddr, is_b);

	if (lri != -1) {
		uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
		uint32_t cond_mul = QPU_GET_FIELD(inst, QPU_COND_MUL);

		if (sig == QPU_SIG_LOAD_IMM &&
		    QPU_GET_FIELD(inst, QPU_PACK) == QPU_PACK_A_NOP &&
		    ((is_mul && cond_mul == QPU_COND_ALWAYS) ||
		     (!is_mul && cond_add == QPU_COND_ALWAYS))) {
			validation_state->live_immediates[lri] =
				QPU_GET_FIELD(inst, QPU_LOAD_IMM);
		} else {
			validation_state->live_immediates[lri] = ~0;
		}

		if (live_reg_is_upper_half(lri))
			validation_state->all_registers_used = true;
	}

	switch (waddr) {
	case QPU_W_UNIFORMS_ADDRESS:
		if (is_b) {
			DRM_ERROR("relative uniforms address change "
				  "unsupported\n");
			return false;
		}

		return validate_uniform_address_write(validated_shader,
						      validation_state,
						      is_mul);

	case QPU_W_TLB_COLOR_MS:
	case QPU_W_TLB_COLOR_ALL:
	case QPU_W_TLB_Z:
		/* These only interact with the tile buffer, not main memory,
		 * so they're safe.
		 */
		return true;

	case QPU_W_TMU0_S:
	case QPU_W_TMU0_T:
	case QPU_W_TMU0_R:
	case QPU_W_TMU0_B:
	case QPU_W_TMU1_S:
	case QPU_W_TMU1_T:
	case QPU_W_TMU1_R:
	case QPU_W_TMU1_B:
		return check_tmu_write(validated_shader, validation_state,
				       is_mul);

	case QPU_W_HOST_INT:
	case QPU_W_TMU_NOSWAP:
	case QPU_W_TLB_ALPHA_MASK:
	case QPU_W_MUTEX_RELEASE:
		/* XXX: I haven't thought about these, so don't support them
		 * for now.
		 */
		DRM_ERROR("Unsupported waddr %d\n", waddr);
		return false;

	case QPU_W_VPM_ADDR:
		DRM_ERROR("General VPM DMA unsupported\n");
		return false;

	case QPU_W_VPM:
	case QPU_W_VPMVCD_SETUP:
		/* We allow VPM setup in general, even including VPM DMA
		 * configuration setup, because the (unsafe) DMA can only be
		 * triggered by QPU_W_VPM_ADDR writes.
		 */
		return true;

	case QPU_W_TLB_STENCIL_SETUP:
		return true;
	}

	return true;
}

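/* Tracks the clamp pattern required for direct TMU lookups: a MAX(x, 0)
 * followed by a MIN(x, uniform) leaves a register whose value is known to be
 * bounded by [0, uniform], and live_min_clamp_offsets[] remembers which
 * uniform supplied the upper bound. check_tmu_write() then requires that
 * register as the first argument of the direct TMU address add.
 */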
static void
track_live_clamps(struct vc4_validated_shader_info *validated_shader,
		  struct vc4_shader_validation_state *validation_state)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	uint32_t op_add = QPU_GET_FIELD(inst, QPU_OP_ADD);
	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
	uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
	uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
	uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
	bool ws = inst & QPU_WS;
	uint32_t lri_add_a, lri_add, lri_mul;
	bool add_a_is_min_0;

	/* Check whether OP_ADD's A argument comes from a live MAX(x, 0),
	 * before we clear previous live state.
	 */
	lri_add_a = raddr_add_a_to_live_reg_index(inst);
	add_a_is_min_0 = (lri_add_a != ~0 &&
			  validation_state->live_max_clamp_regs[lri_add_a]);

	/* Clear live state for registers written by our instruction. */
	lri_add = waddr_to_live_reg_index(waddr_add, ws);
	lri_mul = waddr_to_live_reg_index(waddr_mul, !ws);
	if (lri_mul != ~0) {
		validation_state->live_max_clamp_regs[lri_mul] = false;
		validation_state->live_min_clamp_offsets[lri_mul] = ~0;
	}
	if (lri_add != ~0) {
		validation_state->live_max_clamp_regs[lri_add] = false;
		validation_state->live_min_clamp_offsets[lri_add] = ~0;
	} else {
		/* Nothing further to do for live tracking, since only ADDs
		 * generate new live clamp registers.
		 */
		return;
	}

	/* Now, handle remaining live clamp tracking for the ADD operation. */

	if (cond_add != QPU_COND_ALWAYS)
		return;

	if (op_add == QPU_A_MAX) {
		/* Track live clamps of a value to a minimum of 0 (in either
		 * arg).
		 */
		if (sig != QPU_SIG_SMALL_IMM || raddr_b != 0 ||
		    (add_a != QPU_MUX_B && add_b != QPU_MUX_B)) {
			return;
		}

		validation_state->live_max_clamp_regs[lri_add] = true;
	} else if (op_add == QPU_A_MIN) {
		/* Track live clamps of a value clamped to a minimum of 0 and
		 * a maximum of some uniform's offset.
		 */
		if (!add_a_is_min_0)
			return;

		if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
		    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF &&
		      sig != QPU_SIG_SMALL_IMM)) {
			return;
		}

		validation_state->live_min_clamp_offsets[lri_add] =
			validated_shader->uniforms_size;
	}
}

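/* Validates both the ADD and MUL write-back paths of a non-branch
 * instruction, then updates the live clamp tracking. A single instruction
 * may not write TMU setup registers from both pipes at once.
 */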
static bool
check_instruction_writes(struct vc4_validated_shader_info *validated_shader,
			 struct vc4_shader_validation_state *validation_state)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
	bool ok;

	if (is_tmu_write(waddr_add) && is_tmu_write(waddr_mul)) {
		DRM_ERROR("ADD and MUL both set up textures\n");
		return false;
	}

	ok = (check_reg_write(validated_shader, validation_state, false) &&
	      check_reg_write(validated_shader, validation_state, true));

	track_live_clamps(validated_shader, validation_state);

	return ok;
}

static bool
check_branch(uint64_t inst,
	     struct vc4_validated_shader_info *validated_shader,
	     struct vc4_shader_validation_state *validation_state,
	     int ip)
{
	int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);

	if ((int)branch_imm < 0)
		validation_state->needs_uniform_address_for_loop = true;

	/* We don't want to have to worry about validation of this, and
	 * there's no need for it.
	 */
	if (waddr_add != QPU_W_NOP || waddr_mul != QPU_W_NOP) {
		DRM_ERROR("branch instruction at %d wrote a register.\n",
			  validation_state->ip);
		return false;
	}

	return true;
}

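/* Counts uniform stream reads (each QPU_R_UNIF read consumes 4 bytes of the
 * uniform stream) and makes sure they only happen while the uniforms address
 * is defined. Also flags reads of the upper half of the register files for
 * the threading check at the end of validation.
 */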
static bool
check_instruction_reads(struct vc4_validated_shader_info *validated_shader,
			struct vc4_shader_validation_state *validation_state)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

	if (raddr_a == QPU_R_UNIF ||
	    (raddr_b == QPU_R_UNIF && sig != QPU_SIG_SMALL_IMM)) {
		/* This can't overflow the uint32_t, because we're reading 8
		 * bytes of instruction to increment by 4 here, so we'd
		 * already be OOM.
		 */
		validated_shader->uniforms_size += 4;

		if (validation_state->needs_uniform_address_update) {
			DRM_ERROR("Uniform read with undefined uniform "
				  "address\n");
			return false;
		}
	}

	if ((raddr_a >= 16 && raddr_a < 32) ||
	    (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) {
		validation_state->all_registers_used = true;
	}

	return true;
}

/* Make sure that all branches are absolute and point within the shader, and
 * note their targets for later.
 */
static bool
vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
{
	uint32_t max_branch_target = 0;
	int ip;
	int last_branch = -2;

	for (ip = 0; ip < validation_state->max_ip; ip++) {
		uint64_t inst = validation_state->shader[ip];
		int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
		uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
		uint32_t after_delay_ip = ip + 4;
		uint32_t branch_target_ip;

		if (sig == QPU_SIG_PROG_END) {
			/* There are two delay slots after program end is
			 * signaled that are still executed, then we're
			 * finished. validation_state->max_ip is the
			 * instruction after the last valid instruction in the
			 * program.
			 */
			validation_state->max_ip = ip + 3;
			continue;
		}

		if (sig != QPU_SIG_BRANCH)
			continue;

		if (ip - last_branch < 4) {
			DRM_ERROR("Branch at %d during delay slots\n", ip);
			return false;
		}
		last_branch = ip;

		if (inst & QPU_BRANCH_REG) {
			DRM_ERROR("branching from register relative "
				  "not supported\n");
			return false;
		}

		if (!(inst & QPU_BRANCH_REL)) {
			DRM_ERROR("relative branching required\n");
			return false;
		}

		/* The actual branch target is the instruction after the delay
		 * slots, plus whatever byte offset is in the low 32 bits of
		 * the instruction. Make sure we're not branching beyond the
		 * end of the shader object.
		 */
		if (branch_imm % sizeof(inst) != 0) {
			DRM_ERROR("branch target not aligned\n");
			return false;
		}

		branch_target_ip = after_delay_ip + (branch_imm >> 3);
		if (branch_target_ip >= validation_state->max_ip) {
			DRM_ERROR("Branch at %d outside of shader (ip %d/%d)\n",
				  ip, branch_target_ip,
				  validation_state->max_ip);
			return false;
		}
		set_bit(branch_target_ip, validation_state->branch_targets);

		/* Make sure that the non-branching path is also not outside
		 * the shader.
		 */
		if (after_delay_ip >= validation_state->max_ip) {
			DRM_ERROR("Branch at %d continues past shader end "
				  "(%d/%d)\n",
				  ip, after_delay_ip, validation_state->max_ip);
			return false;
		}
		set_bit(after_delay_ip, validation_state->branch_targets);
		max_branch_target = max(max_branch_target, after_delay_ip);
	}

	if (max_branch_target > validation_state->max_ip - 3) {
		DRM_ERROR("Branch landed after QPU_SIG_PROG_END");
		return false;
	}

	return true;
}

/* Resets any known state for the shader, used when we may be branched to from
 * multiple locations in the program (or at shader start).
 */
static void
reset_validation_state(struct vc4_shader_validation_state *validation_state)
{
	int i;

	for (i = 0; i < 8; i++)
		validation_state->tmu_setup[i / 4].p_offset[i % 4] = ~0;

	for (i = 0; i < LIVE_REG_COUNT; i++) {
		validation_state->live_min_clamp_offsets[i] = ~0;
		validation_state->live_max_clamp_regs[i] = false;
		validation_state->live_immediates[i] = ~0;
	}
}

static bool
texturing_in_progress(struct vc4_shader_validation_state *validation_state)
{
	return (validation_state->tmu_write_count[0] != 0 ||
		validation_state->tmu_write_count[1] != 0);
}

static bool
vc4_handle_branch_target(struct vc4_shader_validation_state *validation_state)
{
	uint32_t ip = validation_state->ip;

	if (!test_bit(ip, validation_state->branch_targets))
		return true;

	if (texturing_in_progress(validation_state)) {
		DRM_ERROR("Branch target landed during TMU setup\n");
		return false;
	}

	/* Reset our live values tracking, since this instruction may have
	 * multiple predecessors.
	 *
	 * One could potentially do analysis to determine that, for
	 * example, all predecessors have a live max clamp in the same
	 * register, but we don't bother with that.
	 */
	reset_validation_state(validation_state);

	/* Since we've entered a basic block from potentially multiple
	 * predecessors, we need the uniforms address to be updated before any
	 * uniforms are read. We require that after any branch point, the next
	 * uniform to be loaded is a uniform address offset. That uniform's
	 * offset will be marked by the uniform address register write
	 * validation, or by the one-off end-of-program check.
	 */
	validation_state->needs_uniform_address_update = true;

	return true;
}

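/* Validates the contents of a shader BO, returning a freshly allocated
 * vc4_validated_shader_info that records how large the uniform stream is and
 * where texture samples need relocation, or NULL if the shader could not be
 * proven safe.
 */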
struct vc4_validated_shader_info *
vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
{
	bool found_shader_end = false;
	int shader_end_ip = 0;
	uint32_t last_thread_switch_ip = -3;
	uint32_t ip;
	struct vc4_validated_shader_info *validated_shader = NULL;
	struct vc4_shader_validation_state validation_state;

	memset(&validation_state, 0, sizeof(validation_state));
	validation_state.shader = shader_obj->vaddr;
	validation_state.max_ip = shader_obj->base.size / sizeof(uint64_t);

	reset_validation_state(&validation_state);

	validation_state.branch_targets =
		kcalloc(BITS_TO_LONGS(validation_state.max_ip),
			sizeof(unsigned long), GFP_KERNEL);
	if (!validation_state.branch_targets)
		goto fail;

	validated_shader = kcalloc(1, sizeof(*validated_shader), GFP_KERNEL);
	if (!validated_shader)
		goto fail;

	if (!vc4_validate_branches(&validation_state))
		goto fail;

	for (ip = 0; ip < validation_state.max_ip; ip++) {
		uint64_t inst = validation_state.shader[ip];
		uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

		validation_state.ip = ip;

		if (!vc4_handle_branch_target(&validation_state))
			goto fail;

		if (ip == last_thread_switch_ip + 3) {
			/* Reset r0-r3 live clamp data */
			int i;

			for (i = 64; i < LIVE_REG_COUNT; i++) {
				validation_state.live_min_clamp_offsets[i] = ~0;
				validation_state.live_max_clamp_regs[i] = false;
				validation_state.live_immediates[i] = ~0;
			}
		}

		switch (sig) {
		case QPU_SIG_NONE:
		case QPU_SIG_WAIT_FOR_SCOREBOARD:
		case QPU_SIG_SCOREBOARD_UNLOCK:
		case QPU_SIG_COLOR_LOAD:
		case QPU_SIG_LOAD_TMU0:
		case QPU_SIG_LOAD_TMU1:
		case QPU_SIG_PROG_END:
		case QPU_SIG_SMALL_IMM:
		case QPU_SIG_THREAD_SWITCH:
		case QPU_SIG_LAST_THREAD_SWITCH:
			if (!check_instruction_writes(validated_shader,
						      &validation_state)) {
				DRM_ERROR("Bad write at ip %d\n", ip);
				goto fail;
			}

			if (!check_instruction_reads(validated_shader,
						     &validation_state))
				goto fail;

			if (sig == QPU_SIG_PROG_END) {
				found_shader_end = true;
				shader_end_ip = ip;
			}

			if (sig == QPU_SIG_THREAD_SWITCH ||
			    sig == QPU_SIG_LAST_THREAD_SWITCH) {
				validated_shader->is_threaded = true;

				if (ip < last_thread_switch_ip + 3) {
					DRM_ERROR("Thread switch too soon after "
						  "last switch at ip %d\n", ip);
					goto fail;
				}
				last_thread_switch_ip = ip;
			}

			break;

		case QPU_SIG_LOAD_IMM:
			if (!check_instruction_writes(validated_shader,
						      &validation_state)) {
				DRM_ERROR("Bad LOAD_IMM write at ip %d\n", ip);
				goto fail;
			}
			break;

		case QPU_SIG_BRANCH:
			if (!check_branch(inst, validated_shader,
					  &validation_state, ip))
				goto fail;

			if (ip < last_thread_switch_ip + 3) {
				DRM_ERROR("Branch in thread switch at ip %d",
					  ip);
				goto fail;
			}

			break;
		default:
			DRM_ERROR("Unsupported QPU signal %d at "
				  "instruction %d\n", sig, ip);
			goto fail;
		}

		/* There are two delay slots after program end is signaled
		 * that are still executed, then we're finished.
		 */
		if (found_shader_end && ip == shader_end_ip + 2)
			break;
	}

	if (ip == validation_state.max_ip) {
		DRM_ERROR("shader failed to terminate before "
			  "shader BO end at %zd\n",
			  shader_obj->base.size);
		goto fail;
	}

	/* Might corrupt other thread */
	if (validated_shader->is_threaded &&
	    validation_state.all_registers_used) {
		DRM_ERROR("Shader uses threading, but uses the upper "
			  "half of the registers, too\n");
		goto fail;
	}

	/* If we did a backwards branch and we haven't emitted a uniforms
	 * reset since then, we still need the uniforms stream to have the
	 * uniforms address available so that the backwards branch can do its
	 * uniforms reset.
	 *
	 * We could potentially prove that the backwards branch doesn't
	 * contain any uses of uniforms until program exit, but that doesn't
	 * seem to be worth the trouble.
	 */
	if (validation_state.needs_uniform_address_for_loop) {
		if (!require_uniform_address_uniform(validated_shader))
			goto fail;
		validated_shader->uniforms_size += 4;
	}

	/* Again, no chance of integer overflow here because the worst case
	 * scenario is 8 bytes of uniforms plus handles per 8-byte
	 * instruction.
	 */
	validated_shader->uniforms_src_size =
		(validated_shader->uniforms_size +
		 4 * validated_shader->num_texture_samples);

	kfree(validation_state.branch_targets);

	return validated_shader;

fail:
	kfree(validation_state.branch_targets);
	if (validated_shader) {
		kfree(validated_shader->texture_samples);
		kfree(validated_shader);
	}
	return NULL;
}