1 /* 2 * Copyright © 2015 Thomas Helland 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24 #include "nir_loop_analyze.h" 25 #include "util/bitset.h" 26 #include "nir.h" 27 #include "nir_constant_expressions.h" 28 29 typedef enum { 30 undefined, 31 basic_induction 32 } nir_loop_variable_type; 33 34 typedef struct { 35 /* A link for the work list */ 36 struct list_head process_link; 37 38 bool in_loop; 39 40 /* The ssa_def associated with this info */ 41 nir_def *def; 42 43 /* The type of this ssa_def */ 44 nir_loop_variable_type type; 45 46 /* True if variable is in an if branch */ 47 bool in_if_branch; 48 49 /* True if variable is in a nested loop */ 50 bool in_nested_loop; 51 52 /* Could be a basic_induction if following uniforms are inlined */ 53 nir_src *init_src; 54 nir_alu_src *update_src; 55 56 /** 57 * SSA def of the phi-node associated with this induction variable. 58 * 59 * Every loop induction variable has an associated phi node in the loop 60 * header. This may point to the same SSA def as \c def. If, however, \c def 61 * is the increment of the induction variable, this will point to the SSA 62 * def being incremented. 63 */ 64 nir_def *basis; 65 } nir_loop_variable; 66 67 typedef struct { 68 /* The loop we store information for */ 69 nir_loop *loop; 70 71 /* Loop_variable for all ssa_defs in function */ 72 nir_loop_variable *loop_vars; 73 BITSET_WORD *loop_vars_init; 74 75 /* A list of the loop_vars to analyze */ 76 struct list_head process_list; 77 78 nir_variable_mode indirect_mask; 79 80 bool force_unroll_sampler_indirect; 81 } loop_info_state; 82 83 static nir_loop_variable * get_loop_var(nir_def * value,loop_info_state * state)84 get_loop_var(nir_def *value, loop_info_state *state) 85 { 86 nir_loop_variable *var = &(state->loop_vars[value->index]); 87 88 if (!BITSET_TEST(state->loop_vars_init, value->index)) { 89 var->in_loop = false; 90 var->def = value; 91 var->in_if_branch = false; 92 var->in_nested_loop = false; 93 var->init_src = NULL; 94 var->update_src = NULL; 95 var->type = undefined; 96 97 BITSET_SET(state->loop_vars_init, value->index); 98 } 99 100 return var; 101 } 102 103 typedef struct { 104 loop_info_state *state; 105 bool in_if_branch; 106 bool in_nested_loop; 107 } init_loop_state; 108 109 static bool init_loop_def(nir_def * def,void * void_init_loop_state)110 init_loop_def(nir_def *def, void *void_init_loop_state) 111 { 112 init_loop_state *loop_init_state = void_init_loop_state; 113 nir_loop_variable *var = get_loop_var(def, loop_init_state->state); 114 115 if (loop_init_state->in_nested_loop) { 116 var->in_nested_loop = true; 117 } else if (loop_init_state->in_if_branch) { 118 var->in_if_branch = true; 119 } else { 120 /* Add to the tail of the list. That way we start at the beginning of 121 * the defs in the loop instead of the end when walking the list. This 122 * means less recursive calls. Only add defs that are not in nested 123 * loops or conditional blocks. 124 */ 125 list_addtail(&var->process_link, &loop_init_state->state->process_list); 126 } 127 128 var->in_loop = true; 129 130 return true; 131 } 132 133 /** Calculate an estimated cost in number of instructions 134 * 135 * We do this so that we don't unroll loops which will later get massively 136 * inflated due to int64 or fp64 lowering. The estimates provided here don't 137 * have to be massively accurate; they just have to be good enough that loop 138 * unrolling doesn't cause things to blow up too much. 139 */ 140 static unsigned instr_cost(loop_info_state * state,nir_instr * instr,const nir_shader_compiler_options * options)141 instr_cost(loop_info_state *state, nir_instr *instr, 142 const nir_shader_compiler_options *options) 143 { 144 if (instr->type == nir_instr_type_intrinsic || 145 instr->type == nir_instr_type_tex) 146 return 1; 147 148 if (instr->type != nir_instr_type_alu) 149 return 0; 150 151 nir_alu_instr *alu = nir_instr_as_alu(instr); 152 const nir_op_info *info = &nir_op_infos[alu->op]; 153 unsigned cost = 1; 154 155 if (nir_op_is_selection(alu->op)) { 156 nir_scalar cond_scalar = { alu->src[0].src.ssa, 0 }; 157 if (nir_is_terminator_condition_with_two_inputs(cond_scalar)) { 158 nir_instr *sel_cond = alu->src[0].src.ssa->parent_instr; 159 nir_alu_instr *sel_alu = nir_instr_as_alu(sel_cond); 160 161 nir_scalar rhs, lhs; 162 lhs = nir_scalar_chase_alu_src(cond_scalar, 0); 163 rhs = nir_scalar_chase_alu_src(cond_scalar, 1); 164 165 /* If the selects condition is a comparision between a constant and 166 * a basic induction variable we know that it will be eliminated once 167 * the loop is unrolled so here we assign it a cost of 0. 168 */ 169 if ((nir_src_is_const(sel_alu->src[0].src) && 170 get_loop_var(rhs.def, state)->type == basic_induction) || 171 (nir_src_is_const(sel_alu->src[1].src) && 172 get_loop_var(lhs.def, state)->type == basic_induction)) { 173 /* Also if the selects condition is only used by the select then 174 * remove that alu instructons cost from the cost total also. 175 */ 176 if (!list_is_singular(&sel_alu->def.uses) || 177 nir_def_used_by_if(&sel_alu->def)) 178 return 0; 179 else 180 return -1; 181 } 182 } 183 } 184 185 if (alu->op == nir_op_flrp) { 186 if ((options->lower_flrp16 && alu->def.bit_size == 16) || 187 (options->lower_flrp32 && alu->def.bit_size == 32) || 188 (options->lower_flrp64 && alu->def.bit_size == 64)) 189 cost *= 3; 190 } 191 192 /* Assume everything 16 or 32-bit is cheap. 193 * 194 * There are no 64-bit ops that don't have a 64-bit thing as their 195 * destination or first source. 196 */ 197 if (alu->def.bit_size < 64 && 198 nir_src_bit_size(alu->src[0].src) < 64) 199 return cost; 200 201 bool is_fp64 = alu->def.bit_size == 64 && 202 nir_alu_type_get_base_type(info->output_type) == nir_type_float; 203 for (unsigned i = 0; i < info->num_inputs; i++) { 204 if (nir_src_bit_size(alu->src[i].src) == 64 && 205 nir_alu_type_get_base_type(info->input_types[i]) == nir_type_float) 206 is_fp64 = true; 207 } 208 209 if (is_fp64) { 210 /* If it's something lowered normally, it's expensive. */ 211 if (options->lower_doubles_options & 212 nir_lower_doubles_op_to_options_mask(alu->op)) 213 cost *= 20; 214 215 /* If it's full software, it's even more expensive */ 216 if (options->lower_doubles_options & nir_lower_fp64_full_software) { 217 cost *= 100; 218 state->loop->info->has_soft_fp64 = true; 219 } 220 221 return cost; 222 } else { 223 if (options->lower_int64_options & 224 nir_lower_int64_op_to_options_mask(alu->op)) { 225 /* These require a doing the division algorithm. */ 226 if (alu->op == nir_op_idiv || alu->op == nir_op_udiv || 227 alu->op == nir_op_imod || alu->op == nir_op_umod || 228 alu->op == nir_op_irem) 229 return cost * 100; 230 231 /* Other int64 lowering isn't usually all that expensive */ 232 return cost * 5; 233 } 234 235 return cost; 236 } 237 } 238 239 static bool init_loop_block(nir_block * block,loop_info_state * state,bool in_if_branch,bool in_nested_loop)240 init_loop_block(nir_block *block, loop_info_state *state, 241 bool in_if_branch, bool in_nested_loop) 242 { 243 init_loop_state init_state = { .in_if_branch = in_if_branch, 244 .in_nested_loop = in_nested_loop, 245 .state = state }; 246 247 nir_foreach_instr(instr, block) { 248 nir_foreach_def(instr, init_loop_def, &init_state); 249 } 250 251 return true; 252 } 253 254 static inline bool is_var_alu(nir_loop_variable * var)255 is_var_alu(nir_loop_variable *var) 256 { 257 return var->def->parent_instr->type == nir_instr_type_alu; 258 } 259 260 static inline bool is_var_phi(nir_loop_variable * var)261 is_var_phi(nir_loop_variable *var) 262 { 263 return var->def->parent_instr->type == nir_instr_type_phi; 264 } 265 266 /* If all of the instruction sources point to identical ALU instructions (as 267 * per nir_instrs_equal), return one of the ALU instructions. Otherwise, 268 * return NULL. 269 */ 270 static nir_alu_instr * phi_instr_as_alu(nir_phi_instr * phi)271 phi_instr_as_alu(nir_phi_instr *phi) 272 { 273 nir_alu_instr *first = NULL; 274 nir_foreach_phi_src(src, phi) { 275 if (src->src.ssa->parent_instr->type != nir_instr_type_alu) 276 return NULL; 277 278 nir_alu_instr *alu = nir_instr_as_alu(src->src.ssa->parent_instr); 279 if (first == NULL) { 280 first = alu; 281 } else { 282 if (!nir_instrs_equal(&first->instr, &alu->instr)) 283 return NULL; 284 } 285 } 286 287 return first; 288 } 289 290 static bool alu_src_has_identity_swizzle(nir_alu_instr * alu,unsigned src_idx)291 alu_src_has_identity_swizzle(nir_alu_instr *alu, unsigned src_idx) 292 { 293 assert(nir_op_infos[alu->op].input_sizes[src_idx] == 0); 294 for (unsigned i = 0; i < alu->def.num_components; i++) { 295 if (alu->src[src_idx].swizzle[i] != i) 296 return false; 297 } 298 299 return true; 300 } 301 302 static bool is_only_uniform_src(nir_src * src)303 is_only_uniform_src(nir_src *src) 304 { 305 nir_instr *instr = src->ssa->parent_instr; 306 307 switch (instr->type) { 308 case nir_instr_type_alu: { 309 /* Return true if all sources return true. */ 310 nir_alu_instr *alu = nir_instr_as_alu(instr); 311 for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) { 312 if (!is_only_uniform_src(&alu->src[i].src)) 313 return false; 314 } 315 return true; 316 } 317 318 case nir_instr_type_intrinsic: { 319 nir_intrinsic_instr *inst = nir_instr_as_intrinsic(instr); 320 /* current uniform inline only support load ubo */ 321 return inst->intrinsic == nir_intrinsic_load_ubo; 322 } 323 324 case nir_instr_type_load_const: 325 /* Always return true for constants. */ 326 return true; 327 328 default: 329 return false; 330 } 331 } 332 333 static bool compute_induction_information(loop_info_state * state)334 compute_induction_information(loop_info_state *state) 335 { 336 unsigned num_induction_vars = 0; 337 338 list_for_each_entry_safe(nir_loop_variable, var, &state->process_list, 339 process_link) { 340 341 /* Things in nested loops or conditionals should not have been added into 342 * the procss_list. 343 */ 344 assert(!var->in_if_branch && !var->in_nested_loop); 345 346 /* We are only interested in checking phis for the basic induction 347 * variable case as its simple to detect. All basic induction variables 348 * have a phi node 349 */ 350 if (!is_var_phi(var)) 351 continue; 352 353 nir_phi_instr *phi = nir_instr_as_phi(var->def->parent_instr); 354 355 nir_loop_variable *alu_src_var = NULL; 356 nir_foreach_phi_src(src, phi) { 357 nir_loop_variable *src_var = get_loop_var(src->src.ssa, state); 358 359 /* If one of the sources is in an if branch or nested loop then don't 360 * attempt to go any further. 361 */ 362 if (src_var->in_if_branch || src_var->in_nested_loop) 363 break; 364 365 /* Detect inductions variables that are incremented in both branches 366 * of an unnested if rather than in a loop block. 367 */ 368 if (is_var_phi(src_var)) { 369 nir_phi_instr *src_phi = 370 nir_instr_as_phi(src_var->def->parent_instr); 371 nir_alu_instr *src_phi_alu = phi_instr_as_alu(src_phi); 372 if (src_phi_alu) { 373 src_var = get_loop_var(&src_phi_alu->def, state); 374 if (!src_var->in_if_branch) 375 break; 376 } 377 } 378 379 if (!src_var->in_loop && !var->init_src) { 380 var->init_src = &src->src; 381 } else if (is_var_alu(src_var) && !var->update_src) { 382 alu_src_var = src_var; 383 nir_alu_instr *alu = nir_instr_as_alu(src_var->def->parent_instr); 384 385 /* Check for unsupported alu operations */ 386 if (alu->op != nir_op_iadd && alu->op != nir_op_fadd && 387 alu->op != nir_op_imul && alu->op != nir_op_fmul && 388 alu->op != nir_op_ishl && alu->op != nir_op_ishr && 389 alu->op != nir_op_ushr) 390 break; 391 392 if (nir_op_infos[alu->op].num_inputs == 2) { 393 for (unsigned i = 0; i < 2; i++) { 394 /* Is one of the operands const or uniform, and the other the phi. 395 * The phi source can't be swizzled in any way. 396 */ 397 if (alu->src[1 - i].src.ssa == &phi->def && 398 alu_src_has_identity_swizzle(alu, 1 - i)) { 399 if (is_only_uniform_src(&alu->src[i].src)) 400 var->update_src = alu->src + i; 401 } 402 } 403 } 404 405 if (!var->update_src) 406 break; 407 } else { 408 var->update_src = NULL; 409 break; 410 } 411 } 412 413 if (var->update_src && var->init_src && 414 is_only_uniform_src(var->init_src)) { 415 alu_src_var->init_src = var->init_src; 416 alu_src_var->update_src = var->update_src; 417 alu_src_var->basis = var->def; 418 alu_src_var->type = basic_induction; 419 420 var->basis = var->def; 421 var->type = basic_induction; 422 423 num_induction_vars += 2; 424 } else { 425 var->init_src = NULL; 426 var->update_src = NULL; 427 var->basis = NULL; 428 } 429 } 430 431 nir_loop_info *info = state->loop->info; 432 ralloc_free(info->induction_vars); 433 info->num_induction_vars = 0; 434 435 /* record induction variables into nir_loop_info */ 436 if (num_induction_vars) { 437 info->induction_vars = ralloc_array(info, nir_loop_induction_variable, 438 num_induction_vars); 439 440 list_for_each_entry(nir_loop_variable, var, &state->process_list, 441 process_link) { 442 if (var->type == basic_induction) { 443 nir_loop_induction_variable *ivar = 444 &info->induction_vars[info->num_induction_vars++]; 445 ivar->def = var->def; 446 ivar->init_src = var->init_src; 447 ivar->update_src = var->update_src; 448 } 449 } 450 /* don't overflow */ 451 assert(info->num_induction_vars <= num_induction_vars); 452 } 453 454 return num_induction_vars != 0; 455 } 456 457 static bool find_loop_terminators(loop_info_state * state)458 find_loop_terminators(loop_info_state *state) 459 { 460 bool success = false; 461 foreach_list_typed_safe(nir_cf_node, node, node, &state->loop->body) { 462 if (node->type == nir_cf_node_if) { 463 nir_if *nif = nir_cf_node_as_if(node); 464 465 nir_block *break_blk = NULL; 466 nir_block *continue_from_blk = NULL; 467 bool continue_from_then = true; 468 469 nir_block *last_then = nir_if_last_then_block(nif); 470 nir_block *last_else = nir_if_last_else_block(nif); 471 if (nir_block_ends_in_break(last_then)) { 472 break_blk = last_then; 473 continue_from_blk = last_else; 474 continue_from_then = false; 475 } else if (nir_block_ends_in_break(last_else)) { 476 break_blk = last_else; 477 continue_from_blk = last_then; 478 } 479 480 /* If there is a break then we should find a terminator. If we can 481 * not find a loop terminator, but there is a break-statement then 482 * we should return false so that we do not try to find trip-count 483 */ 484 if (!nir_is_trivial_loop_if(nif, break_blk)) { 485 state->loop->info->complex_loop = true; 486 return false; 487 } 488 489 /* Continue if the if contained no jumps at all */ 490 if (!break_blk) 491 continue; 492 493 if (nif->condition.ssa->parent_instr->type == nir_instr_type_phi) { 494 state->loop->info->complex_loop = true; 495 return false; 496 } 497 498 nir_loop_terminator *terminator = 499 rzalloc(state->loop->info, nir_loop_terminator); 500 501 list_addtail(&terminator->loop_terminator_link, 502 &state->loop->info->loop_terminator_list); 503 504 terminator->nif = nif; 505 terminator->break_block = break_blk; 506 terminator->continue_from_block = continue_from_blk; 507 terminator->continue_from_then = continue_from_then; 508 terminator->conditional_instr = nif->condition.ssa->parent_instr; 509 510 success = true; 511 } 512 } 513 514 return success; 515 } 516 517 /* This function looks for an array access within a loop that uses an 518 * induction variable for the array index. If found it returns the size of the 519 * array, otherwise 0 is returned. If we find an induction var we pass it back 520 * to the caller via array_index_out. 521 */ 522 static unsigned find_array_access_via_induction(loop_info_state * state,nir_deref_instr * deref,nir_loop_variable ** array_index_out)523 find_array_access_via_induction(loop_info_state *state, 524 nir_deref_instr *deref, 525 nir_loop_variable **array_index_out) 526 { 527 for (nir_deref_instr *d = deref; d; d = nir_deref_instr_parent(d)) { 528 if (d->deref_type != nir_deref_type_array) 529 continue; 530 531 nir_loop_variable *array_index = get_loop_var(d->arr.index.ssa, state); 532 533 if (array_index->type != basic_induction) 534 continue; 535 536 if (array_index_out) 537 *array_index_out = array_index; 538 539 nir_deref_instr *parent = nir_deref_instr_parent(d); 540 541 if (glsl_type_is_array_or_matrix(parent->type)) { 542 return glsl_get_length(parent->type); 543 } else { 544 assert(glsl_type_is_vector(parent->type)); 545 return glsl_get_vector_elements(parent->type); 546 } 547 } 548 549 return 0; 550 } 551 552 static bool guess_loop_limit(loop_info_state * state,nir_const_value * limit_val,nir_scalar basic_ind)553 guess_loop_limit(loop_info_state *state, nir_const_value *limit_val, 554 nir_scalar basic_ind) 555 { 556 unsigned min_array_size = 0; 557 558 nir_foreach_block_in_cf_node(block, &state->loop->cf_node) { 559 nir_foreach_instr(instr, block) { 560 if (instr->type != nir_instr_type_intrinsic) 561 continue; 562 563 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); 564 565 /* Check for arrays variably-indexed by a loop induction variable. */ 566 if (intrin->intrinsic == nir_intrinsic_load_deref || 567 intrin->intrinsic == nir_intrinsic_store_deref || 568 intrin->intrinsic == nir_intrinsic_copy_deref) { 569 570 nir_loop_variable *array_idx = NULL; 571 unsigned array_size = 572 find_array_access_via_induction(state, 573 nir_src_as_deref(intrin->src[0]), 574 &array_idx); 575 if (array_idx && basic_ind.def == array_idx->def && 576 (min_array_size == 0 || min_array_size > array_size)) { 577 /* Array indices are scalars */ 578 assert(basic_ind.def->num_components == 1); 579 min_array_size = array_size; 580 } 581 582 if (intrin->intrinsic != nir_intrinsic_copy_deref) 583 continue; 584 585 array_size = 586 find_array_access_via_induction(state, 587 nir_src_as_deref(intrin->src[1]), 588 &array_idx); 589 if (array_idx && basic_ind.def == array_idx->def && 590 (min_array_size == 0 || min_array_size > array_size)) { 591 /* Array indices are scalars */ 592 assert(basic_ind.def->num_components == 1); 593 min_array_size = array_size; 594 } 595 } 596 } 597 } 598 599 if (min_array_size) { 600 *limit_val = nir_const_value_for_uint(min_array_size, 601 basic_ind.def->bit_size); 602 return true; 603 } 604 605 return false; 606 } 607 608 static nir_op invert_comparison_if_needed(nir_op alu_op, bool invert); 609 610 /* Returns whether "limit_op(a, b) alu_op c" is equivalent to "(a alu_op c) || (b alu_op c)". */ 611 static bool is_minmax_compatible(nir_op limit_op,nir_op alu_op,bool limit_rhs,bool invert_cond)612 is_minmax_compatible(nir_op limit_op, nir_op alu_op, bool limit_rhs, bool invert_cond) 613 { 614 bool is_max; 615 switch (limit_op) { 616 case nir_op_imin: 617 case nir_op_fmin: 618 case nir_op_umin: 619 is_max = false; 620 break; 621 case nir_op_imax: 622 case nir_op_fmax: 623 case nir_op_umax: 624 is_max = true; 625 break; 626 default: 627 return false; 628 } 629 630 if (nir_op_infos[limit_op].input_types[0] != nir_op_infos[alu_op].input_types[0]) 631 return false; 632 633 /* Comparisons we can split are: 634 * - min(a, b) < c 635 * - c < max(a, b) 636 * - max(a, b) >= c 637 * - c >= min(a, b) 638 */ 639 switch (invert_comparison_if_needed(alu_op, invert_cond)) { 640 case nir_op_ilt: 641 case nir_op_flt: 642 case nir_op_ult: 643 return (!limit_rhs && !is_max) || (limit_rhs && is_max); 644 case nir_op_ige: 645 case nir_op_fge: 646 case nir_op_uge: 647 return (!limit_rhs && is_max) || (limit_rhs && !is_max); 648 default: 649 return false; 650 } 651 } 652 653 static bool try_find_limit_of_alu(nir_scalar limit,nir_const_value * limit_val,nir_op alu_op,bool invert_cond,nir_loop_terminator * terminator,loop_info_state * state)654 try_find_limit_of_alu(nir_scalar limit, nir_const_value *limit_val, nir_op alu_op, 655 bool invert_cond, nir_loop_terminator *terminator, 656 loop_info_state *state) 657 { 658 if (!nir_scalar_is_alu(limit)) 659 return false; 660 661 nir_op limit_op = nir_scalar_alu_op(limit); 662 if (is_minmax_compatible(limit_op, alu_op, !terminator->induction_rhs, invert_cond)) { 663 for (unsigned i = 0; i < 2; i++) { 664 nir_scalar src = nir_scalar_chase_alu_src(limit, i); 665 if (nir_scalar_is_const(src)) { 666 *limit_val = nir_scalar_as_const_value(src); 667 terminator->exact_trip_count_unknown = true; 668 return true; 669 } 670 } 671 } 672 673 return false; 674 } 675 676 static nir_const_value eval_const_unop(nir_op op,unsigned bit_size,nir_const_value src0,unsigned execution_mode)677 eval_const_unop(nir_op op, unsigned bit_size, nir_const_value src0, 678 unsigned execution_mode) 679 { 680 assert(nir_op_infos[op].num_inputs == 1); 681 nir_const_value dest; 682 nir_const_value *src[1] = { &src0 }; 683 nir_eval_const_opcode(op, &dest, 1, bit_size, src, execution_mode); 684 return dest; 685 } 686 687 static nir_const_value eval_const_binop(nir_op op,unsigned bit_size,nir_const_value src0,nir_const_value src1,unsigned execution_mode)688 eval_const_binop(nir_op op, unsigned bit_size, 689 nir_const_value src0, nir_const_value src1, 690 unsigned execution_mode) 691 { 692 assert(nir_op_infos[op].num_inputs == 2); 693 nir_const_value dest; 694 nir_const_value *src[2] = { &src0, &src1 }; 695 nir_eval_const_opcode(op, &dest, 1, bit_size, src, execution_mode); 696 return dest; 697 } 698 699 static int find_replacement(const nir_scalar * originals,nir_scalar key,unsigned num_replacements)700 find_replacement(const nir_scalar *originals, nir_scalar key, 701 unsigned num_replacements) 702 { 703 for (int i = 0; i < num_replacements; i++) { 704 if (nir_scalar_equal(originals[i], key)) 705 return i; 706 } 707 708 return -1; 709 } 710 711 /** 712 * Try to evaluate an ALU instruction as a constant with a replacement 713 * 714 * Much like \c nir_opt_constant_folding.c:try_fold_alu, this method attempts 715 * to evaluate an ALU instruction as a constant. There are two significant 716 * differences. 717 * 718 * First, this method performs the evaluation recursively. If any source of 719 * the ALU instruction is not itself a constant, it is first evaluated. 720 * 721 * Second, if the SSA value \c original is encountered as a source of the ALU 722 * instruction, the value \c replacement is substituted. 723 * 724 * The intended purpose of this function is to evaluate an arbitrary 725 * expression involving a loop induction variable. In this case, \c original 726 * would be the phi node associated with the induction variable, and 727 * \c replacement is the initial value of the induction variable. 728 * 729 * \returns true if the ALU instruction can be evaluated as constant (after 730 * applying the previously described substitution) or false otherwise. 731 */ 732 static bool try_eval_const_alu(nir_const_value * dest,nir_scalar alu_s,const nir_scalar * originals,const nir_const_value * replacements,unsigned num_replacements,unsigned execution_mode)733 try_eval_const_alu(nir_const_value *dest, nir_scalar alu_s, const nir_scalar *originals, 734 const nir_const_value *replacements, 735 unsigned num_replacements, unsigned execution_mode) 736 { 737 nir_alu_instr *alu = nir_instr_as_alu(alu_s.def->parent_instr); 738 739 if (nir_op_infos[alu->op].output_size) 740 return false; 741 742 /* In the case that any outputs/inputs have unsized types, then we need to 743 * guess the bit-size. In this case, the validator ensures that all 744 * bit-sizes match so we can just take the bit-size from first 745 * output/input with an unsized type. If all the outputs/inputs are sized 746 * then we don't need to guess the bit-size at all because the code we 747 * generate for constant opcodes in this case already knows the sizes of 748 * the types involved and does not need the provided bit-size for anything 749 * (although it still requires to receive a valid bit-size). 750 */ 751 unsigned bit_size = 0; 752 if (!nir_alu_type_get_type_size(nir_op_infos[alu->op].output_type)) { 753 bit_size = alu->def.bit_size; 754 } else { 755 for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) { 756 if (!nir_alu_type_get_type_size(nir_op_infos[alu->op].input_types[i])) 757 bit_size = alu->src[i].src.ssa->bit_size; 758 } 759 760 if (bit_size == 0) 761 bit_size = 32; 762 } 763 764 nir_const_value src[NIR_MAX_VEC_COMPONENTS]; 765 nir_const_value *src_ptrs[NIR_MAX_VEC_COMPONENTS]; 766 767 for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) { 768 nir_scalar src_s = nir_scalar_chase_alu_src(alu_s, i); 769 770 src_ptrs[i] = &src[i]; 771 if (nir_scalar_is_const(src_s)) { 772 src[i] = nir_scalar_as_const_value(src_s); 773 continue; 774 } 775 776 int r = find_replacement(originals, src_s, num_replacements); 777 if (r >= 0) { 778 src[i] = replacements[r]; 779 } else if (!nir_scalar_is_alu(src_s) || 780 !try_eval_const_alu(&src[i], src_s, 781 originals, replacements, 782 num_replacements, execution_mode)) { 783 return false; 784 } 785 } 786 787 nir_eval_const_opcode(alu->op, dest, 1, bit_size, src_ptrs, execution_mode); 788 789 return true; 790 } 791 792 static nir_op invert_comparison_if_needed(nir_op alu_op,bool invert)793 invert_comparison_if_needed(nir_op alu_op, bool invert) 794 { 795 if (!invert) 796 return alu_op; 797 798 switch (alu_op) { 799 case nir_op_fge: 800 return nir_op_flt; 801 case nir_op_ige: 802 return nir_op_ilt; 803 case nir_op_uge: 804 return nir_op_ult; 805 case nir_op_flt: 806 return nir_op_fge; 807 case nir_op_ilt: 808 return nir_op_ige; 809 case nir_op_ult: 810 return nir_op_uge; 811 case nir_op_feq: 812 return nir_op_fneu; 813 case nir_op_ieq: 814 return nir_op_ine; 815 case nir_op_fneu: 816 return nir_op_feq; 817 case nir_op_ine: 818 return nir_op_ieq; 819 default: 820 unreachable("Unsuported comparison!"); 821 } 822 } 823 824 static int32_t get_iteration(nir_op cond_op,nir_const_value initial,nir_const_value step,nir_const_value limit,bool invert_cond,unsigned bit_size,unsigned execution_mode)825 get_iteration(nir_op cond_op, nir_const_value initial, nir_const_value step, 826 nir_const_value limit, bool invert_cond, unsigned bit_size, 827 unsigned execution_mode) 828 { 829 nir_const_value span, iter; 830 unsigned iter_bit_size = bit_size; 831 832 switch (invert_comparison_if_needed(cond_op, invert_cond)) { 833 case nir_op_ine: 834 /* In order for execution to be here, limit must be the same as initial. 835 * Otherwise will_break_on_first_iteration would have returned false. 836 * If step is zero, the loop is infinite. Otherwise the loop will 837 * execute once. 838 */ 839 return step.u64 == 0 ? -1 : 1; 840 841 case nir_op_ige: 842 case nir_op_ilt: 843 case nir_op_ieq: 844 span = eval_const_binop(nir_op_isub, bit_size, limit, initial, 845 execution_mode); 846 iter = eval_const_binop(nir_op_idiv, bit_size, span, step, 847 execution_mode); 848 break; 849 850 case nir_op_uge: 851 case nir_op_ult: 852 span = eval_const_binop(nir_op_isub, bit_size, limit, initial, 853 execution_mode); 854 iter = eval_const_binop(nir_op_udiv, bit_size, span, step, 855 execution_mode); 856 break; 857 858 case nir_op_fneu: 859 /* In order for execution to be here, limit must be the same as initial. 860 * Otherwise will_break_on_first_iteration would have returned false. 861 * If step is zero, the loop is infinite. Otherwise the loop will 862 * execute once. 863 * 864 * This is a little more tricky for floating point since X-Y might still 865 * be X even if Y is not zero. Instead check that (initial + step) != 866 * initial. 867 */ 868 span = eval_const_binop(nir_op_fadd, bit_size, initial, step, 869 execution_mode); 870 iter = eval_const_binop(nir_op_feq, bit_size, initial, 871 span, execution_mode); 872 873 /* return (initial + step) == initial ? -1 : 1 */ 874 return iter.b ? -1 : 1; 875 876 case nir_op_fge: 877 case nir_op_flt: 878 case nir_op_feq: 879 span = eval_const_binop(nir_op_fsub, bit_size, limit, initial, 880 execution_mode); 881 iter = eval_const_binop(nir_op_fdiv, bit_size, span, 882 step, execution_mode); 883 iter = eval_const_unop(nir_op_f2i64, bit_size, iter, execution_mode); 884 iter_bit_size = 64; 885 break; 886 887 default: 888 return -1; 889 } 890 891 uint64_t iter_u64 = nir_const_value_as_uint(iter, iter_bit_size); 892 return iter_u64 > u_intN_max(iter_bit_size) ? -1 : (int)iter_u64; 893 } 894 895 static int32_t get_iteration_empirical(nir_scalar cond,nir_alu_instr * incr_alu,nir_scalar basis,nir_const_value initial,nir_scalar limit_basis,nir_const_value limit,bool invert_cond,unsigned execution_mode,unsigned max_unroll_iterations)896 get_iteration_empirical(nir_scalar cond, nir_alu_instr *incr_alu, 897 nir_scalar basis, nir_const_value initial, 898 nir_scalar limit_basis, nir_const_value limit, 899 bool invert_cond, unsigned execution_mode, 900 unsigned max_unroll_iterations) 901 { 902 int iter_count = 0; 903 nir_const_value result; 904 905 const nir_scalar incr = nir_get_scalar(&incr_alu->def, basis.comp); 906 907 const nir_scalar original[] = {basis, limit_basis}; 908 nir_const_value replacement[] = {initial, limit}; 909 910 while (iter_count <= max_unroll_iterations) { 911 bool success; 912 913 success = try_eval_const_alu(&result, cond, original, replacement, 914 2, execution_mode); 915 if (!success) 916 return -1; 917 918 const bool cond_succ = invert_cond ? !result.b : result.b; 919 if (cond_succ) 920 return iter_count; 921 922 iter_count++; 923 924 success = try_eval_const_alu(&result, incr, original, replacement, 925 2, execution_mode); 926 assert(success); 927 928 replacement[0] = result; 929 } 930 931 return -1; 932 } 933 934 static bool will_break_on_first_iteration(nir_scalar cond,nir_scalar basis,nir_scalar limit_basis,nir_const_value initial,nir_const_value limit,bool invert_cond,unsigned execution_mode)935 will_break_on_first_iteration(nir_scalar cond, nir_scalar basis, 936 nir_scalar limit_basis, 937 nir_const_value initial, nir_const_value limit, 938 bool invert_cond, unsigned execution_mode) 939 { 940 nir_const_value result; 941 942 const nir_scalar originals[2] = { basis, limit_basis }; 943 const nir_const_value replacements[2] = { initial, limit }; 944 945 ASSERTED bool success = try_eval_const_alu(&result, cond, originals, 946 replacements, 2, execution_mode); 947 948 assert(success); 949 950 return invert_cond ? !result.b : result.b; 951 } 952 953 static bool test_iterations(int32_t iter_int,nir_const_value step,nir_const_value limit,nir_op cond_op,unsigned bit_size,nir_alu_type induction_base_type,nir_const_value initial,bool limit_rhs,bool invert_cond,unsigned execution_mode)954 test_iterations(int32_t iter_int, nir_const_value step, 955 nir_const_value limit, nir_op cond_op, unsigned bit_size, 956 nir_alu_type induction_base_type, 957 nir_const_value initial, bool limit_rhs, bool invert_cond, 958 unsigned execution_mode) 959 { 960 assert(nir_op_infos[cond_op].num_inputs == 2); 961 962 nir_const_value iter_src; 963 nir_op mul_op; 964 nir_op add_op; 965 switch (induction_base_type) { 966 case nir_type_float: 967 iter_src = nir_const_value_for_float(iter_int, bit_size); 968 mul_op = nir_op_fmul; 969 add_op = nir_op_fadd; 970 break; 971 case nir_type_int: 972 case nir_type_uint: 973 iter_src = nir_const_value_for_int(iter_int, bit_size); 974 mul_op = nir_op_imul; 975 add_op = nir_op_iadd; 976 break; 977 default: 978 unreachable("Unhandled induction variable base type!"); 979 } 980 981 /* Multiple the iteration count we are testing by the number of times we 982 * step the induction variable each iteration. 983 */ 984 nir_const_value mul_result = 985 eval_const_binop(mul_op, bit_size, iter_src, step, execution_mode); 986 987 /* Add the initial value to the accumulated induction variable total */ 988 nir_const_value add_result = 989 eval_const_binop(add_op, bit_size, mul_result, initial, execution_mode); 990 991 nir_const_value *src[2]; 992 src[limit_rhs ? 0 : 1] = &add_result; 993 src[limit_rhs ? 1 : 0] = &limit; 994 995 /* Evaluate the loop exit condition */ 996 nir_const_value result; 997 nir_eval_const_opcode(cond_op, &result, 1, bit_size, src, execution_mode); 998 999 return invert_cond ? !result.b : result.b; 1000 } 1001 1002 static int calculate_iterations(nir_scalar basis,nir_scalar limit_basis,nir_const_value initial,nir_const_value step,nir_const_value limit,nir_alu_instr * alu,nir_scalar cond,nir_op alu_op,bool limit_rhs,bool invert_cond,unsigned execution_mode,unsigned max_unroll_iterations)1003 calculate_iterations(nir_scalar basis, nir_scalar limit_basis, 1004 nir_const_value initial, nir_const_value step, 1005 nir_const_value limit, nir_alu_instr *alu, 1006 nir_scalar cond, nir_op alu_op, bool limit_rhs, 1007 bool invert_cond, unsigned execution_mode, 1008 unsigned max_unroll_iterations) 1009 { 1010 /* nir_op_isub should have been lowered away by this point */ 1011 assert(alu->op != nir_op_isub); 1012 1013 /* Make sure the alu type for our induction variable is compatible with the 1014 * conditional alus input type. If its not something has gone really wrong. 1015 */ 1016 nir_alu_type induction_base_type = 1017 nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type); 1018 if (induction_base_type == nir_type_int || induction_base_type == nir_type_uint) { 1019 assert(nir_alu_type_get_base_type(nir_op_infos[alu_op].input_types[1]) == nir_type_int || 1020 nir_alu_type_get_base_type(nir_op_infos[alu_op].input_types[1]) == nir_type_uint); 1021 } else { 1022 assert(nir_alu_type_get_base_type(nir_op_infos[alu_op].input_types[0]) == 1023 induction_base_type); 1024 } 1025 1026 /* do-while loops can increment the starting value before the condition is 1027 * checked. e.g. 1028 * 1029 * do { 1030 * ndx++; 1031 * } while (ndx < 3); 1032 * 1033 * Here we check if the induction variable is used directly by the loop 1034 * condition and if so we assume we need to step the initial value. 1035 */ 1036 unsigned trip_offset = 0; 1037 nir_alu_instr *cond_alu = nir_instr_as_alu(cond.def->parent_instr); 1038 if (cond_alu->src[0].src.ssa == &alu->def || 1039 cond_alu->src[1].src.ssa == &alu->def) { 1040 trip_offset = 1; 1041 } 1042 1043 unsigned bit_size = nir_src_bit_size(alu->src[0].src); 1044 1045 /* get_iteration works under assumption that iterator will be 1046 * incremented or decremented until it hits the limit, 1047 * however if the loop condition is false on the first iteration 1048 * get_iteration's assumption is broken. Handle such loops first. 1049 */ 1050 if (will_break_on_first_iteration(cond, basis, limit_basis, initial, 1051 limit, invert_cond, execution_mode)) { 1052 return 0; 1053 } 1054 1055 /* For loops incremented with addition operation, it's easy to 1056 * calculate the number of iterations theoretically. Even though it 1057 * is possible for other operations as well, it is much more error 1058 * prone, and doesn't cover all possible cases. So, we try to 1059 * emulate the loop. 1060 */ 1061 int iter_int; 1062 switch (alu->op) { 1063 case nir_op_iadd: 1064 case nir_op_fadd: 1065 assert(nir_src_bit_size(alu->src[0].src) == 1066 nir_src_bit_size(alu->src[1].src)); 1067 1068 iter_int = get_iteration(alu_op, initial, step, limit, invert_cond, 1069 bit_size, execution_mode); 1070 break; 1071 case nir_op_fmul: 1072 /* Detecting non-zero loop counts when the loop increment is floating 1073 * point multiplication triggers a preexisting problem in 1074 * glsl-fs-loop-unroll-mul-fp64.shader_test. See 1075 * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3445#note_1779438. 1076 */ 1077 return -1; 1078 case nir_op_imul: 1079 case nir_op_ishl: 1080 case nir_op_ishr: 1081 case nir_op_ushr: 1082 return get_iteration_empirical(cond, alu, basis, initial, 1083 limit_basis, limit, invert_cond, 1084 execution_mode, max_unroll_iterations); 1085 default: 1086 unreachable("Invalid induction variable increment operation."); 1087 } 1088 1089 /* If iter_int is negative the loop is ill-formed or is the conditional is 1090 * unsigned with a huge iteration count so don't bother going any further. 1091 */ 1092 if (iter_int < 0) 1093 return -1; 1094 1095 nir_op actual_alu_op = invert_comparison_if_needed(alu_op, invert_cond); 1096 if (actual_alu_op == nir_op_ine || actual_alu_op == nir_op_fneu) 1097 return iter_int; 1098 1099 /* An explanation from the GLSL unrolling pass: 1100 * 1101 * Make sure that the calculated number of iterations satisfies the exit 1102 * condition. This is needed to catch off-by-one errors and some types of 1103 * ill-formed loops. For example, we need to detect that the following 1104 * loop does not have a maximum iteration count. 1105 * 1106 * for (float x = 0.0; x != 0.9; x += 0.2); 1107 */ 1108 for (int bias = -1; bias <= 1; bias++) { 1109 const int iter_bias = iter_int + bias; 1110 if (iter_bias < 1) 1111 continue; 1112 1113 if (test_iterations(iter_bias, step, limit, alu_op, bit_size, 1114 induction_base_type, initial, 1115 limit_rhs, invert_cond, execution_mode)) { 1116 return iter_bias - trip_offset; 1117 } 1118 } 1119 1120 return -1; 1121 } 1122 1123 static bool get_induction_and_limit_vars(nir_scalar cond,nir_scalar * ind,nir_scalar * limit,bool * limit_rhs,loop_info_state * state)1124 get_induction_and_limit_vars(nir_scalar cond, 1125 nir_scalar *ind, 1126 nir_scalar *limit, 1127 bool *limit_rhs, 1128 loop_info_state *state) 1129 { 1130 nir_scalar rhs, lhs; 1131 lhs = nir_scalar_chase_alu_src(cond, 0); 1132 rhs = nir_scalar_chase_alu_src(cond, 1); 1133 1134 nir_loop_variable *src0_lv = get_loop_var(lhs.def, state); 1135 nir_loop_variable *src1_lv = get_loop_var(rhs.def, state); 1136 1137 if (src0_lv->type == basic_induction) { 1138 if (!nir_src_is_const(*src0_lv->init_src)) 1139 return false; 1140 1141 *ind = lhs; 1142 *limit = rhs; 1143 *limit_rhs = true; 1144 return true; 1145 } else if (src1_lv->type == basic_induction) { 1146 if (!nir_src_is_const(*src1_lv->init_src)) 1147 return false; 1148 1149 *ind = rhs; 1150 *limit = lhs; 1151 *limit_rhs = false; 1152 return true; 1153 } else { 1154 return false; 1155 } 1156 } 1157 1158 static bool try_find_trip_count_vars_in_logical_op(nir_scalar * cond,nir_scalar * ind,nir_scalar * limit,bool * limit_rhs,loop_info_state * state)1159 try_find_trip_count_vars_in_logical_op(nir_scalar *cond, 1160 nir_scalar *ind, 1161 nir_scalar *limit, 1162 bool *limit_rhs, 1163 loop_info_state *state) 1164 { 1165 const nir_op alu_op = nir_scalar_alu_op(*cond); 1166 bool exit_loop_on_false = alu_op == nir_op_ieq || alu_op == nir_op_inot; 1167 nir_scalar logical_op = exit_loop_on_false ? 1168 nir_scalar_chase_alu_src(*cond, 0) : *cond; 1169 1170 if (alu_op == nir_op_ieq) { 1171 nir_scalar zero = nir_scalar_chase_alu_src(*cond, 1); 1172 1173 if (!nir_scalar_is_alu(logical_op) || !nir_scalar_is_const(zero)) { 1174 /* Maybe we had it the wrong way, flip things around */ 1175 nir_scalar tmp = zero; 1176 zero = logical_op; 1177 logical_op = tmp; 1178 1179 /* If we still didn't find what we need then return */ 1180 if (!nir_scalar_is_const(zero)) 1181 return false; 1182 } 1183 1184 /* If the loop is not breaking on (x && y) == 0 then return */ 1185 if (nir_scalar_as_uint(zero) != 0) 1186 return false; 1187 } 1188 1189 if (!nir_scalar_is_alu(logical_op)) 1190 return false; 1191 1192 if ((exit_loop_on_false && (nir_scalar_alu_op(logical_op) != nir_op_iand)) || 1193 (!exit_loop_on_false && (nir_scalar_alu_op(logical_op) != nir_op_ior))) 1194 return false; 1195 1196 /* Check if iand src is a terminator condition and try get induction var 1197 * and trip limit var. 1198 */ 1199 bool found_induction_var = false; 1200 for (unsigned i = 0; i < 2; i++) { 1201 nir_scalar src = nir_scalar_chase_alu_src(logical_op, i); 1202 if (nir_is_terminator_condition_with_two_inputs(src) && 1203 get_induction_and_limit_vars(src, ind, limit, limit_rhs, state)) { 1204 *cond = src; 1205 found_induction_var = true; 1206 1207 /* If we've found one with a constant limit, stop. */ 1208 if (nir_scalar_is_const(*limit)) 1209 return true; 1210 } 1211 } 1212 1213 return found_induction_var; 1214 } 1215 1216 /* Run through each of the terminators of the loop and try to infer a possible 1217 * trip-count. We need to check them all, and set the lowest trip-count as the 1218 * trip-count of our loop. If one of the terminators has an undecidable 1219 * trip-count we can not safely assume anything about the duration of the 1220 * loop. 1221 */ 1222 static void find_trip_count(loop_info_state * state,unsigned execution_mode,unsigned max_unroll_iterations)1223 find_trip_count(loop_info_state *state, unsigned execution_mode, 1224 unsigned max_unroll_iterations) 1225 { 1226 bool trip_count_known = true; 1227 bool guessed_trip_count = false; 1228 nir_loop_terminator *limiting_terminator = NULL; 1229 int max_trip_count = -1; 1230 1231 list_for_each_entry(nir_loop_terminator, terminator, 1232 &state->loop->info->loop_terminator_list, 1233 loop_terminator_link) { 1234 nir_scalar cond = { terminator->nif->condition.ssa, 0 }; 1235 1236 if (!nir_scalar_is_alu(cond)) { 1237 /* If we get here the loop is dead and will get cleaned up by the 1238 * nir_opt_dead_cf pass. 1239 */ 1240 trip_count_known = false; 1241 terminator->exact_trip_count_unknown = true; 1242 continue; 1243 } 1244 1245 nir_op alu_op = nir_scalar_alu_op(cond); 1246 1247 bool invert_cond = terminator->continue_from_then; 1248 1249 bool limit_rhs; 1250 nir_scalar basic_ind = { NULL, 0 }; 1251 nir_scalar limit; 1252 1253 if ((alu_op == nir_op_inot || alu_op == nir_op_ieq || alu_op == nir_op_ior) && 1254 try_find_trip_count_vars_in_logical_op(&cond, &basic_ind, &limit, 1255 &limit_rhs, state)) { 1256 1257 /* The loop is exiting on (x && y) == 0 so we need to get the 1258 * inverse of x or y (i.e. which ever contained the induction var) in 1259 * order to compute the trip count. 1260 */ 1261 if (alu_op == nir_op_inot || alu_op == nir_op_ieq) 1262 invert_cond = !invert_cond; 1263 1264 alu_op = nir_scalar_alu_op(cond); 1265 trip_count_known = false; 1266 terminator->conditional_instr = cond.def->parent_instr; 1267 terminator->exact_trip_count_unknown = true; 1268 } 1269 1270 if (!basic_ind.def) { 1271 if (nir_is_supported_terminator_condition(cond)) { 1272 /* Extract and inverse the comparision if it is wrapped in an inot 1273 */ 1274 if (alu_op == nir_op_inot) { 1275 cond = nir_scalar_chase_alu_src(cond, 0); 1276 alu_op = nir_scalar_alu_op(cond); 1277 invert_cond = !invert_cond; 1278 } 1279 1280 get_induction_and_limit_vars(cond, &basic_ind, 1281 &limit, &limit_rhs, state); 1282 } 1283 } 1284 1285 /* The comparison has to have a basic induction variable for us to be 1286 * able to find trip counts. 1287 */ 1288 if (!basic_ind.def) { 1289 trip_count_known = false; 1290 terminator->exact_trip_count_unknown = true; 1291 continue; 1292 } 1293 1294 terminator->induction_rhs = !limit_rhs; 1295 1296 /* Attempt to find a constant limit for the loop */ 1297 nir_const_value limit_val; 1298 if (nir_scalar_is_const(limit)) { 1299 limit_val = nir_scalar_as_const_value(limit); 1300 } else { 1301 trip_count_known = false; 1302 1303 if (!try_find_limit_of_alu(limit, &limit_val, alu_op, invert_cond, terminator, state)) { 1304 /* Guess loop limit based on array access */ 1305 if (!guess_loop_limit(state, &limit_val, basic_ind)) { 1306 terminator->exact_trip_count_unknown = true; 1307 continue; 1308 } 1309 1310 guessed_trip_count = true; 1311 } 1312 } 1313 1314 /* We have determined that we have the following constants: 1315 * (With the typical int i = 0; i < x; i++; as an example) 1316 * - Upper limit. 1317 * - Starting value 1318 * - Step / iteration size 1319 * Thats all thats needed to calculate the trip-count 1320 */ 1321 1322 nir_loop_variable *lv = get_loop_var(basic_ind.def, state); 1323 1324 /* The basic induction var might be a vector but, because we guarantee 1325 * earlier that the phi source has a scalar swizzle, we can take the 1326 * component from basic_ind. 1327 */ 1328 nir_scalar initial_s = { lv->init_src->ssa, basic_ind.comp }; 1329 nir_scalar alu_s = { 1330 lv->update_src->src.ssa, 1331 lv->update_src->swizzle[basic_ind.comp] 1332 }; 1333 1334 /* We are not guaranteed by that at one of these sources is a constant. 1335 * Try to find one. 1336 */ 1337 if (!nir_scalar_is_const(initial_s) || 1338 !nir_scalar_is_const(alu_s)) 1339 continue; 1340 1341 nir_const_value initial_val = nir_scalar_as_const_value(initial_s); 1342 nir_const_value step_val = nir_scalar_as_const_value(alu_s); 1343 1344 int iterations = calculate_iterations(nir_get_scalar(lv->basis, basic_ind.comp), limit, 1345 initial_val, step_val, limit_val, 1346 nir_instr_as_alu(nir_src_parent_instr(&lv->update_src->src)), 1347 cond, 1348 alu_op, limit_rhs, 1349 invert_cond, 1350 execution_mode, 1351 max_unroll_iterations); 1352 1353 /* Where we not able to calculate the iteration count */ 1354 if (iterations == -1) { 1355 trip_count_known = false; 1356 guessed_trip_count = false; 1357 terminator->exact_trip_count_unknown = true; 1358 continue; 1359 } 1360 1361 if (guessed_trip_count) { 1362 guessed_trip_count = false; 1363 terminator->exact_trip_count_unknown = true; 1364 if (state->loop->info->guessed_trip_count == 0 || 1365 state->loop->info->guessed_trip_count > iterations) 1366 state->loop->info->guessed_trip_count = iterations; 1367 1368 continue; 1369 } 1370 1371 /* If this is the first run or we have found a smaller amount of 1372 * iterations than previously (we have identified a more limiting 1373 * terminator) set the trip count and limiting terminator. 1374 */ 1375 if (max_trip_count == -1 || iterations < max_trip_count) { 1376 max_trip_count = iterations; 1377 limiting_terminator = terminator; 1378 } 1379 } 1380 1381 state->loop->info->exact_trip_count_known = trip_count_known; 1382 if (max_trip_count > -1) 1383 state->loop->info->max_trip_count = max_trip_count; 1384 state->loop->info->limiting_terminator = limiting_terminator; 1385 } 1386 1387 static bool force_unroll_array_access(loop_info_state * state,nir_deref_instr * deref,bool contains_sampler)1388 force_unroll_array_access(loop_info_state *state, nir_deref_instr *deref, 1389 bool contains_sampler) 1390 { 1391 unsigned array_size = find_array_access_via_induction(state, deref, NULL); 1392 if (array_size) { 1393 if ((array_size == state->loop->info->max_trip_count) && 1394 nir_deref_mode_must_be(deref, nir_var_shader_in | 1395 nir_var_shader_out | 1396 nir_var_shader_temp | 1397 nir_var_function_temp)) 1398 return true; 1399 1400 if (nir_deref_mode_must_be(deref, state->indirect_mask)) 1401 return true; 1402 1403 if (contains_sampler && state->force_unroll_sampler_indirect) 1404 return true; 1405 } 1406 1407 return false; 1408 } 1409 1410 static bool force_unroll_heuristics(loop_info_state * state,nir_block * block)1411 force_unroll_heuristics(loop_info_state *state, nir_block *block) 1412 { 1413 nir_foreach_instr(instr, block) { 1414 if (instr->type == nir_instr_type_tex) { 1415 nir_tex_instr *tex_instr = nir_instr_as_tex(instr); 1416 int sampler_idx = 1417 nir_tex_instr_src_index(tex_instr, 1418 nir_tex_src_sampler_deref); 1419 1420 if (sampler_idx >= 0) { 1421 nir_deref_instr *deref = 1422 nir_instr_as_deref(tex_instr->src[sampler_idx].src.ssa->parent_instr); 1423 if (force_unroll_array_access(state, deref, true)) 1424 return true; 1425 } 1426 } 1427 1428 if (instr->type != nir_instr_type_intrinsic) 1429 continue; 1430 1431 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); 1432 1433 /* Check for arrays variably-indexed by a loop induction variable. 1434 * Unrolling the loop may convert that access into constant-indexing. 1435 */ 1436 if (intrin->intrinsic == nir_intrinsic_load_deref || 1437 intrin->intrinsic == nir_intrinsic_store_deref || 1438 intrin->intrinsic == nir_intrinsic_copy_deref) { 1439 if (force_unroll_array_access(state, 1440 nir_src_as_deref(intrin->src[0]), 1441 false)) 1442 return true; 1443 1444 if (intrin->intrinsic == nir_intrinsic_copy_deref && 1445 force_unroll_array_access(state, 1446 nir_src_as_deref(intrin->src[1]), 1447 false)) 1448 return true; 1449 } 1450 } 1451 1452 return false; 1453 } 1454 1455 static void get_loop_info(loop_info_state * state,nir_function_impl * impl)1456 get_loop_info(loop_info_state *state, nir_function_impl *impl) 1457 { 1458 nir_shader *shader = impl->function->shader; 1459 const nir_shader_compiler_options *options = shader->options; 1460 1461 /* Add all entries in the outermost part of the loop to the processing list 1462 * Mark the entries in conditionals or in nested loops accordingly 1463 */ 1464 foreach_list_typed_safe(nir_cf_node, node, node, &state->loop->body) { 1465 switch (node->type) { 1466 1467 case nir_cf_node_block: 1468 init_loop_block(nir_cf_node_as_block(node), state, false, false); 1469 break; 1470 1471 case nir_cf_node_if: 1472 nir_foreach_block_in_cf_node(block, node) 1473 init_loop_block(block, state, true, false); 1474 break; 1475 1476 case nir_cf_node_loop: 1477 nir_foreach_block_in_cf_node(block, node) { 1478 init_loop_block(block, state, false, true); 1479 } 1480 break; 1481 1482 case nir_cf_node_function: 1483 break; 1484 } 1485 } 1486 1487 /* Try to find all simple terminators of the loop. If we can't find any, 1488 * or we find possible terminators that have side effects then bail. 1489 */ 1490 if (!find_loop_terminators(state)) { 1491 list_for_each_entry_safe(nir_loop_terminator, terminator, 1492 &state->loop->info->loop_terminator_list, 1493 loop_terminator_link) { 1494 list_del(&terminator->loop_terminator_link); 1495 ralloc_free(terminator); 1496 } 1497 return; 1498 } 1499 1500 if (!compute_induction_information(state)) 1501 return; 1502 1503 /* Run through each of the terminators and try to compute a trip-count */ 1504 find_trip_count(state, 1505 impl->function->shader->info.float_controls_execution_mode, 1506 impl->function->shader->options->max_unroll_iterations); 1507 1508 nir_foreach_block_in_cf_node(block, &state->loop->cf_node) { 1509 nir_foreach_instr(instr, block) { 1510 state->loop->info->instr_cost += instr_cost(state, instr, options); 1511 } 1512 1513 if (state->loop->info->force_unroll) 1514 continue; 1515 1516 if (force_unroll_heuristics(state, block)) { 1517 state->loop->info->force_unroll = true; 1518 } 1519 } 1520 } 1521 1522 static loop_info_state * initialize_loop_info_state(nir_loop * loop,void * mem_ctx,nir_function_impl * impl)1523 initialize_loop_info_state(nir_loop *loop, void *mem_ctx, 1524 nir_function_impl *impl) 1525 { 1526 loop_info_state *state = rzalloc(mem_ctx, loop_info_state); 1527 state->loop_vars = ralloc_array(mem_ctx, nir_loop_variable, 1528 impl->ssa_alloc); 1529 state->loop_vars_init = rzalloc_array(mem_ctx, BITSET_WORD, 1530 BITSET_WORDS(impl->ssa_alloc)); 1531 state->loop = loop; 1532 1533 list_inithead(&state->process_list); 1534 1535 if (loop->info) 1536 ralloc_free(loop->info); 1537 1538 loop->info = rzalloc(loop, nir_loop_info); 1539 1540 list_inithead(&loop->info->loop_terminator_list); 1541 1542 return state; 1543 } 1544 1545 static void process_loops(nir_cf_node * cf_node,nir_variable_mode indirect_mask,bool force_unroll_sampler_indirect)1546 process_loops(nir_cf_node *cf_node, nir_variable_mode indirect_mask, 1547 bool force_unroll_sampler_indirect) 1548 { 1549 switch (cf_node->type) { 1550 case nir_cf_node_block: 1551 return; 1552 case nir_cf_node_if: { 1553 nir_if *if_stmt = nir_cf_node_as_if(cf_node); 1554 foreach_list_typed(nir_cf_node, nested_node, node, &if_stmt->then_list) 1555 process_loops(nested_node, indirect_mask, force_unroll_sampler_indirect); 1556 foreach_list_typed(nir_cf_node, nested_node, node, &if_stmt->else_list) 1557 process_loops(nested_node, indirect_mask, force_unroll_sampler_indirect); 1558 return; 1559 } 1560 case nir_cf_node_loop: { 1561 nir_loop *loop = nir_cf_node_as_loop(cf_node); 1562 assert(!nir_loop_has_continue_construct(loop)); 1563 1564 foreach_list_typed(nir_cf_node, nested_node, node, &loop->body) 1565 process_loops(nested_node, indirect_mask, force_unroll_sampler_indirect); 1566 break; 1567 } 1568 default: 1569 unreachable("unknown cf node type"); 1570 } 1571 1572 nir_loop *loop = nir_cf_node_as_loop(cf_node); 1573 nir_function_impl *impl = nir_cf_node_get_function(cf_node); 1574 void *mem_ctx = ralloc_context(NULL); 1575 1576 loop_info_state *state = initialize_loop_info_state(loop, mem_ctx, impl); 1577 state->indirect_mask = indirect_mask; 1578 state->force_unroll_sampler_indirect = force_unroll_sampler_indirect; 1579 1580 get_loop_info(state, impl); 1581 1582 ralloc_free(mem_ctx); 1583 } 1584 1585 void nir_loop_analyze_impl(nir_function_impl * impl,nir_variable_mode indirect_mask,bool force_unroll_sampler_indirect)1586 nir_loop_analyze_impl(nir_function_impl *impl, 1587 nir_variable_mode indirect_mask, 1588 bool force_unroll_sampler_indirect) 1589 { 1590 nir_index_ssa_defs(impl); 1591 foreach_list_typed(nir_cf_node, node, node, &impl->body) 1592 process_loops(node, indirect_mask, force_unroll_sampler_indirect); 1593 } 1594