/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

/**
 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
 */
bool
brw_fs_lower_constant_loads(fs_visitor &s)
{
   unsigned index, pull_index;
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      /* Set up the annotation tracking for new generated instructions. */
      const fs_builder ibld(&s, block, inst);

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         /* We'll handle this case later */
         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
            continue;

         if (!s.get_pull_locs(inst->src[i], &index, &pull_index))
            continue;

         assert(inst->src[i].stride == 0);

         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
         const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
         const brw_reg dst = ubld.vgrf(BRW_TYPE_UD);
         const unsigned base = pull_index * 4;

         brw_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
         srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index);
         srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1));
         srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz);

         ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst,
                   srcs, PULL_UNIFORM_CONSTANT_SRCS);

         /* Rewrite the instruction to use the temporary VGRF. */
         inst->src[i].file = VGRF;
         inst->src[i].nr = dst.nr;
         inst->src[i].offset = (base & (block_sz - 1)) +
                               inst->src[i].offset % 4;

         progress = true;
      }

      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
          inst->src[0].file == UNIFORM) {
         if (!s.get_pull_locs(inst->src[0], &index, &pull_index))
            continue;

         ibld.VARYING_PULL_CONSTANT_LOAD(inst->dst,
                                         brw_imm_ud(index),
                                         brw_reg() /* surface_handle */,
                                         inst->src[1],
                                         pull_index * 4, 4, 1);
         inst->remove(block);

         progress = true;
      }
   }

   s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

bool
brw_fs_lower_load_payload(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      assert(inst->dst.file == VGRF);
      assert(inst->saturate == false);
      brw_reg dst = inst->dst;

      const fs_builder ibld(&s, block, inst);
      const fs_builder ubld = ibld.exec_all();

      for (uint8_t i = 0; i < inst->header_size;) {
         /* Number of header GRFs to initialize at once with a single MOV
          * instruction.
          */
         const unsigned n =
            (i + 1 < inst->header_size && inst->src[i].stride == 1 &&
             inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?
            2 : 1;

         if (inst->src[i].file != BAD_FILE)
            ubld.group(8 * n, 0).MOV(retype(dst, BRW_TYPE_UD),
                                     retype(inst->src[i], BRW_TYPE_UD));

         dst = byte_offset(dst, n * REG_SIZE);
         i += n;
      }

      for (uint8_t i = inst->header_size; i < inst->sources; i++) {
         dst.type = inst->src[i].type;
         if (inst->src[i].file != BAD_FILE) {
            ibld.MOV(dst, inst->src[i]);
         }
         dst = offset(dst, ibld, 1);
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

/**
 * Lower CSEL with unsupported types to CMP+SEL.
 *
 * Or, for unsigned ==/!= comparisons, simply change the types.
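 *
 * As a rough, purely illustrative sketch (the assembly-style spelling below
 * is approximate and assumes the default f0.0 flag; the conditional modifier
 * carried over is whatever the original CSEL used), an unsupported
 *
 *    csel.nz(8)  dst, src0, src1, src2
 *
 * is turned into approximately
 *
 *    cmp.nz.f0(8)  null, src2, 0
 *    (+f0) sel(8)  dst, src0, src1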
 */
bool
brw_fs_lower_csel(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != BRW_OPCODE_CSEL)
         continue;

      bool supported = false;
      enum brw_reg_type orig_type = inst->src[2].type;
      enum brw_reg_type new_type = orig_type;

      switch (orig_type) {
      case BRW_TYPE_F:
         /* Gfx9 CSEL can only do F */
         supported = true;
         break;

      case BRW_TYPE_HF:
      case BRW_TYPE_W:
      case BRW_TYPE_D:
         /* Gfx11+ CSEL can do HF, W, and D.  Note that we can't simply
          * retype integer ==/!= comparisons as float on earlier hardware
          * because it breaks for 0x80000000 and 0 (-0.0 == 0.0).
          */
         supported = devinfo->ver >= 11;
         break;

      case BRW_TYPE_UW:
      case BRW_TYPE_UD:
         /* CSEL doesn't support UW/UD but we can simply retype to use the
          * signed types when comparing with == or !=.
          */
         supported = devinfo->ver >= 11 &&
                     (inst->conditional_mod == BRW_CONDITIONAL_EQ ||
                      inst->conditional_mod == BRW_CONDITIONAL_NEQ);

         /* Bspec 47408: Gfx12.5+ CSEL supports both signed and unsigned
          * integer types.
          */
         if (devinfo->verx10 < 125) {
            new_type = inst->src[2].type == BRW_TYPE_UD ?
                       BRW_TYPE_D : BRW_TYPE_W;
         }
         break;

      default:
         break;
      }

      if (!supported) {
         const fs_builder ibld(&s, block, inst);

         /* CSEL: dst = src2 <op> 0 ? src0 : src1 */
         brw_reg zero = brw_imm_reg(orig_type);
         ibld.CMP(retype(brw_null_reg(), orig_type),
                  inst->src[2], zero, inst->conditional_mod);

         inst->opcode = BRW_OPCODE_SEL;
         inst->predicate = BRW_PREDICATE_NORMAL;
         inst->conditional_mod = BRW_CONDITIONAL_NONE;
         inst->resize_sources(2);
         progress = true;
      } else if (new_type != orig_type) {
         inst->src[0].type = new_type;
         inst->src[1].type = new_type;
         inst->src[2].type = new_type;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

bool
brw_fs_lower_sub_sat(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      const fs_builder ibld(&s, block, inst);

      if (inst->opcode == SHADER_OPCODE_USUB_SAT ||
          inst->opcode == SHADER_OPCODE_ISUB_SAT) {
         /* The fundamental problem is the hardware performs source negation
          * at the bit width of the source.  If the source is 0x80000000D,
          * the negation is 0x80000000D.  As a result, subtractSaturate(0,
          * 0x80000000) will produce 0x80000000 instead of 0x7fffffff.  There
          * are at least three ways to resolve this:
          *
          * 1. Use the accumulator for the negated source.  The accumulator
          *    is 33 bits, so our source 0x80000000 is sign-extended to
          *    0x180000000.  The negation of which is 0x080000000.  This
          *    doesn't help for 64-bit integers (which are already bigger
          *    than 33 bits).  There are also only 8 accumulators, so SIMD16
          *    or SIMD32 instructions would have to be split into multiple
          *    SIMD8 instructions.
          *
          * 2. Use slightly different math.  For any n-bit value x, we know
          *    (x >> 1) is never the minimum n-bit value, so negating it
          *    cannot overflow.  We can use this fact to only do subtractions
          *    involving (x >> 1).  subtractSaturate(a, b) ==
          *    subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
          *
          * 3. For unsigned sources, it is sufficient to replace the
          *    subtractSaturate with (a > b) ? a - b : 0.
          *
          * It may also be possible to use the SUBB instruction.  This
          * implicitly writes the accumulator, so it could only be used in
          * the same situations as #1 above.  It is further limited by only
          * allowing UD sources.
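          *
          * As a purely illustrative sanity check of approach #2 with 8-bit
          * signed values: for a = 0 and b = -128, b >> 1 = -64, so
          * subtractSaturate(0, -64) = 64 and b - (b >> 1) = -64, giving
          * subtractSaturate(64, -64) = 127, the expected saturated result,
          * whereas directly negating b would wrap back to -128.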
          */
         if (inst->exec_size == 8 && inst->src[0].type != BRW_TYPE_Q &&
             inst->src[0].type != BRW_TYPE_UQ) {
            brw_reg acc = retype(brw_acc_reg(inst->exec_size),
                                 inst->src[1].type);

            ibld.MOV(acc, inst->src[1]);
            fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
            add->saturate = true;
            add->src[0].negate = true;
         } else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) {
            /* tmp = src1 >> 1;
             * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
             */
            fs_inst *add;

            brw_reg tmp = ibld.vgrf(inst->src[0].type);
            ibld.SHR(tmp, inst->src[1], brw_imm_d(1));

            brw_reg s1_sub_t = ibld.ADD(inst->src[1], negate(tmp));
            brw_reg sat_s0_sub_t = ibld.ADD(inst->src[0], negate(tmp), &add);
            add->saturate = true;

            add = ibld.ADD(inst->dst, sat_s0_sub_t, negate(s1_sub_t));
            add->saturate = true;
         } else {
            /* a > b ? a - b : 0 */
            ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
                     BRW_CONDITIONAL_G);

            fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
            add->src[1].negate = !add->src[1].negate;

            ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0))
               ->predicate = BRW_PREDICATE_NORMAL;
         }

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Transform barycentric vectors into the interleaved form expected by the PLN
 * instruction and returned by the Gfx7+ PI shared function.
 *
 * For channels 0-15 in SIMD16 mode they are expected to be laid out as
 * follows in the register file:
 *
 *    rN+0: X[0-7]
 *    rN+1: Y[0-7]
 *    rN+2: X[8-15]
 *    rN+3: Y[8-15]
 *
 * There is no need to handle SIMD32 here -- This is expected to be run after
 * SIMD lowering, since SIMD lowering relies on vectors having the standard
 * component layout.
 */
bool
brw_fs_lower_barycentrics(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;

   if (s.stage != MESA_SHADER_FRAGMENT || devinfo->ver >= 20)
      return false;

   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->exec_size < 16)
         continue;

      const fs_builder ibld(&s, block, inst);
      const fs_builder ubld = ibld.exec_all().group(8, 0);

      switch (inst->opcode) {
      case BRW_OPCODE_PLN: {
         assert(inst->exec_size == 16);
         const brw_reg tmp = ibld.vgrf(inst->src[1].type, 2);
         brw_reg srcs[4];

         for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
            srcs[i] = horiz_offset(offset(inst->src[1], ibld, i % 2),
                                   8 * (i / 2));

         ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));

         inst->src[1] = tmp;
         progress = true;
         break;
      }
      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
         assert(inst->exec_size == 16);
         const brw_reg tmp = ibld.vgrf(inst->dst.type, 2);

         for (unsigned i = 0; i < 2; i++) {
            for (unsigned g = 0; g < inst->exec_size / 8; g++) {
               fs_inst *mov =
                  ibld.at(block, inst->next).group(8, g)
                      .MOV(horiz_offset(offset(inst->dst, ibld, i), 8 * g),
                           offset(tmp, ubld, 2 * g + i));
               mov->predicate = inst->predicate;
               mov->predicate_inverse = inst->predicate_inverse;
               mov->flag_subreg = inst->flag_subreg;
            }
         }

         inst->dst = tmp;
         progress = true;
         break;
      }
      default:
         break;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Lower a derivative instruction as the floating-point difference of two
 * swizzles of the source, specified as \p swz0 and \p swz1.
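 *
 * Purely as an illustration (the temporary names and the assembly-style
 * spelling below are invented), a coarse ddx lowered through this helper
 * ends up roughly as:
 *
 *    quad_swizzle  tmp0, src, XXXX
 *    quad_swizzle  tmp1, src, YYYY
 *    add           dst, -tmp0, tmp1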
 */
static bool
lower_derivative(fs_visitor &s, bblock_t *block, fs_inst *inst,
                 unsigned swz0, unsigned swz1)
{
   const fs_builder ubld = fs_builder(&s, block, inst).exec_all();
   const brw_reg tmp0 = ubld.vgrf(inst->src[0].type);
   const brw_reg tmp1 = ubld.vgrf(inst->src[0].type);

   ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0));
   ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1));

   inst->resize_sources(2);
   inst->src[0] = negate(tmp0);
   inst->src[1] = tmp1;
   inst->opcode = BRW_OPCODE_ADD;

   return true;
}

/**
 * Lower derivative instructions on platforms where codegen cannot implement
 * them efficiently (i.e. XeHP).
 */
bool
brw_fs_lower_derivatives(fs_visitor &s)
{
   bool progress = false;

   if (s.devinfo->verx10 < 125)
      return false;

   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == FS_OPCODE_DDX_COARSE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY);

      else if (inst->opcode == FS_OPCODE_DDX_FINE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW);

      else if (inst->opcode == FS_OPCODE_DDY_COARSE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ);

      else if (inst->opcode == FS_OPCODE_DDY_FINE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW);
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

bool
brw_fs_lower_find_live_channel(fs_visitor &s)
{
   bool progress = false;

   bool packed_dispatch =
      brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                    s.prog_data);
   bool vmask =
      s.stage == MESA_SHADER_FRAGMENT &&
      brw_wm_prog_data(s.prog_data)->uses_vmask;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_FIND_LIVE_CHANNEL &&
          inst->opcode != SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL &&
          inst->opcode != SHADER_OPCODE_LOAD_LIVE_CHANNELS)
         continue;

      bool first = inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL;

      /* Getting the first active channel index is easy on Gfx8: Just find
       * the first bit set in the execution mask.  The register exists on
       * HSW already but it reads back as all ones when the current
       * instruction has execution masking disabled, so it's kind of
       * useless there.
       */
      const fs_builder ibld(&s, block, inst);
      if (!inst->is_partial_write())
         ibld.emit_undef_for_dst(inst);

      const fs_builder ubld =
         fs_builder(&s, block, inst).exec_all().group(1, 0);

      brw_reg exec_mask = ubld.vgrf(BRW_TYPE_UD);
      ubld.UNDEF(exec_mask);
      ubld.emit(SHADER_OPCODE_READ_ARCH_REG, exec_mask,
                retype(brw_mask_reg(0), BRW_TYPE_UD));

      /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
       * so combine the execution and dispatch masks to obtain the true mask.
       *
       * If we're looking for the first live channel, and we have packed
       * dispatch, we can skip this step, as we know all dispatched channels
       * will appear at the front of the mask.
       */
      if (!(first && packed_dispatch)) {
         brw_reg mask = ubld.vgrf(BRW_TYPE_UD);
         ubld.UNDEF(mask);
         ubld.emit(SHADER_OPCODE_READ_ARCH_REG, mask,
                   retype(brw_sr0_reg(vmask ? 3 : 2), BRW_TYPE_UD));

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first/last active channel relative to the
          * specified quarter control as result.
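          *
          * For example (illustrative): for an instruction with group == 16,
          * the dispatch mask read above is shifted right by 16 below so that
          * its bit 0 lines up with channel 16, matching how ce0 already
          * behaves for that quarter.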
          */
         if (inst->group > 0)
            ubld.SHR(mask, mask, brw_imm_ud(ALIGN(inst->group, 8)));

         ubld.AND(mask, exec_mask, mask);
         exec_mask = mask;
      }

      switch (inst->opcode) {
      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         ubld.FBL(inst->dst, exec_mask);
         break;

      case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: {
         brw_reg tmp = ubld.vgrf(BRW_TYPE_UD);
         ubld.UNDEF(tmp);
         ubld.LZD(tmp, exec_mask);
         ubld.ADD(inst->dst, negate(tmp), brw_imm_uw(31));
         break;
      }

      case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
         ubld.MOV(inst->dst, exec_mask);
         break;

      default:
         unreachable("Impossible.");
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * From the Skylake PRM Vol. 2a docs for sends:
 *
 *    "It is required that the second block of GRFs does not overlap with the
 *    first block."
 *
 * There are plenty of cases where we may accidentally violate this due to
 * having, for instance, both sources be the constant 0.  This little pass
 * just adds a new vgrf for the second payload and copies it over.
 */
bool
brw_fs_lower_sends_overlapping_payload(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
          regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
                          inst->src[3], inst->ex_mlen * REG_SIZE)) {
         const unsigned arg = inst->mlen < inst->ex_mlen ? 2 : 3;
         const unsigned len = MIN2(inst->mlen, inst->ex_mlen);

         brw_reg tmp = brw_vgrf(s.alloc.allocate(len), BRW_TYPE_UD);

         /* Sadly, we've lost all notion of channels and bit sizes at this
          * point.  Just WE_all it.
          */
         const fs_builder ibld =
            fs_builder(&s, block, inst).exec_all().group(16, 0);
         brw_reg copy_src = retype(inst->src[arg], BRW_TYPE_UD);
         brw_reg copy_dst = tmp;
         for (unsigned i = 0; i < len; i += 2) {
            if (len == i + 1) {
               /* Only one register left; do SIMD8 */
               ibld.group(8, 0).MOV(copy_dst, copy_src);
            } else {
               ibld.MOV(copy_dst, copy_src);
            }

            copy_src = offset(copy_src, ibld, 1);
            copy_dst = offset(copy_dst, ibld, 1);
         }
         inst->src[arg] = tmp;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Three-source instructions must have a GRF destination register.
 * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
 */
bool
brw_fs_lower_3src_null_dest(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->is_3src(s.compiler) && inst->dst.is_null()) {
         inst->dst = brw_vgrf(s.alloc.allocate(s.dispatch_width / 8),
                              inst->dst.type);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
                            DEPENDENCY_VARIABLES);

   return progress;
}

static bool
unsupported_64bit_type(const intel_device_info *devinfo,
                       enum brw_reg_type type)
{
   return (!devinfo->has_64bit_float && type == BRW_TYPE_DF) ||
          (!devinfo->has_64bit_int && (type == BRW_TYPE_UQ ||
                                       type == BRW_TYPE_Q));
}

/**
 * Perform lowering to legalize the IR for various ALU restrictions.
 *
 * For example:
 *   - Splitting 64-bit MOV/SEL into 2x32-bit where needed
 */
bool
brw_fs_lower_alu_restrictions(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         if (unsupported_64bit_type(devinfo, inst->dst.type)) {
            assert(inst->dst.type == inst->src[0].type);
            assert(!inst->saturate);
            assert(!inst->src[0].abs);
            assert(!inst->src[0].negate);
            const brw::fs_builder ibld(&s, block, inst);

            enum brw_reg_type type = brw_type_with_size(inst->dst.type, 32);

            if (!inst->is_partial_write())
               ibld.emit_undef_for_dst(inst);

            ibld.MOV(subscript(inst->dst, type, 1),
                     subscript(inst->src[0], type, 1));
            ibld.MOV(subscript(inst->dst, type, 0),
                     subscript(inst->src[0], type, 0));

            inst->remove(block);
            progress = true;
         }
         break;

      case BRW_OPCODE_SEL:
         if (unsupported_64bit_type(devinfo, inst->dst.type)) {
            assert(inst->dst.type == inst->src[0].type);
            assert(!inst->saturate);
            assert(!inst->src[0].abs && !inst->src[0].negate);
            assert(!inst->src[1].abs && !inst->src[1].negate);
            assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
            const brw::fs_builder ibld(&s, block, inst);

            enum brw_reg_type type = brw_type_with_size(inst->dst.type, 32);

            if (!inst->is_partial_write())
               ibld.emit_undef_for_dst(inst);

            set_predicate(inst->predicate,
                          ibld.SEL(subscript(inst->dst, type, 0),
                                   subscript(inst->src[0], type, 0),
                                   subscript(inst->src[1], type, 0)));
            set_predicate(inst->predicate,
                          ibld.SEL(subscript(inst->dst, type, 1),
                                   subscript(inst->src[0], type, 1),
                                   subscript(inst->src[1], type, 1)));

            inst->remove(block);
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   if (progress) {
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                            DEPENDENCY_INSTRUCTION_DETAIL);
   }

   return progress;
}

static void
brw_fs_lower_vgrf_to_fixed_grf(const struct intel_device_info *devinfo,
                               fs_inst *inst, brw_reg *reg, bool compressed)
{
   if (reg->file != VGRF)
      return;

   struct brw_reg new_reg;

   if (reg->stride == 0) {
      new_reg = brw_vec1_grf(reg->nr, 0);
   } else if (reg->stride > 4) {
      assert(reg != &inst->dst);
      assert(reg->stride * brw_type_size_bytes(reg->type) <= REG_SIZE);
      new_reg = brw_vecn_grf(1, reg->nr, 0);
      new_reg = stride(new_reg, reg->stride, 1, 0);
   } else {
      /* From the Haswell PRM:
       *
       *    "VertStride must be used to cross GRF register boundaries. This
       *    rule implies that elements within a 'Width' cannot cross GRF
       *    boundaries."
       *
       * The maximum width value that could satisfy this restriction is:
       */
      const unsigned reg_width =
         REG_SIZE / (reg->stride * brw_type_size_bytes(reg->type));

      /* Because the hardware can only split source regions at a whole
       * multiple of width during decompression (i.e. vertically), clamp
       * the value obtained above to the physical execution size of a
       * single decompressed chunk of the instruction:
       */
      const unsigned phys_width = compressed ? inst->exec_size / 2 :
                                  inst->exec_size;

      /* XXX - The equation above is strictly speaking not correct on
       *       hardware that supports unbalanced GRF writes -- On Gfx9+
       *       each decompressed chunk of the instruction may have a
       *       different execution size when the number of components
       *       written to each destination GRF is not the same.
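       *
       *       As a concrete, purely illustrative example of the resulting
       *       region: a stride-2 W-typed source with a 32-byte REG_SIZE
       *       gives reg_width = 32 / (2 * 2) = 8, and a compressed SIMD16
       *       instruction gives phys_width = 8, so the final region built
       *       below becomes <16;8,2>.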
       */
      const unsigned max_hw_width = 16;

      const unsigned width = MIN3(reg_width, phys_width, max_hw_width);
      new_reg = brw_vecn_grf(width, reg->nr, 0);
      new_reg = stride(new_reg, width * reg->stride, width, reg->stride);
   }

   new_reg = retype(new_reg, reg->type);
   new_reg = byte_offset(new_reg, reg->offset);
   new_reg.abs = reg->abs;
   new_reg.negate = reg->negate;

   *reg = new_reg;
}

void
brw_fs_lower_vgrfs_to_fixed_grfs(fs_visitor &s)
{
   assert(s.grf_used || !"Must be called after register allocation");

   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      /* If the instruction writes to more than one register, it needs to be
       * explicitly marked as compressed on Gen <= 5.  On Gen >= 6 the
       * hardware figures out by itself what the right compression mode is,
       * but we still need to know whether the instruction is compressed to
       * set up the source register regions appropriately.
       *
       * XXX - This is wrong for instructions that write a single register but
       *       read more than one which should strictly speaking be treated as
       *       compressed.  For instructions that don't write any registers it
       *       relies on the destination being a null register of the correct
       *       type and regioning so the instruction is considered compressed
       *       or not accordingly.
       */
      const bool compressed =
         inst->dst.component_size(inst->exec_size) > REG_SIZE;

      brw_fs_lower_vgrf_to_fixed_grf(s.devinfo, inst, &inst->dst, compressed);
      for (int i = 0; i < inst->sources; i++) {
         brw_fs_lower_vgrf_to_fixed_grf(s.devinfo, inst, &inst->src[i],
                                        compressed);
      }
   }

   s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                         DEPENDENCY_VARIABLES);
}

bool
brw_fs_lower_load_subgroup_invocation(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION)
         continue;

      const fs_builder abld =
         fs_builder(&s, block, inst).annotate("SubgroupInvocation");
      const fs_builder ubld8 = abld.group(8, 0).exec_all();
      ubld8.UNDEF(inst->dst);

      if (inst->exec_size == 8) {
         assert(inst->dst.type == BRW_TYPE_UD);
         brw_reg uw = retype(inst->dst, BRW_TYPE_UW);
         ubld8.MOV(uw, brw_imm_v(0x76543210));
         ubld8.MOV(inst->dst, uw);
      } else {
         assert(inst->dst.type == BRW_TYPE_UW);
         ubld8.MOV(inst->dst, brw_imm_v(0x76543210));
         ubld8.ADD(byte_offset(inst->dst, 16), inst->dst, brw_imm_uw(8u));
         if (inst->exec_size > 16) {
            const fs_builder ubld16 = abld.group(16, 0).exec_all();
            ubld16.ADD(byte_offset(inst->dst, 32), inst->dst, brw_imm_uw(16u));
         }
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

bool
brw_fs_lower_indirect_mov(fs_visitor &s)
{
   bool progress = false;

   if (s.devinfo->ver < 20)
      return progress;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT) {
         if (brw_type_size_bytes(inst->src[0].type) > 1 &&
             brw_type_size_bytes(inst->dst.type) > 1) {
            continue;
         }

         assert(brw_type_size_bytes(inst->src[0].type) ==
                brw_type_size_bytes(inst->dst.type));

         const fs_builder ibld(&s, block, inst);

         /* Extract unaligned part */
         uint16_t extra_offset = inst->src[0].offset & 0x1;
         brw_reg offset = ibld.ADD(inst->src[1], brw_imm_uw(extra_offset));

         /* Check if offset is odd or even so that we can choose either the
          * high or the low byte from the result.
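          *
          * Illustrative example: for a total byte offset of 5, the
          * word-aligned offset computed below is 4 and is_odd is 1, so the
          * high byte of the loaded word (i.e. byte 5) is selected.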
          */
         brw_reg is_odd = ibld.AND(offset, brw_imm_ud(1));

         /* Make sure offset is word (2 bytes) aligned */
         offset = ibld.AND(offset, brw_imm_uw(~1));

         /* Indirect addressing (Vx1 and VxH) is not supported with the UB/B
          * data types for Src0, so change the data type for src0 and dst
          * to UW.
          */
         brw_reg dst = ibld.vgrf(BRW_TYPE_UW);

         /* Subtract the unaligned offset from the src0 offset since we
          * already accounted for the unaligned part in the indirect byte
          * offset.
          */
         brw_reg start = retype(inst->src[0], BRW_TYPE_UW);
         start.offset &= ~extra_offset;

         /* Adjust the length to account for the extra offset. */
         assert(inst->src[2].file == IMM);
         brw_reg length = brw_imm_ud(inst->src[2].ud + extra_offset);

         ibld.emit(SHADER_OPCODE_MOV_INDIRECT, dst, start, offset, length);

         /* Select the high byte if the offset is odd, otherwise the low
          * byte.
          */
         brw_reg lo = ibld.AND(dst, brw_imm_uw(0xff));
         brw_reg hi = ibld.SHR(dst, brw_imm_uw(8));
         brw_reg result = ibld.vgrf(BRW_TYPE_UW);
         ibld.CSEL(result, hi, lo, is_odd, BRW_CONDITIONAL_NZ);

         /* Extra MOV needed here to convert back to the corresponding
          * B type.
          */
         ibld.MOV(inst->dst, result);

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}