xref: /aosp_15_r20/external/mesa3d/src/intel/compiler/brw_fs_opt_virtual_grfs.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2010 Intel Corporation
3  * SPDX-License-Identifier: MIT
4  */
5 
#include <cstdint>
#include <vector>

#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;
10 
11 /**
12  * Split large virtual GRFs into separate components if we can.
13  *
14  * This pass aggressively splits VGRFs into as small a chunks as possible,
15  * down to single registers if it can.  If no VGRFs can be split, we return
16  * false so this pass can safely be used inside an optimization loop.  We
17  * want to split, because virtual GRFs are what we register allocate and
18  * spill (due to contiguousness requirements for some instructions), and
19  * they're what we naturally generate in the codegen process, but most
20  * virtual GRFs don't actually need to be contiguous sets of GRFs.  If we
21  * split, we'll end up with reduced live intervals and better dead code
22  * elimination and coalescing.
23  */
24 bool
brw_fs_opt_split_virtual_grfs(fs_visitor & s)25 brw_fs_opt_split_virtual_grfs(fs_visitor &s)
26 {
27    /* Compact the register file so we eliminate dead vgrfs.  This
28     * only defines split points for live registers, so if we have
29     * too large dead registers they will hit assertions later.
30     */
31    brw_fs_opt_compact_virtual_grfs(s);
32 
33    unsigned num_vars = s.alloc.count;
34 
35    /* Count the total number of registers */
36    unsigned reg_count = 0;
37    unsigned *vgrf_to_reg = new unsigned[num_vars];
38    for (unsigned i = 0; i < num_vars; i++) {
39       vgrf_to_reg[i] = reg_count;
40       reg_count += s.alloc.sizes[i];
41    }
42 
43    /* An array of "split points".  For each register slot, this indicates
44     * if this slot can be separated from the previous slot.  Every time an
45     * instruction uses multiple elements of a register (as a source or
46     * destination), we mark the used slots as inseparable.  Then we go
47     * through and split the registers into the smallest pieces we can.
48     */
49    bool *split_points = new bool[reg_count];
50    memset(split_points, 0, reg_count * sizeof(*split_points));
51 
52    /* Mark all used registers as fully splittable following the physical
53     * register size.
54     */
55    const unsigned reg_inc = reg_unit(s.devinfo);
56    foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
57       if (inst->dst.file == VGRF) {
58          unsigned reg = vgrf_to_reg[inst->dst.nr];
59          for (unsigned j = reg_inc; j < s.alloc.sizes[inst->dst.nr]; j += reg_inc)
60             split_points[reg + j] = true;
61       }
62 
63       for (unsigned i = 0; i < inst->sources; i++) {
64          if (inst->src[i].file == VGRF) {
65             unsigned reg = vgrf_to_reg[inst->src[i].nr];
66             for (unsigned j = reg_inc; j < s.alloc.sizes[inst->src[i].nr]; j += reg_inc)
67                split_points[reg + j] = true;
68          }
69       }
70    }
71 
72    foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
73       /* We fix up undef instructions later */
74       if (inst->opcode == SHADER_OPCODE_UNDEF) {
75          assert(inst->dst.file == VGRF);
76          continue;
77       }
78 
79       if (inst->dst.file == VGRF) {
80          unsigned reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
81          for (unsigned j = 1; j < regs_written(inst); j++)
82             split_points[reg + j] = false;
83       }
84       for (unsigned i = 0; i < inst->sources; i++) {
85          if (inst->src[i].file == VGRF) {
86             unsigned reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
87             for (unsigned j = 1; j < regs_read(inst, i); j++)
88                split_points[reg + j] = false;
89          }
90       }
91    }
92 
93    /* Bitset of which registers have been split */
94    bool *vgrf_has_split = new bool[num_vars];
95    memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
96 
97    unsigned *new_virtual_grf = new unsigned[reg_count];
98    unsigned *new_reg_offset = new unsigned[reg_count];
99 
100    unsigned reg = 0;
101    bool has_splits = false;
102    for (unsigned i = 0; i < num_vars; i++) {
103       /* The first one should always be 0 as a quick sanity check. */
104       assert(split_points[reg] == false);
105 
106       /* j = 0 case */
107       new_reg_offset[reg] = 0;
108       reg++;
109       unsigned offset = 1;
110 
111       /* j > 0 case */
112       for (unsigned j = 1; j < s.alloc.sizes[i]; j++) {
113          /* If this is a split point, reset the offset to 0 and allocate a
114           * new virtual GRF for the previous offset many registers
115           */
116          if (split_points[reg]) {
117             has_splits = true;
118             vgrf_has_split[i] = true;
119             assert(offset <= MAX_VGRF_SIZE(s.devinfo));
120             unsigned grf = s.alloc.allocate(offset);
121             for (unsigned k = reg - offset; k < reg; k++)
122                new_virtual_grf[k] = grf;
123             offset = 0;
124          }
125          new_reg_offset[reg] = offset;
126          offset++;
127          reg++;
128       }
129 
130       /* The last one gets the original register number */
131       assert(offset <= MAX_VGRF_SIZE(s.devinfo));
132       s.alloc.sizes[i] = offset;
133       for (unsigned k = reg - offset; k < reg; k++)
134          new_virtual_grf[k] = i;
135    }
136    assert(reg == reg_count);
137 
138    bool progress;
139    if (!has_splits) {
140       progress = false;
141       goto cleanup;
142    }
143 
144    foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
145       if (inst->opcode == SHADER_OPCODE_UNDEF) {
146          assert(inst->dst.file == VGRF);
147          if (vgrf_has_split[inst->dst.nr]) {
148             const fs_builder ibld(&s, block, inst);
149             assert(inst->size_written % REG_SIZE == 0);
150             unsigned reg_offset = inst->dst.offset / REG_SIZE;
151             unsigned size_written = 0;
152             while (size_written < inst->size_written) {
153                reg = vgrf_to_reg[inst->dst.nr] + reg_offset + size_written / REG_SIZE;
154                fs_inst *undef =
155                   ibld.UNDEF(
156                      byte_offset(brw_vgrf(new_virtual_grf[reg], inst->dst.type),
157                                  new_reg_offset[reg] * REG_SIZE));
158                undef->size_written =
159                   MIN2(inst->size_written - size_written, undef->size_written);
160                assert(undef->size_written % REG_SIZE == 0);
161                size_written += undef->size_written;
162             }
163             inst->remove(block);
164          } else {
165             reg = vgrf_to_reg[inst->dst.nr];
166             assert(new_reg_offset[reg] == 0);
167             assert(new_virtual_grf[reg] == inst->dst.nr);
168          }
169          continue;
170       }
171 
172       if (inst->dst.file == VGRF) {
173          reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
174          if (vgrf_has_split[inst->dst.nr]) {
175             inst->dst.nr = new_virtual_grf[reg];
176             inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
177                                inst->dst.offset % REG_SIZE;
178             assert(new_reg_offset[reg] < s.alloc.sizes[new_virtual_grf[reg]]);
179          } else {
180             assert(new_reg_offset[reg] == inst->dst.offset / REG_SIZE);
181             assert(new_virtual_grf[reg] == inst->dst.nr);
182          }
183       }
184       for (unsigned i = 0; i < inst->sources; i++) {
185 	 if (inst->src[i].file != VGRF)
186             continue;
187 
188          reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
189          if (vgrf_has_split[inst->src[i].nr]) {
190             inst->src[i].nr = new_virtual_grf[reg];
191             inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
192                                   inst->src[i].offset % REG_SIZE;
193             assert(new_reg_offset[reg] < s.alloc.sizes[new_virtual_grf[reg]]);
194          } else {
195             assert(new_reg_offset[reg] == inst->src[i].offset / REG_SIZE);
196             assert(new_virtual_grf[reg] == inst->src[i].nr);
197          }
198       }
199    }
200    s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
201 
202    progress = true;
203 
204 cleanup:
205    delete[] split_points;
206    delete[] vgrf_has_split;
207    delete[] new_virtual_grf;
208    delete[] new_reg_offset;
209    delete[] vgrf_to_reg;
210 
211    return progress;
212 }
213 
214 /**
215  * Remove unused virtual GRFs and compact the vgrf_* arrays.
216  *
217  * During code generation, we create tons of temporary variables, many of
218  * which get immediately killed and are never used again.  Yet, in later
219  * optimization and analysis passes, such as compute_live_intervals, we need
220  * to loop over all the virtual GRFs.  Compacting them can save a lot of
221  * overhead.
222  */
223 bool
brw_fs_opt_compact_virtual_grfs(fs_visitor & s)224 brw_fs_opt_compact_virtual_grfs(fs_visitor &s)
225 {
226    bool progress = false;
227    int *remap_table = new int[s.alloc.count];
228    memset(remap_table, -1, s.alloc.count * sizeof(int));
229 
230    /* Mark which virtual GRFs are used. */
231    foreach_block_and_inst(block, const fs_inst, inst, s.cfg) {
232       if (inst->dst.file == VGRF)
233          remap_table[inst->dst.nr] = 0;
234 
235       for (int i = 0; i < inst->sources; i++) {
236          if (inst->src[i].file == VGRF)
237             remap_table[inst->src[i].nr] = 0;
238       }
239    }
240 
241    /* Compact the GRF arrays. */
242    int new_index = 0;
243    for (unsigned i = 0; i < s.alloc.count; i++) {
244       if (remap_table[i] == -1) {
245          /* We just found an unused register.  This means that we are
246           * actually going to compact something.
247           */
248          progress = true;
249       } else {
250          remap_table[i] = new_index;
251          s.alloc.sizes[new_index] = s.alloc.sizes[i];
252          s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
253          ++new_index;
254       }
255    }
256 
257    s.alloc.count = new_index;
258 
259    /* Patch all the instructions to use the newly renumbered registers */
260    foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
261       if (inst->dst.file == VGRF)
262          inst->dst.nr = remap_table[inst->dst.nr];
263 
264       for (int i = 0; i < inst->sources; i++) {
265          if (inst->src[i].file == VGRF)
266             inst->src[i].nr = remap_table[inst->src[i].nr];
267       }
268    }
269 
270    /* Patch all the references to delta_xy, since they're used in register
271     * allocation.  If they're unused, switch them to BAD_FILE so we don't
272     * think some random VGRF is delta_xy.
273     */
274    for (unsigned i = 0; i < ARRAY_SIZE(s.delta_xy); i++) {
275       if (s.delta_xy[i].file == VGRF) {
276          if (remap_table[s.delta_xy[i].nr] != -1) {
277             s.delta_xy[i].nr = remap_table[s.delta_xy[i].nr];
278          } else {
279             s.delta_xy[i].file = BAD_FILE;
280          }
281       }
282    }
283 
284    delete[] remap_table;
285 
286    return progress;
287 }
288 
289 
290