/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This pass aggressively splits VGRFs into chunks as small as possible,
 * down to single registers if it can. If no VGRFs can be split, we return
 * false so this pass can safely be used inside an optimization loop. We
 * want to split because virtual GRFs are what we register allocate and
 * spill (due to contiguousness requirements for some instructions), and
 * they're what we naturally generate in the codegen process, but most
 * virtual GRFs don't actually need to be contiguous sets of GRFs. If we
 * split, we'll end up with reduced live intervals and better dead code
 * elimination and coalescing.
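 *
 * For example, a VGRF whose slots are only ever accessed one register unit
 * at a time can be split all the way down to unit-sized VGRFs, while a VGRF
 * that some instruction reads or writes as a contiguous multi-register
 * block keeps that block together in a single new VGRF.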
 */
bool
brw_fs_opt_split_virtual_grfs(fs_visitor &s)
{
   /* Compact the register file so we eliminate dead vgrfs. This only
    * defines split points for live registers, so oversized dead registers
    * would hit assertions later.
    */
   brw_fs_opt_compact_virtual_grfs(s);

   unsigned num_vars = s.alloc.count;

   /* Count the total number of register slots and record the first slot of
    * each VGRF.
    */
   unsigned reg_count = 0;
   unsigned *vgrf_to_reg = new unsigned[num_vars];
   for (unsigned i = 0; i < num_vars; i++) {
      vgrf_to_reg[i] = reg_count;
      reg_count += s.alloc.sizes[i];
   }

   /* An array of "split points". For each register slot, this indicates
    * if this slot can be separated from the previous slot. Every time an
    * instruction uses multiple elements of a register (as a source or
    * destination), we mark the used slots as inseparable. Then we go
    * through and split the registers into the smallest pieces we can.
    */
   bool *split_points = new bool[reg_count];
   memset(split_points, 0, reg_count * sizeof(*split_points));

   /* Start by marking every used register slot as splittable, at the
    * granularity of the physical register unit.
    */
   const unsigned reg_inc = reg_unit(s.devinfo);
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->dst.file == VGRF) {
         unsigned reg = vgrf_to_reg[inst->dst.nr];
         for (unsigned j = reg_inc; j < s.alloc.sizes[inst->dst.nr]; j += reg_inc)
            split_points[reg + j] = true;
      }

      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            unsigned reg = vgrf_to_reg[inst->src[i].nr];
            for (unsigned j = reg_inc; j < s.alloc.sizes[inst->src[i].nr]; j += reg_inc)
               split_points[reg + j] = true;
         }
      }
   }

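   /* Then clear the split points inside any region that a single
    * instruction reads or writes as a contiguous block, so that region is
    * never broken apart.
    */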
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      /* We fix up undef instructions later */
      if (inst->opcode == SHADER_OPCODE_UNDEF) {
         assert(inst->dst.file == VGRF);
         continue;
      }

      if (inst->dst.file == VGRF) {
         unsigned reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
         for (unsigned j = 1; j < regs_written(inst); j++)
            split_points[reg + j] = false;
      }
      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            unsigned reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
            for (unsigned j = 1; j < regs_read(inst, i); j++)
               split_points[reg + j] = false;
         }
      }
   }

   /* Per-VGRF flag recording whether that VGRF gets split below */
   bool *vgrf_has_split = new bool[num_vars];
   memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));

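   /* For each old register slot, the new VGRF it will belong to and its
    * offset (in whole registers) within that new VGRF.
    */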
   unsigned *new_virtual_grf = new unsigned[reg_count];
   unsigned *new_reg_offset = new unsigned[reg_count];

   unsigned reg = 0;
   bool has_splits = false;
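   /* Walk each VGRF's slots in order. At every split point, allocate a new
    * VGRF for the run of slots accumulated since the previous split and
    * record the per-slot mapping; the final run keeps the original VGRF
    * number.
    */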
   for (unsigned i = 0; i < num_vars; i++) {
      /* The first slot should never be a split point; quick sanity check. */
      assert(split_points[reg] == false);

      /* j = 0 case */
      new_reg_offset[reg] = 0;
      reg++;
      unsigned offset = 1;

      /* j > 0 case */
      for (unsigned j = 1; j < s.alloc.sizes[i]; j++) {
         /* If this is a split point, allocate a new virtual GRF for the
          * preceding "offset" registers and reset the offset to 0.
          */
         if (split_points[reg]) {
            has_splits = true;
            vgrf_has_split[i] = true;
            assert(offset <= MAX_VGRF_SIZE(s.devinfo));
            unsigned grf = s.alloc.allocate(offset);
            for (unsigned k = reg - offset; k < reg; k++)
               new_virtual_grf[k] = grf;
            offset = 0;
         }
         new_reg_offset[reg] = offset;
         offset++;
         reg++;
      }

      /* The last chunk keeps the original register number */
      assert(offset <= MAX_VGRF_SIZE(s.devinfo));
      s.alloc.sizes[i] = offset;
      for (unsigned k = reg - offset; k < reg; k++)
         new_virtual_grf[k] = i;
   }
   assert(reg == reg_count);

   bool progress;
   if (!has_splits) {
      progress = false;
      goto cleanup;
   }

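   /* Rewrite every instruction to reference the new, smaller VGRFs using
    * the per-slot mapping built above.
    */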
   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == SHADER_OPCODE_UNDEF) {
         assert(inst->dst.file == VGRF);
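         /* An UNDEF of a VGRF that was split is replaced with one UNDEF per
          * new VGRF covering the same range; an UNDEF of an unsplit VGRF is
          * left in place.
          */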
         if (vgrf_has_split[inst->dst.nr]) {
            const fs_builder ibld(&s, block, inst);
            assert(inst->size_written % REG_SIZE == 0);
            unsigned reg_offset = inst->dst.offset / REG_SIZE;
            unsigned size_written = 0;
            while (size_written < inst->size_written) {
               reg = vgrf_to_reg[inst->dst.nr] + reg_offset + size_written / REG_SIZE;
               fs_inst *undef =
                  ibld.UNDEF(
                     byte_offset(brw_vgrf(new_virtual_grf[reg], inst->dst.type),
                                 new_reg_offset[reg] * REG_SIZE));
               undef->size_written =
                  MIN2(inst->size_written - size_written, undef->size_written);
               assert(undef->size_written % REG_SIZE == 0);
               size_written += undef->size_written;
            }
            inst->remove(block);
         } else {
            reg = vgrf_to_reg[inst->dst.nr];
            assert(new_reg_offset[reg] == 0);
            assert(new_virtual_grf[reg] == inst->dst.nr);
         }
         continue;
      }

      if (inst->dst.file == VGRF) {
         reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
         if (vgrf_has_split[inst->dst.nr]) {
            inst->dst.nr = new_virtual_grf[reg];
            inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
                               inst->dst.offset % REG_SIZE;
            assert(new_reg_offset[reg] < s.alloc.sizes[new_virtual_grf[reg]]);
         } else {
            assert(new_reg_offset[reg] == inst->dst.offset / REG_SIZE);
            assert(new_virtual_grf[reg] == inst->dst.nr);
         }
      }
      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != VGRF)
            continue;

         reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
         if (vgrf_has_split[inst->src[i].nr]) {
            inst->src[i].nr = new_virtual_grf[reg];
            inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
                                  inst->src[i].offset % REG_SIZE;
            assert(new_reg_offset[reg] < s.alloc.sizes[new_virtual_grf[reg]]);
         } else {
            assert(new_reg_offset[reg] == inst->src[i].offset / REG_SIZE);
            assert(new_virtual_grf[reg] == inst->src[i].nr);
         }
      }
   }
   s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);

   progress = true;

cleanup:
   delete[] split_points;
   delete[] vgrf_has_split;
   delete[] new_virtual_grf;
   delete[] new_reg_offset;
   delete[] vgrf_to_reg;

   return progress;
}

/**
 * Remove unused virtual GRFs and compact the vgrf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again. Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs. Compacting them can save a lot of
 * overhead.
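 *
 * Returns true if any unused VGRF was found (and the remaining VGRFs were
 * renumbered), false otherwise.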
 */
bool
brw_fs_opt_compact_virtual_grfs(fs_visitor &s)
{
   bool progress = false;
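   /* remap_table[i] starts out as -1 (unused); it is set to 0 when VGRF i
    * is referenced by an instruction and later overwritten with i's new,
    * compacted index.
    */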
   int *remap_table = new int[s.alloc.count];
   memset(remap_table, -1, s.alloc.count * sizeof(int));

   /* Mark which virtual GRFs are used. */
   foreach_block_and_inst(block, const fs_inst, inst, s.cfg) {
      if (inst->dst.file == VGRF)
         remap_table[inst->dst.nr] = 0;

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            remap_table[inst->src[i].nr] = 0;
      }
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (unsigned i = 0; i < s.alloc.count; i++) {
      if (remap_table[i] == -1) {
         /* We just found an unused register. This means that we are
          * actually going to compact something.
          */
         progress = true;
      } else {
         remap_table[i] = new_index;
         s.alloc.sizes[new_index] = s.alloc.sizes[i];
         s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
         ++new_index;
      }
   }

   s.alloc.count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->dst.file == VGRF)
         inst->dst.nr = remap_table[inst->dst.nr];

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            inst->src[i].nr = remap_table[inst->src[i].nr];
      }
   }

   /* Patch all the references to delta_xy, since they're used in register
    * allocation. If they're unused, switch them to BAD_FILE so we don't
    * think some random VGRF is delta_xy.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(s.delta_xy); i++) {
      if (s.delta_xy[i].file == VGRF) {
         if (remap_table[s.delta_xy[i].nr] != -1) {
            s.delta_xy[i].nr = remap_table[s.delta_xy[i].nr];
         } else {
            s.delta_xy[i].file = BAD_FILE;
         }
      }
   }

   delete[] remap_table;

   return progress;
}