/*
 * Copyright © 2013 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_eu.h"
#include "intel_nir.h"
#include "brw_nir.h"
#include "brw_fs.h"
#include "brw_fs_builder.h"
#include "brw_private.h"
#include "dev/intel_debug.h"

using namespace brw;

/**
 * Return the number of patches to accumulate before a MULTI_PATCH mode thread is
 * launched. In cases with a large number of input control points and a large
 * amount of VS outputs, the VS URB space needed to store an entire 8 patches
 * worth of data can be prohibitive, so it can be beneficial to launch threads
 * early.
 *
 * See the 3DSTATE_HS::Patch Count Threshold documentation for the recommended
 * values. Note that 0 means to "disable" early dispatch, meaning to wait for
 * a full 8 patches as normal.
 */
static int
get_patch_count_threshold(int input_control_points)
{
   if (input_control_points <= 4)
      return 0;
   else if (input_control_points <= 6)
      return 5;
   else if (input_control_points <= 8)
      return 4;
   else if (input_control_points <= 10)
      return 3;
   else if (input_control_points <= 14)
      return 2;

   /* Return patch count 1 for PATCHLIST_15 - PATCHLIST_32 */
   return 1;
}

static void
brw_set_tcs_invocation_id(fs_visitor &s)
{
   const struct intel_device_info *devinfo = s.devinfo;
   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(s.prog_data);
   struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
   const fs_builder bld = fs_builder(&s).at_end();

   const unsigned instance_id_mask =
      (devinfo->verx10 >= 125) ? INTEL_MASK(7, 0) :
      (devinfo->ver >= 11)     ? INTEL_MASK(22, 16) :
                                 INTEL_MASK(23, 17);
   const unsigned instance_id_shift =
      (devinfo->verx10 >= 125) ? 0 : (devinfo->ver >= 11) ? 16 : 17;

   /* Get instance number from g0.2 bits:
    *  * 7:0 on DG2+
    *  * 22:16 on gfx11+
    *  * 23:17 otherwise
    */
   brw_reg t =
      bld.AND(brw_reg(retype(brw_vec1_grf(0, 2), BRW_TYPE_UD)),
              brw_imm_ud(instance_id_mask));

   if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH) {
      /* gl_InvocationID is just the thread number */
      s.invocation_id = bld.SHR(t, brw_imm_ud(instance_id_shift));
      return;
   }

   assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH);

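   /* Build a vector of channel indices: the UV immediate 0x76543210 packs
    * one 4-bit value per channel (channel N reads N), moved to UW first and
    * then widened to UD for the 32-bit math below.
    */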
   brw_reg channels_uw = bld.vgrf(BRW_TYPE_UW);
   brw_reg channels_ud = bld.vgrf(BRW_TYPE_UD);
   bld.MOV(channels_uw, brw_reg(brw_imm_uv(0x76543210)));
   bld.MOV(channels_ud, channels_uw);

   if (tcs_prog_data->instances == 1) {
      s.invocation_id = channels_ud;
   } else {
      /* instance_id = 8 * t + <76543210> */
      s.invocation_id =
         bld.ADD(bld.SHR(t, brw_imm_ud(instance_id_shift - 3)), channels_ud);
   }
}

static void
brw_emit_tcs_thread_end(fs_visitor &s)
{
   /* Try and tag the last URB write with EOT instead of emitting a whole
    * separate write just to finish the thread. There isn't guaranteed to
    * be one, so this may not succeed.
    */
   if (s.mark_last_urb_write_with_eot())
      return;

   const fs_builder bld = fs_builder(&s).at_end();

   /* Emit a URB write to end the thread. On Broadwell, we use this to write
    * zero to the "TR DS Cache Disable" bit (we haven't implemented a fancy
    * algorithm to set it optimally). On other platforms, we simply write
    * zero to a reserved/MBZ patch header DWord which has no consequence.
    */
   brw_reg srcs[URB_LOGICAL_NUM_SRCS];
   srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
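   /* Write a single dword (the X channel only); the logical URB write takes
    * its channel mask pre-shifted into the high 16 bits, matching the mask's
    * position in the hardware message header.
    */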
   srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(WRITEMASK_X << 16);
   srcs[URB_LOGICAL_SRC_DATA] = brw_imm_ud(0);
   srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1);
   fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
                            reg_undef, srcs, ARRAY_SIZE(srcs));
   inst->eot = true;
}

static void
brw_assign_tcs_urb_setup(fs_visitor &s)
{
   assert(s.stage == MESA_SHADER_TESS_CTRL);

   /* Rewrite all ATTR file references to HW_REGs. */
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      s.convert_attr_sources_to_hw_regs(inst);
   }
}

static bool
run_tcs(fs_visitor &s)
{
   assert(s.stage == MESA_SHADER_TESS_CTRL);

   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(s.prog_data);
   const fs_builder bld = fs_builder(&s).at_end();

   assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH ||
          vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);

   s.payload_ = new tcs_thread_payload(s);

   /* Initialize gl_InvocationID */
   brw_set_tcs_invocation_id(s);

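   /* SINGLE_PATCH dispatch always runs 8 channels per instance, even when
    * the patch has fewer output vertices than that. Wrap the shader body in
    * an IF so channels with invocation_id >= tcs_vertices_out are disabled
    * instead of writing stray outputs.
    */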
   const bool fix_dispatch_mask =
      vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH &&
      (s.nir->info.tess.tcs_vertices_out % 8) != 0;

   /* Fix the dispatch mask */
   if (fix_dispatch_mask) {
      bld.CMP(bld.null_reg_ud(), s.invocation_id,
              brw_imm_ud(s.nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L);
      bld.IF(BRW_PREDICATE_NORMAL);
   }

   nir_to_brw(&s);

   if (fix_dispatch_mask) {
      bld.emit(BRW_OPCODE_ENDIF);
   }

   brw_emit_tcs_thread_end(s);

   if (s.failed)
      return false;

   brw_calculate_cfg(s);

   brw_fs_optimize(s);

   s.assign_curb_setup();
   brw_assign_tcs_urb_setup(s);

   brw_fs_lower_3src_null_dest(s);
   brw_fs_workaround_memory_fence_before_eot(s);
   brw_fs_workaround_emit_dummy_mov_instruction(s);

   brw_allocate_registers(s, true /* allow_spilling */);

   return !s.failed;
}

extern "C" const unsigned *
brw_compile_tcs(const struct brw_compiler *compiler,
                struct brw_compile_tcs_params *params)
{
   const struct intel_device_info *devinfo = compiler->devinfo;
   nir_shader *nir = params->base.nir;
   const struct brw_tcs_prog_key *key = params->key;
   struct brw_tcs_prog_data *prog_data = params->prog_data;
   struct brw_vue_prog_data *vue_prog_data = &prog_data->base;

   const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TCS);

   vue_prog_data->base.stage = MESA_SHADER_TESS_CTRL;
   prog_data->base.base.ray_queries = nir->info.ray_queries;
   prog_data->base.base.total_scratch = 0;

   nir->info.outputs_written = key->outputs_written;
   nir->info.patch_outputs_written = key->patch_outputs_written;

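   /* The TCS reads per-vertex inputs laid out as the VS wrote them, so the
    * input map is derived from the VS output layout; the output map is the
    * tessellation-specific one, which separates per-patch slots from
    * per-vertex slots.
    */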
   struct intel_vue_map input_vue_map;
   brw_compute_vue_map(devinfo, &input_vue_map, nir->info.inputs_read,
                       nir->info.separate_shader, 1);
   brw_compute_tess_vue_map(&vue_prog_data->vue_map,
                            nir->info.outputs_written,
                            nir->info.patch_outputs_written);

   brw_nir_apply_key(nir, compiler, &key->base,
                     brw_geometry_stage_dispatch_width(compiler->devinfo));
   brw_nir_lower_vue_inputs(nir, &input_vue_map);
   brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map,
                             key->_tes_primitive_mode);
   if (key->input_vertices > 0)
      intel_nir_lower_patch_vertices_in(nir, key->input_vertices);

   brw_postprocess_nir(nir, compiler, debug_enabled,
                       key->base.robust_flags);

   bool has_primitive_id =
      BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);

   prog_data->patch_count_threshold = get_patch_count_threshold(key->input_vertices);

   if (compiler->use_tcs_multi_patch) {
      vue_prog_data->dispatch_mode = INTEL_DISPATCH_MODE_TCS_MULTI_PATCH;
      prog_data->instances = nir->info.tess.tcs_vertices_out;
      prog_data->include_primitive_id = has_primitive_id;
   } else {
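      /* Each SINGLE_PATCH instance covers 8 output vertices (one per
       * channel), so launch enough instances for all tcs_vertices_out.
       */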
      unsigned verts_per_thread = 8;
      vue_prog_data->dispatch_mode = INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH;
      prog_data->instances =
         DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, verts_per_thread);
   }

   /* Compute URB entry size. The maximum allowed URB entry size is 32k.
    * That divides up as follows:
    *
    *     32 bytes for the patch header (tessellation factors)
    *    480 bytes for per-patch varyings (a varying component is 4 bytes and
    *              gl_MaxTessPatchComponents = 120)
    *  16384 bytes for per-vertex varyings (a varying component is 4 bytes,
    *              gl_MaxPatchVertices = 32 and
    *              gl_MaxTessControlOutputComponents = 128)
    *
    *  15872 bytes left for varying packing overhead
    */
   const int num_per_patch_slots = vue_prog_data->vue_map.num_per_patch_slots;
   const int num_per_vertex_slots = vue_prog_data->vue_map.num_per_vertex_slots;
   unsigned output_size_bytes = 0;
   /* Note that the patch header is counted in num_per_patch_slots. */
   output_size_bytes += num_per_patch_slots * 16;
   output_size_bytes += nir->info.tess.tcs_vertices_out *
                        num_per_vertex_slots * 16;

   assert(output_size_bytes >= 1);
   if (output_size_bytes > GFX7_MAX_HS_URB_ENTRY_SIZE_BYTES)
      return NULL;

   /* URB entry sizes are stored as a multiple of 64 bytes. */
   vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64;

   /* HS does not use the usual payload pushing from URB to GRFs,
    * because we don't have enough registers for a full-size payload, and
    * the hardware is broken on Haswell anyway.
    */
   vue_prog_data->urb_read_length = 0;

   if (unlikely(debug_enabled)) {
      fprintf(stderr, "TCS Input ");
      brw_print_vue_map(stderr, &input_vue_map, MESA_SHADER_TESS_CTRL);
      fprintf(stderr, "TCS Output ");
      brw_print_vue_map(stderr, &vue_prog_data->vue_map, MESA_SHADER_TESS_CTRL);
   }

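   /* Xe2+ (ver >= 20) dispatches TCS threads SIMD16; earlier platforms use
    * SIMD8.
    */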
   const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
   fs_visitor v(compiler, &params->base, &key->base,
                &prog_data->base.base, nir, dispatch_width,
                params->base.stats != NULL, debug_enabled);
   if (!run_tcs(v)) {
      params->base.error_str =
         ralloc_strdup(params->base.mem_ctx, v.fail_msg);
      return NULL;
   }

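   /* The thread payload must end on a hardware register boundary; record its
    * size, in register units, as the dispatch GRF start for the generated
    * code.
    */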
   assert(v.payload().num_regs % reg_unit(devinfo) == 0);
   prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);

   fs_generator g(compiler, &params->base,
                  &prog_data->base.base, MESA_SHADER_TESS_CTRL);
   if (unlikely(debug_enabled)) {
      g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
                                     "%s tessellation control shader %s",
                                     nir->info.label ? nir->info.label
                                                     : "unnamed",
                                     nir->info.name));
   }

   g.generate_code(v.cfg, dispatch_width, v.shader_stats,
                   v.performance_analysis.require(), params->base.stats);

   g.add_const_data(nir->constant_data, nir->constant_data_size);

   return g.get_assembly();
}