Discussion:
[Mesa-dev] [PATCH v2 1/4] nir: add support for marking used patches when packing varyings
Timothy Arceri
2018-12-10 00:30:58 UTC
Permalink
This adds support needed for marking the varyings as used but we
don't actually support packing patches in this patch.
---
src/compiler/nir/nir_linking_helpers.c | 73 ++++++++++++++++++--------
1 file changed, 51 insertions(+), 22 deletions(-)

diff --git a/src/compiler/nir/nir_linking_helpers.c b/src/compiler/nir/nir_linking_helpers.c
index a05890ada4..845aba5c87 100644
--- a/src/compiler/nir/nir_linking_helpers.c
+++ b/src/compiler/nir/nir_linking_helpers.c
@@ -289,15 +289,35 @@ struct varying_loc
uint32_t location;
};

+static void
+mark_all_slots_used(nir_variable *var, uint64_t *slots_used,
+ uint64_t slots_used_mask, unsigned num_slots)
+{
+ unsigned loc_offset = var->data.patch ? VARYING_SLOT_PATCH0 : 0;
+
+ slots_used[var->data.patch ? 1 : 0] |= slots_used_mask &
+ (((uint64_t)1 << num_slots) - 1) << (var->data.location - loc_offset);
+}
+
+static void
+mark_used_slots(nir_variable *var, uint64_t *slots_used, unsigned offset)
+{
+ unsigned loc_offset = offset - (var->data.patch ? VARYING_SLOT_PATCH0 : 0);
+
+ slots_used[var->data.patch ? 1 : 0] |= (uint64_t)1 << (var->data.location + loc_offset);
+}
+
static void
remap_slots_and_components(struct exec_list *var_list, gl_shader_stage stage,
struct varying_loc (*remap)[4],
- uint64_t *slots_used, uint64_t *out_slots_read)
+ uint64_t *slots_used, uint64_t *out_slots_read,
+ uint32_t *p_slots_used, uint32_t *p_out_slots_read)
{
- uint64_t out_slots_read_tmp = 0;
+ uint64_t out_slots_read_tmp[2] = {0};
+ uint64_t slots_used_tmp[2] = {0};

/* We don't touch builtins so just copy the bitmask */
- uint64_t slots_used_tmp =
+ slots_used_tmp[0] =
*slots_used & (((uint64_t)1 << (VARYING_SLOT_VAR0 - 1)) - 1);

nir_foreach_variable(var, var_list) {
@@ -305,8 +325,8 @@ remap_slots_and_components(struct exec_list *var_list, gl_shader_stage stage,

/* Only remap things that aren't built-ins */
if (var->data.location >= VARYING_SLOT_VAR0 &&
- var->data.location - VARYING_SLOT_VAR0 < 32) {
- assert(var->data.location - VARYING_SLOT_VAR0 < 32);
+ var->data.location - VARYING_SLOT_VAR0 < MAX_VARYINGS_INCL_PATCH) {
+ assert(var->data.location - VARYING_SLOT_VAR0 < MAX_VARYINGS_INCL_PATCH);

const struct glsl_type *type = var->type;
if (nir_is_per_vertex_io(var, stage)) {
@@ -321,11 +341,17 @@ remap_slots_and_components(struct exec_list *var_list, gl_shader_stage stage,
unsigned location = var->data.location - VARYING_SLOT_VAR0;
struct varying_loc *new_loc = &remap[location][var->data.location_frac];

- uint64_t slots = (((uint64_t)1 << num_slots) - 1) << var->data.location;
- if (slots & *slots_used)
+ unsigned loc_offset = var->data.patch ? VARYING_SLOT_PATCH0 : 0;
+ uint64_t used = var->data.patch ? *p_slots_used : *slots_used;
+ uint64_t outs_used =
+ var->data.patch ? *p_out_slots_read : *out_slots_read;
+ uint64_t slots =
+ (((uint64_t)1 << num_slots) - 1) << (var->data.location - loc_offset);
+
+ if (slots & used)
used_across_stages = true;

- if (slots & *out_slots_read)
+ if (slots & outs_used)
outputs_read = true;

if (new_loc->location) {
@@ -339,30 +365,29 @@ remap_slots_and_components(struct exec_list *var_list, gl_shader_stage stage,
* otherwise we will mess up the mask for things like partially
* marked arrays.
*/
- if (used_across_stages) {
- slots_used_tmp |=
- *slots_used & (((uint64_t)1 << num_slots) - 1) << var->data.location;
- }
+ if (used_across_stages)
+ mark_all_slots_used(var, slots_used_tmp, used, num_slots);

if (outputs_read) {
- out_slots_read_tmp |=
- *out_slots_read & (((uint64_t)1 << num_slots) - 1) << var->data.location;
+ mark_all_slots_used(var, out_slots_read_tmp, outs_used,
+ num_slots);
}
-
} else {
for (unsigned i = 0; i < num_slots; i++) {
if (used_across_stages)
- slots_used_tmp |= (uint64_t)1 << (var->data.location + i);
+ mark_used_slots(var, slots_used_tmp, i);

if (outputs_read)
- out_slots_read_tmp |= (uint64_t)1 << (var->data.location + i);
+ mark_used_slots(var, out_slots_read_tmp, i);
}
}
}
}

- *slots_used = slots_used_tmp;
- *out_slots_read = out_slots_read_tmp;
+ *slots_used = slots_used_tmp[0];
+ *out_slots_read = out_slots_read_tmp[0];
+ *p_slots_used = slots_used_tmp[1];
+ *p_out_slots_read = out_slots_read_tmp[1];
}

/* If there are empty components in the slot compact the remaining components
@@ -376,7 +401,7 @@ compact_components(nir_shader *producer, nir_shader *consumer, uint8_t *comps,
{
struct exec_list *input_list = &consumer->inputs;
struct exec_list *output_list = &producer->outputs;
- struct varying_loc remap[32][4] = {{{0}, {0}}};
+ struct varying_loc remap[MAX_VARYINGS_INCL_PATCH][4] = {{{0}, {0}}};

/* Create a cursor for each interpolation type */
unsigned cursor[4] = {0};
@@ -487,11 +512,15 @@ compact_components(nir_shader *producer, nir_shader *consumer, uint8_t *comps,
}

uint64_t zero = 0;
+ uint32_t zero32 = 0;
remap_slots_and_components(input_list, consumer->info.stage, remap,
- &consumer->info.inputs_read, &zero);
+ &consumer->info.inputs_read, &zero,
+ &consumer->info.patch_inputs_read, &zero32);
remap_slots_and_components(output_list, producer->info.stage, remap,
&producer->info.outputs_written,
- &producer->info.outputs_read);
+ &producer->info.outputs_read,
+ &producer->info.patch_outputs_written,
+ &producer->info.patch_outputs_read);
}

/* We assume that this has been called more-or-less directly after
--
2.19.2
Timothy Arceri
2018-12-10 00:31:01 UTC
Permalink
vkpipeline-db results RADV (VEGA):

Totals from affected shaders:
SGPRS: 27168 -> 27872 (2.59 %)
VGPRS: 24180 -> 24056 (-0.51 %)
Spilled SGPRs: 28 -> 24 (-14.29 %)
Spilled VGPRs: 0 -> 0 (0.00 %)
Private memory VGPRs: 0 -> 0 (0.00 %)
Scratch size: 0 -> 0 (0.00 %) dwords per thread
Code Size: 1584936 -> 1585552 (0.04 %) bytes
LDS: 0 -> 0 (0.00 %) blocks
Max Waves: 3804 -> 3824 (0.53 %)
Wait states: 0 -> 0 (0.00 %)
---
src/compiler/nir/nir_linking_helpers.c | 45 ++++++++++++++++++++++++++
1 file changed, 45 insertions(+)

diff --git a/src/compiler/nir/nir_linking_helpers.c b/src/compiler/nir/nir_linking_helpers.c
index 8bd4acc2ee..badda80979 100644
--- a/src/compiler/nir/nir_linking_helpers.c
+++ b/src/compiler/nir/nir_linking_helpers.c
@@ -415,6 +415,8 @@ remap_slots_and_components(struct exec_list *var_list, gl_shader_stage stage,

struct varying_component {
nir_variable *var;
+ unsigned first_block_use;
+ unsigned last_block_use;
uint8_t interp_type;
uint8_t interp_loc;
bool is_patch;
@@ -441,10 +443,36 @@ cmp_varying_component(const void *comp1_v, const void *comp2_v)
if (comp1->interp_loc != comp2->interp_loc)
return comp1->interp_loc - comp2->interp_loc;

+ /* Attempt to reduce register pressure with crude live range analysis */
+ if (comp1->first_block_use != comp2->first_block_use)
+ return comp1->first_block_use - comp2->first_block_use;
+ if (comp1->last_block_use != comp2->last_block_use)
+ return comp1->last_block_use - comp2->last_block_use;
+
/* If everything else matches just use the original location to sort */
return comp1->var->data.location - comp2->var->data.location;
}

+static void
+set_block_use(struct varying_component *vc_info, nir_src *src,
+ bool is_if_condition)
+{
+ nir_block *blk
+ = nir_cursor_current_block(nir_before_src(src, is_if_condition));
+
+ if (vc_info->initialised) {
+ if (vc_info->first_block_use > blk->index)
+ vc_info->first_block_use = blk->index;
+
+ if (vc_info->last_block_use < blk->index)
+ vc_info->last_block_use = blk->index;
+ } else {
+ vc_info->first_block_use = blk->index;
+ vc_info->last_block_use = blk->index;
+ vc_info->initialised = true;
+ }
+}
+
static void
gather_varying_component_info(nir_shader *consumer,
struct varying_component **varying_comp_info,
@@ -533,6 +561,14 @@ gather_varying_component_info(nir_shader *consumer,
vc_info->interp_loc = get_interp_loc(in_var);
vc_info->is_patch = in_var->data.patch;
}
+
+ nir_foreach_use(src, &intr->dest.ssa) {
+ set_block_use(vc_info, src, false);
+ }
+
+ nir_foreach_if_use(src, &intr->dest.ssa) {
+ set_block_use(vc_info, src, true);
+ }
}
}
}
@@ -651,6 +687,12 @@ void
nir_compact_varyings(nir_shader *producer, nir_shader *consumer,
bool default_to_smooth_interp)
{
+ nir_function_impl *p_impl = nir_shader_get_entrypoint(producer);
+ nir_function_impl *c_impl = nir_shader_get_entrypoint(consumer);
+
+ nir_metadata_require(p_impl, nir_metadata_block_index);
+ nir_metadata_require(c_impl, nir_metadata_block_index);
+
assert(producer->info.stage != MESA_SHADER_FRAGMENT);
assert(consumer->info.stage != MESA_SHADER_VERTEX);

@@ -665,6 +707,9 @@ nir_compact_varyings(nir_shader *producer, nir_shader *consumer,

compact_components(producer, consumer, unmoveable_comps,
default_to_smooth_interp);
+
+ nir_metadata_preserve(p_impl, nir_metadata_block_index);
+ nir_metadata_preserve(c_impl, nir_metadata_block_index);
}

/*
--
2.19.2
Timothy Arceri
2018-12-10 01:17:50 UTC
Permalink
Using Rob's packing fix for the st I'm actually getting results for
radeonsi now but they are pretty mixed for this patch:

Totals from affected shaders:
SGPRS: 35992 -> 35520 (-1.31 %)
VGPRS: 20688 -> 20808 (0.58 %)
Spilled SGPRs: 1926 -> 1996 (3.63 %)
Spilled VGPRs: 0 -> 0 (0.00 %)
Private memory VGPRs: 0 -> 0 (0.00 %)
Scratch size: 0 -> 0 (0.00 %) dwords per thread
Code Size: 1053168 -> 1055452 (0.22 %) bytes
LDS: 0 -> 0 (0.00 %) blocks
Max Waves: 4636 -> 4616 (-0.43 %)
Wait states: 0 -> 0 (0.00 %)
Post by Timothy Arceri
SGPRS: 27168 -> 27872 (2.59 %)
VGPRS: 24180 -> 24056 (-0.51 %)
Spilled SGPRs: 28 -> 24 (-14.29 %)
Spilled VGPRs: 0 -> 0 (0.00 %)
Private memory VGPRs: 0 -> 0 (0.00 %)
Scratch size: 0 -> 0 (0.00 %) dwords per thread
Code Size: 1584936 -> 1585552 (0.04 %) bytes
LDS: 0 -> 0 (0.00 %) blocks
Max Waves: 3804 -> 3824 (0.53 %)
Wait states: 0 -> 0 (0.00 %)
---
src/compiler/nir/nir_linking_helpers.c | 45 ++++++++++++++++++++++++++
1 file changed, 45 insertions(+)
diff --git a/src/compiler/nir/nir_linking_helpers.c b/src/compiler/nir/nir_linking_helpers.c
index 8bd4acc2ee..badda80979 100644
--- a/src/compiler/nir/nir_linking_helpers.c
+++ b/src/compiler/nir/nir_linking_helpers.c
@@ -415,6 +415,8 @@ remap_slots_and_components(struct exec_list *var_list, gl_shader_stage stage,
struct varying_component {
nir_variable *var;
+ unsigned first_block_use;
+ unsigned last_block_use;
uint8_t interp_type;
uint8_t interp_loc;
bool is_patch;
@@ -441,10 +443,36 @@ cmp_varying_component(const void *comp1_v, const void *comp2_v)
if (comp1->interp_loc != comp2->interp_loc)
return comp1->interp_loc - comp2->interp_loc;
+ /* Attempt to reduce register pressure with crude live range analysis */
+ if (comp1->first_block_use != comp2->first_block_use)
+ return comp1->first_block_use - comp2->first_block_use;
+ if (comp1->last_block_use != comp2->last_block_use)
+ return comp1->last_block_use - comp2->last_block_use;
+
/* If everything else matches just use the original location to sort */
return comp1->var->data.location - comp2->var->data.location;
}
+static void
+set_block_use(struct varying_component *vc_info, nir_src *src,
+ bool is_if_condition)
+{
+ nir_block *blk
+ = nir_cursor_current_block(nir_before_src(src, is_if_condition));
+
+ if (vc_info->initialised) {
+ if (vc_info->first_block_use > blk->index)
+ vc_info->first_block_use = blk->index;
+
+ if (vc_info->last_block_use < blk->index)
+ vc_info->last_block_use = blk->index;
+ } else {
+ vc_info->first_block_use = blk->index;
+ vc_info->last_block_use = blk->index;
+ vc_info->initialised = true;
+ }
+}
+
static void
gather_varying_component_info(nir_shader *consumer,
struct varying_component **varying_comp_info,
@@ -533,6 +561,14 @@ gather_varying_component_info(nir_shader *consumer,
vc_info->interp_loc = get_interp_loc(in_var);
vc_info->is_patch = in_var->data.patch;
}
+
+ nir_foreach_use(src, &intr->dest.ssa) {
+ set_block_use(vc_info, src, false);
+ }
+
+ nir_foreach_if_use(src, &intr->dest.ssa) {
+ set_block_use(vc_info, src, true);
+ }
}
}
}
@@ -651,6 +687,12 @@ void
nir_compact_varyings(nir_shader *producer, nir_shader *consumer,
bool default_to_smooth_interp)
{
+ nir_function_impl *p_impl = nir_shader_get_entrypoint(producer);
+ nir_function_impl *c_impl = nir_shader_get_entrypoint(consumer);
+
+ nir_metadata_require(p_impl, nir_metadata_block_index);
+ nir_metadata_require(c_impl, nir_metadata_block_index);
+
assert(producer->info.stage != MESA_SHADER_FRAGMENT);
assert(consumer->info.stage != MESA_SHADER_VERTEX);
@@ -665,6 +707,9 @@ nir_compact_varyings(nir_shader *producer, nir_shader *consumer,
compact_components(producer, consumer, unmoveable_comps,
default_to_smooth_interp);
+
+ nir_metadata_preserve(p_impl, nir_metadata_block_index);
+ nir_metadata_preserve(c_impl, nir_metadata_block_index);
}
/*
Timothy Arceri
2018-12-10 00:31:00 UTC
Permalink
There are three reasons for the rewrite.

1. Adding support for packing tess patch varyings in a sane way.

2. Making use of qsort allowing the code to be much easier to
follow.

3. Allowing us to add a crude live range analysis for deciding
which components should be packed together. This support will
be added in a future patch.

v2: pack moveable components with the unmoveable components. The
new pass is now functionally the same as the old pass besides
the new support for packing patches.
---
src/compiler/nir/nir_linking_helpers.c | 305 ++++++++++++++++---------
1 file changed, 196 insertions(+), 109 deletions(-)

diff --git a/src/compiler/nir/nir_linking_helpers.c b/src/compiler/nir/nir_linking_helpers.c
index c26582ddec..8bd4acc2ee 100644
--- a/src/compiler/nir/nir_linking_helpers.c
+++ b/src/compiler/nir/nir_linking_helpers.c
@@ -247,22 +247,20 @@ is_packing_supported_for_type(const struct glsl_type *type)
return true;
}

+/* Packing arrays and dual slot varyings is difficult so to avoid complex
+ * algorithms this function marks these components as unmoveable.
+ */
static void
-get_slot_component_masks_and_interp_types(struct exec_list *var_list,
- uint8_t *comps,
- uint8_t *interp_type,
- uint8_t *interp_loc,
- gl_shader_stage stage,
- bool default_to_smooth_interp)
+get_unmoveable_components_masks(struct exec_list *var_list, uint8_t *comps,
+ gl_shader_stage stage,
+ bool default_to_smooth_interp)
{
nir_foreach_variable_safe(var, var_list) {
assert(var->data.location >= 0);

- /* Only remap things that aren't built-ins.
- * TODO: add TES patch support.
- */
+ /* Only remap things that aren't built-ins. */
if (var->data.location >= VARYING_SLOT_VAR0 &&
- var->data.location - VARYING_SLOT_VAR0 < 32) {
+ var->data.location - VARYING_SLOT_VAR0 < MAX_VARYINGS_INCL_PATCH) {

const struct glsl_type *type = var->type;
if (nir_is_per_vertex_io(var, stage)) {
@@ -270,6 +268,12 @@ get_slot_component_masks_and_interp_types(struct exec_list *var_list,
type = glsl_get_array_element(type);
}

+ /* If we can pack this varying then don't mark the components as
+ * used.
+ */
+ if (is_packing_supported_for_type(type))
+ continue;
+
unsigned location = var->data.location - VARYING_SLOT_VAR0;
unsigned elements =
glsl_get_vector_elements(glsl_without_array(type));
@@ -278,10 +282,6 @@ get_slot_component_masks_and_interp_types(struct exec_list *var_list,
unsigned slots = glsl_count_attribute_slots(type, false);
unsigned comps_slot2 = 0;
for (unsigned i = 0; i < slots; i++) {
- interp_type[location + i] =
- get_interp_type(var, type, default_to_smooth_interp);
- interp_loc[location + i] = get_interp_loc(var);
-
if (dual_slot) {
if (i & 1) {
comps[location + i] |= ((1 << comps_slot2) - 1);
@@ -413,32 +413,55 @@ remap_slots_and_components(struct exec_list *var_list, gl_shader_stage stage,
*p_out_slots_read = out_slots_read_tmp[1];
}

-/* If there are empty components in the slot compact the remaining components
- * as close to component 0 as possible. This will make it easier to fill the
- * empty components with components from a different slot in a following pass.
- */
-static void
-compact_components(nir_shader *producer, nir_shader *consumer, uint8_t *comps,
- uint8_t *interp_type, uint8_t *interp_loc,
- bool default_to_smooth_interp)
+struct varying_component {
+ nir_variable *var;
+ uint8_t interp_type;
+ uint8_t interp_loc;
+ bool is_patch;
+ bool initialised;
+};
+
+static int
+cmp_varying_component(const void *comp1_v, const void *comp2_v)
{
- struct exec_list *input_list = &consumer->inputs;
- struct exec_list *output_list = &producer->outputs;
- struct varying_loc remap[MAX_VARYINGS_INCL_PATCH][4] = {{{0}, {0}}};
+ struct varying_component *comp1 = (struct varying_component *) comp1_v;
+ struct varying_component *comp2 = (struct varying_component *) comp2_v;

- /* Create a cursor for each interpolation type */
- unsigned cursor[4] = {0};
+ /* We want patches to be ordered at the end of the array */
+ if (comp1->is_patch != comp2->is_patch)
+ return comp1->is_patch ? 1 : -1;

- /* We only need to pass over one stage and we choose the consumer as it seems
- * to cause a larger reduction in instruction counts (tested on i965).
+ /* We can only pack varyings with matching interpolation types so group
+ * them together.
*/
- nir_foreach_variable(var, input_list) {
+ if (comp1->interp_type != comp2->interp_type)
+ return comp1->interp_type - comp2->interp_type;
+
+ /* Interpolation loc must match also. */
+ if (comp1->interp_loc != comp2->interp_loc)
+ return comp1->interp_loc - comp2->interp_loc;
+
+ /* If everything else matches just use the original location to sort */
+ return comp1->var->data.location - comp2->var->data.location;
+}
+
+static void
+gather_varying_component_info(nir_shader *consumer,
+ struct varying_component **varying_comp_info,
+ unsigned *varying_comp_info_size,
+ bool default_to_smooth_interp)
+{
+ unsigned store_varying_info_idx[MAX_VARYINGS_INCL_PATCH][4] = {0};
+ unsigned num_of_comps_to_pack = 0;

- /* Only remap things that aren't builtins.
- * TODO: add TES patch support.
- */
+ /* Count the number of varyings that can be packed and create a mapping
+ * of those varyings to the array we will pass to qsort.
+ */
+ nir_foreach_variable(var, &consumer->inputs) {
+
+ /* Only remap things that aren't builtins. */
if (var->data.location >= VARYING_SLOT_VAR0 &&
- var->data.location - VARYING_SLOT_VAR0 < 32) {
+ var->data.location - VARYING_SLOT_VAR0 < MAX_VARYINGS_INCL_PATCH) {

/* We can't repack xfb varyings. */
if (var->data.always_active_io)
@@ -450,88 +473,156 @@ compact_components(nir_shader *producer, nir_shader *consumer, uint8_t *comps,
type = glsl_get_array_element(type);
}

- /* Skip types that require more complex packing handling.
- * TODO: add support for these types.
- */
- if (glsl_type_is_array(type) ||
- glsl_type_is_dual_slot(type) ||
- glsl_type_is_matrix(type) ||
- glsl_type_is_struct(type) ||
- glsl_type_is_64bit(type))
+ if (!is_packing_supported_for_type(type))
continue;

- /* We ignore complex types above and all other vector types should
- * have been split into scalar variables by the lower_io_to_scalar
- * pass. The only exeption should by OpenGL xfb varyings.
- */
- if (glsl_get_vector_elements(type) != 1)
+ unsigned loc = var->data.location - VARYING_SLOT_VAR0;
+ store_varying_info_idx[loc][var->data.location_frac] =
+ ++num_of_comps_to_pack;
+ }
+ }
+
+ *varying_comp_info_size = num_of_comps_to_pack;
+ *varying_comp_info = rzalloc_array(NULL, struct varying_component,
+ num_of_comps_to_pack);
+
+ nir_function_impl *impl = nir_shader_get_entrypoint(consumer);
+
+ /* Walk over the shader and populate the varying component info array */
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
continue;

- unsigned location = var->data.location - VARYING_SLOT_VAR0;
- uint8_t used_comps = comps[location];
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+ if (intr->intrinsic != nir_intrinsic_load_deref)
+ continue;

- /* If there are no empty components there is nothing more for us to do.
- */
- if (used_comps == 0xf)
+ nir_variable *in_var =
+ nir_deref_instr_get_variable(nir_src_as_deref(intr->src[0]));
+
+ if (in_var->data.mode != nir_var_shader_in)
continue;

- bool found_new_offset = false;
- uint8_t interp = get_interp_type(var, type, default_to_smooth_interp);
- for (; cursor[interp] < 32; cursor[interp]++) {
- uint8_t cursor_used_comps = comps[cursor[interp]];
+ /* We only remap things that aren't builtins. */
+ if (in_var->data.location < VARYING_SLOT_VAR0)
+ continue;

- /* We couldn't find anywhere to pack the varying continue on. */
- if (cursor[interp] == location &&
- (var->data.location_frac == 0 ||
- cursor_used_comps & ((1 << (var->data.location_frac)) - 1)))
- break;
+ unsigned location = in_var->data.location - VARYING_SLOT_VAR0;
+ if (location >= MAX_VARYINGS_INCL_PATCH)
+ continue;

- /* We can only pack varyings with matching interpolation types */
- if (interp_type[cursor[interp]] != interp)
- continue;
+ unsigned var_info_idx =
+ store_varying_info_idx[location][in_var->data.location_frac];
+ if (!var_info_idx)
+ continue;

- /* Interpolation loc must match also.
- * TODO: i965 can handle these if they don't match, but the
- * radeonsi nir backend handles everything as vec4s and so expects
- * this to be the same for all components. We could make this
- * check driver specfific or drop it if NIR ever become the only
- * radeonsi backend.
- */
- if (interp_loc[cursor[interp]] != get_interp_loc(var))
- continue;
+ struct varying_component *vc_info =
+ &(*varying_comp_info)[var_info_idx-1];

- /* If the slot is empty just skip it for now, compact_var_list()
- * can be called after this function to remove empty slots for us.
- * TODO: finish implementing compact_var_list() requires array and
- * matrix splitting.
- */
- if (!cursor_used_comps)
- continue;
+ if (!vc_info->initialised) {
+ const struct glsl_type *type = in_var->type;
+ if (nir_is_per_vertex_io(in_var, consumer->info.stage)) {
+ assert(glsl_type_is_array(type));
+ type = glsl_get_array_element(type);
+ }

- uint8_t unused_comps = ~cursor_used_comps;
+ vc_info->var = in_var;
+ vc_info->interp_type =
+ get_interp_type(in_var, type, default_to_smooth_interp);
+ vc_info->interp_loc = get_interp_loc(in_var);
+ vc_info->is_patch = in_var->data.patch;
+ }
+ }
+ }
+}

- for (unsigned i = 0; i < 4; i++) {
- uint8_t new_var_comps = 1 << i;
- if (unused_comps & new_var_comps) {
- remap[location][var->data.location_frac].component = i;
- remap[location][var->data.location_frac].location =
- cursor[interp] + VARYING_SLOT_VAR0;
+/* If there are empty components in the slot compact the remaining components
+ * as close to component 0 as possible. This will make it easier to fill the
+ * empty components with components from a different slot in a following pass.
+ */
+static void
+compact_components(nir_shader *producer, nir_shader *consumer,
+ uint8_t *unmoveable_comps, bool default_to_smooth_interp)
+{
+ struct exec_list *input_list = &consumer->inputs;
+ struct exec_list *output_list = &producer->outputs;
+ struct varying_loc remap[MAX_VARYINGS_INCL_PATCH][4] = {{{0}, {0}}};
+ struct varying_component *varying_comp_info;
+ unsigned varying_comp_info_size;

- found_new_offset = true;
+ /* Gather varying component info */
+ gather_varying_component_info(consumer, &varying_comp_info,
+ &varying_comp_info_size,
+ default_to_smooth_interp);

- /* Turn off the mask for the component we are remapping */
- if (comps[location] & 1 << var->data.location_frac) {
- comps[location] ^= 1 << var->data.location_frac;
- comps[cursor[interp]] |= new_var_comps;
- }
- break;
- }
+ /* Sort varying components. This attempts to reduce register pressure by
+ * packing components with a similar live range.
+ */
+ qsort(varying_comp_info, varying_comp_info_size,
+ sizeof(struct varying_component), cmp_varying_component);
+
+ unsigned cursor = 0;
+ unsigned comp = 0;
+ struct varying_component *prev_info = NULL;
+
+ /* Set the remap array based on the sorted components */
+ for (unsigned i = 0; i < varying_comp_info_size; i++ ) {
+ struct varying_component *info = &varying_comp_info[i];
+
+ /* Set the cursor to start at 32 for patches */
+ if (info->is_patch) {
+ if (!prev_info)
+ cursor = MAX_VARYING;
+ else if (info->is_patch != prev_info->is_patch)
+ cursor = MAX_VARYING;
+ }
+
+ for (; cursor < MAX_VARYINGS_INCL_PATCH; cursor++) {
+
+ while (comp < 4) {
+ if (unmoveable_comps[cursor] & (1 << comp)) {
+ comp++;
+ continue;
}

- if (found_new_offset)
- break;
+ break;
+ }
+
+ if (comp == 4) {
+ comp = 0;
+ continue;
}
+
+ /* We can only pack varyings with matching interpolation types */
+ if (prev_info && comp != 0 &&
+ info->interp_type != prev_info->interp_type) {
+ comp = 0;
+ continue;
+ }
+
+ /* Interpolation loc must match also.
+ * TODO: i965 can handle these if they don't match, but the
+ * radeonsi nir backend handles everything as vec4s and so expects
+ * this to be the same for all components. We could make this
+ * check driver specific or drop it if NIR ever becomes the only
+ * radeonsi backend.
+ */
+ if (prev_info && comp != 0 &&
+ info->interp_loc != prev_info->interp_loc) {
+ comp = 0;
+ continue;
+ }
+
+ unsigned location = info->var->data.location - VARYING_SLOT_VAR0;
+ remap[location][info->var->data.location_frac].component = comp++;
+ remap[location][info->var->data.location_frac].location =
+ cursor + VARYING_SLOT_VAR0;
+
+ break;
}
+
+ prev_info = info;
}

uint64_t zero = 0;
@@ -563,20 +654,16 @@ nir_compact_varyings(nir_shader *producer, nir_shader *consumer,
assert(producer->info.stage != MESA_SHADER_FRAGMENT);
assert(consumer->info.stage != MESA_SHADER_VERTEX);

- uint8_t comps[32] = {0};
- uint8_t interp_type[32] = {0};
- uint8_t interp_loc[32] = {0};
+ uint8_t unmoveable_comps[MAX_VARYINGS_INCL_PATCH] = {0};

- get_slot_component_masks_and_interp_types(&producer->outputs, comps,
- interp_type, interp_loc,
- producer->info.stage,
- default_to_smooth_interp);
- get_slot_component_masks_and_interp_types(&consumer->inputs, comps,
- interp_type, interp_loc,
- consumer->info.stage,
- default_to_smooth_interp);
+ get_unmoveable_components_masks(&producer->outputs, unmoveable_comps,
+ producer->info.stage,
+ default_to_smooth_interp);
+ get_unmoveable_components_masks(&consumer->inputs, unmoveable_comps,
+ consumer->info.stage,
+ default_to_smooth_interp);

- compact_components(producer, consumer, comps, interp_type, interp_loc,
+ compact_components(producer, consumer, unmoveable_comps,
default_to_smooth_interp);
}
--
2.19.2
Timothy Arceri
2018-12-10 05:49:33 UTC
Permalink
Sorry please ignore this for now. I've realised there is a bug here
where we could end up packing components in only one of the shaders but
not the other. For example if we have an array on one side but just a
bunch of individual varyings on the other (which is legal I believe).
I'll send a version 3 to fix this.

Timothy Arceri
2018-12-10 00:30:59 UTC
Permalink
This will be used in the following patches to determine if we
support packing the components of a varying.
---
src/compiler/nir/nir_linking_helpers.c | 23 +++++++++++++++++++++++
1 file changed, 23 insertions(+)

diff --git a/src/compiler/nir/nir_linking_helpers.c b/src/compiler/nir/nir_linking_helpers.c
index 845aba5c87..c26582ddec 100644
--- a/src/compiler/nir/nir_linking_helpers.c
+++ b/src/compiler/nir/nir_linking_helpers.c
@@ -224,6 +224,29 @@ get_interp_loc(nir_variable *var)
return INTERPOLATE_LOC_CENTER;
}

+static bool
+is_packing_supported_for_type(const struct glsl_type *type)
+{
+ /* Skip types that require more complex packing handling.
+ * TODO: add support for these types?
+ */
+ if (glsl_type_is_array(type) ||
+ glsl_type_is_dual_slot(type) ||
+ glsl_type_is_matrix(type) ||
+ glsl_type_is_struct(type) ||
+ glsl_type_is_64bit(type))
+ return false;
+
+ /* We ignore complex types above and all other vector types should
+ * have been split into scalar variables by the lower_io_to_scalar
+ * pass. The only exception should be OpenGL xfb varyings.
+ */
+ if (glsl_get_vector_elements(type) != 1)
+ return false;
+
+ return true;
+}
+
static void
get_slot_component_masks_and_interp_types(struct exec_list *var_list,
uint8_t *comps,
--
2.19.2
Loading...