Discussion:
[PATCH 1/5] radeonsi/gfx9: fix vertex idx in ES with multiple waves per threadgroup
Add Reply
Nicolai Hähnle
2017-07-17 10:57:33 UTC
Reply
Permalink
Raw Message
From: Nicolai Hähnle <***@amd.com>

Cc: mesa-***@lists.freedesktop.org
---
src/gallium/drivers/radeonsi/si_shader.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 4c0cda5..7a44e61 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2890,21 +2890,26 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
struct si_shader *es = ctx->shader;
struct tgsi_shader_info *info = &es->selector->info;
LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
ctx->param_es2gs_offset);
LLVMValueRef lds_base = NULL;
unsigned chan;
int i;

if (ctx->screen->b.chip_class >= GFX9 && info->num_outputs) {
unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
- lds_base = LLVMBuildMul(gallivm->builder, ac_get_thread_id(&ctx->ac),
+ LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
+ LLVMValueRef wave_idx = unpack_param(ctx, ctx->param_merged_wave_info, 24, 4);
+ vertex_idx = LLVMBuildOr(gallivm->builder, vertex_idx,
+ LLVMBuildMul(gallivm->builder, wave_idx,
+ LLVMConstInt(ctx->i32, 64, false), ""), "");
+ lds_base = LLVMBuildMul(gallivm->builder, vertex_idx,
LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
}

for (i = 0; i < info->num_outputs; i++) {
LLVMValueRef *out_ptr = ctx->outputs[i];
int param;

if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
continue;
--
2.9.3
Nicolai Hähnle
2017-07-17 10:57:35 UTC
Reply
Permalink
Raw Message
From: Nicolai Hähnle <***@amd.com>

---
src/gallium/drivers/radeonsi/si_shader.c | 7 +++++++
1 file changed, 7 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 9aeda49..91f2ea3 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5020,20 +5020,27 @@ const char *si_get_shader_name(const struct si_shader *shader, unsigned processo

void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader,
struct pipe_debug_callback *debug, unsigned processor,
FILE *file, bool check_debug_option)
{
if (!check_debug_option ||
r600_can_dump_shader(&sscreen->b, processor))
si_dump_shader_key(processor, shader, file);

if (!check_debug_option && shader->binary.llvm_ir_string) {
+ if (shader->previous_stage &&
+ shader->previous_stage->binary.llvm_ir_string) {
+ fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
+ si_get_shader_name(shader, processor));
+ fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
+ }
+
fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
si_get_shader_name(shader, processor));
fprintf(file, "%s\n", shader->binary.llvm_ir_string);
}

if (!check_debug_option ||
(r600_can_dump_shader(&sscreen->b, processor) &&
!(sscreen->b.debug_flags & DBG_NO_ASM))) {
fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
--
2.9.3
Nicolai Hähnle
2017-07-17 10:57:36 UTC
Reply
Permalink
Raw Message
From: Nicolai Hähnle <***@amd.com>

---
src/gallium/drivers/radeonsi/si_shader.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 91f2ea3..af93ca1 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -7344,29 +7344,33 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
* workaround must be applied.
*/
if (shader->is_monolithic) {
/* Monolithic shader (compiled as a whole, has many variants,
* may take a long time to compile).
*/
r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
if (r)
return r;
} else {
- /* The shader consists of 2-3 parts:
+ /* The shader consists of several parts:
*
* - the middle part is the user shader, it has 1 variant only
* and it was compiled during the creation of the shader
* selector
* - the prolog part is inserted at the beginning
* - the epilog part is inserted at the end
*
* The prolog and epilog have many (but simple) variants.
+ *
+ * Starting with gfx9, geometry and tessellation control
+ * shaders also contain the prolog and user shader parts of
+ * the previous shader stage.
*/

/* Copy the compiled TGSI shader data over. */
shader->is_binary_shared = true;
shader->binary = mainp->binary;
shader->config = mainp->config;
shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
memcpy(shader->info.vs_output_param_offset,
--
2.9.3
Nicolai Hähnle
2017-07-17 10:57:37 UTC
Reply
Permalink
Raw Message
From: Nicolai Hähnle <***@amd.com>

---
src/gallium/drivers/radeonsi/si_shader.c | 3 +++
1 file changed, 3 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index af93ca1..30de03f 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -7359,20 +7359,23 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
* - the prolog part is inserted at the beginning
* - the epilog part is inserted at the end
*
* The prolog and epilog have many (but simple) variants.
*
* Starting with gfx9, geometry and tessellation control
* shaders also contain the prolog and user shader parts of
* the previous shader stage.
*/

+ if (!mainp)
+ return -1;
+
/* Copy the compiled TGSI shader data over. */
shader->is_binary_shared = true;
shader->binary = mainp->binary;
shader->config = mainp->config;
shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
memcpy(shader->info.vs_output_param_offset,
mainp->info.vs_output_param_offset,
sizeof(mainp->info.vs_output_param_offset));
--
2.9.3
Marek Olšák
2017-07-25 18:52:08 UTC
Reply
Permalink
Raw Message
Patches 1, 3-5:

Reviewed: Marek Olšák <***@amd.com>

Marek
Post by Nicolai Hähnle
---
src/gallium/drivers/radeonsi/si_shader.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index af93ca1..30de03f 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -7359,20 +7359,23 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
* - the prolog part is inserted at the beginning
* - the epilog part is inserted at the end
*
* The prolog and epilog have many (but simple) variants.
*
* Starting with gfx9, geometry and tessellation control
* shaders also contain the prolog and user shader parts of
* the previous shader stage.
*/
+ if (!mainp)
+ return -1;
+
/* Copy the compiled TGSI shader data over. */
shader->is_binary_shared = true;
shader->binary = mainp->binary;
shader->config = mainp->config;
shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
memcpy(shader->info.vs_output_param_offset,
mainp->info.vs_output_param_offset,
sizeof(mainp->info.vs_output_param_offset));
--
2.9.3
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Nicolai Hähnle
2017-07-17 10:57:34 UTC
Reply
Permalink
Raw Message
From: Nicolai Hähnle <***@amd.com>

With merged ESGS shaders, the GS part of a wave may be empty, and the
hardware gets confused if any GS messages are sent from that wave. Since
S_SENDMSG is executed even when EXEC = 0, we have to wrap even
non-monolithic GS shaders in an if-block, so that the entire shader and
hence the S_SENDMSG instructions are skipped in empty waves.

This change is not required for TCS/HS, but applying it there as well
simplifies the code a bit.

Fixes GL45-CTS.geometry_shader.rendering.rendering.*

Cc: mesa-***@lists.freedesktop.org
---
src/gallium/drivers/radeonsi/si_shader.c | 74 +++++++++++++----------
src/gallium/drivers/radeonsi/si_shader_internal.h | 3 +
2 files changed, 45 insertions(+), 32 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 7a44e61..9aeda49 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2713,20 +2713,23 @@ si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
}

/* This only writes the tessellation factor levels. */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
struct si_shader_context *ctx = si_shader_context(bld_base);
LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;

si_copy_tcs_inputs(bld_base);

+ if (ctx->screen->b.chip_class >= GFX9)
+ lp_build_endif(&ctx->merged_wrap_if_state);
+
rel_patch_id = get_rel_patch_id(ctx);
invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

/* Return epilog parameters from this function. */
LLVMBuilderRef builder = ctx->gallivm.builder;
LLVMValueRef ret = ctx->return_value;
unsigned vgpr;

if (ctx->screen->b.chip_class >= GFX9) {
@@ -2946,20 +2949,23 @@ static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
else
return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
}

static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
{
struct si_shader_context *ctx = si_shader_context(bld_base);

ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
si_get_gs_wave_id(ctx));
+
+ if (ctx->screen->b.chip_class >= GFX9)
+ lp_build_endif(&ctx->merged_wrap_if_state);
}

static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
{
struct si_shader_context *ctx = si_shader_context(bld_base);
struct gallivm_state *gallivm = &ctx->gallivm;
struct tgsi_shader_info *info = &ctx->shader->selector->info;
struct si_shader_output_values *outputs = NULL;
int i,j;

@@ -5523,39 +5529,55 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
break;
default:
assert(!"Unsupported shader type");
return false;
}

create_function(ctx);
preload_ring_buffers(ctx);

/* For GFX9 merged shaders:
- * - Set EXEC. If the prolog is present, set EXEC there instead.
+ * - Set EXEC for the first shader. If the prolog is present, set
+ * EXEC there instead.
* - Add a barrier before the second shader.
+ * - In the second shader, reset EXEC to ~0 and wrap the main part in
+ * an if-statement. This is required for correctness in geometry
+ * shaders, to ensure that empty GS waves do not send GS_EMIT and
+ * GS_CUT messages.
*
- * The same thing for monolithic shaders is done in
- * si_build_wrapper_function.
+ * For monolithic merged shaders, the first shader is wrapped in an
+ * if-block together with its prolog in si_build_wrapper_function.
*/
- if (ctx->screen->b.chip_class >= GFX9 && !is_monolithic) {
- if (sel->info.num_instructions > 1 && /* not empty shader */
+ if (ctx->screen->b.chip_class >= GFX9) {
+ if (!is_monolithic &&
+ sel->info.num_instructions > 1 && /* not empty shader */
(shader->key.as_es || shader->key.as_ls) &&
(ctx->type == PIPE_SHADER_TESS_EVAL ||
(ctx->type == PIPE_SHADER_VERTEX &&
!sel->vs_needs_prolog))) {
si_init_exec_from_input(ctx,
ctx->param_merged_wave_info, 0);
} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
ctx->type == PIPE_SHADER_GEOMETRY) {
- si_init_exec_from_input(ctx,
- ctx->param_merged_wave_info, 8);
+ if (!is_monolithic)
+ si_init_exec_full_mask(ctx);
+
+ /* The barrier must execute for all shaders in a
+ * threadgroup.
+ */
si_llvm_emit_barrier(NULL, bld_base, NULL);
+
+ LLVMValueRef num_threads = unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
+ LLVMValueRef ena =
+ LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
+ ac_get_thread_id(&ctx->ac), num_threads, "");
+ lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
}
}

if (ctx->type == PIPE_SHADER_GEOMETRY) {
int i;
for (i = 0; i < 4; i++) {
ctx->gs_next_vertex[i] =
lp_build_alloca(&ctx->gallivm,
ctx->i32, "");
}
@@ -6012,29 +6034,23 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
LLVMValueRef in[48];
LLVMValueRef ret;
LLVMTypeRef ret_type;
unsigned out_idx = 0;

num_params = LLVMCountParams(parts[part]);
assert(num_params <= ARRAY_SIZE(param_types));

/* Merged shaders are executed conditionally depending
* on the number of enabled threads passed in the input SGPRs. */
- if (is_merged_shader(ctx->shader) &&
- (part == 0 || part == next_shader_first_part)) {
+ if (is_merged_shader(ctx->shader) && part == 0) {
LLVMValueRef ena, count = initial[3];

- /* The thread count for the 2nd shader is at bit-offset 8. */
- if (part == next_shader_first_part) {
- count = LLVMBuildLShr(builder, count,
- LLVMConstInt(ctx->i32, 8, 0), "");
- }
count = LLVMBuildAnd(builder, count,
LLVMConstInt(ctx->i32, 0x7f, 0), "");
ena = LLVMBuildICmp(builder, LLVMIntULT,
ac_get_thread_id(&ctx->ac), count, "");
lp_build_if(&if_state, &ctx->gallivm, ena);
}

/* Derive arguments for the next part from outputs of the
* previous one.
*/
@@ -6077,40 +6093,34 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
}
}

in[param_idx] = arg;
out_idx += param_size;
}

ret = LLVMBuildCall(builder, parts[part], in, num_params, "");

if (is_merged_shader(ctx->shader) &&
- (part + 1 == next_shader_first_part ||
- part + 1 == num_parts)) {
+ part + 1 == next_shader_first_part) {
lp_build_endif(&if_state);

- if (part + 1 == next_shader_first_part) {
- /* A barrier is required between 2 merged shaders. */
- si_llvm_emit_barrier(NULL, &ctx->bld_base, NULL);
-
- /* The second half of the merged shader should use
- * the inputs from the toplevel (wrapper) function,
- * not the return value from the last call.
- *
- * That's because the last call was executed condi-
- * tionally, so we can't consume it in the main
- * block.
- */
- memcpy(out, initial, sizeof(initial));
- num_out = initial_num_out;
- num_out_sgpr = initial_num_out_sgpr;
- }
+ /* The second half of the merged shader should use
+ * the inputs from the toplevel (wrapper) function,
+ * not the return value from the last call.
+ *
+ * That's because the last call was executed condi-
+ * tionally, so we can't consume it in the main
+ * block.
+ */
+ memcpy(out, initial, sizeof(initial));
+ num_out = initial_num_out;
+ num_out_sgpr = initial_num_out_sgpr;
continue;
}

/* Extract the returned GPRs. */
ret_type = LLVMTypeOf(ret);
num_out = 0;
num_out_sgpr = 0;

if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index 3556e69..3f127cf 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -18,20 +18,21 @@
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef SI_SHADER_PRIVATE_H
#define SI_SHADER_PRIVATE_H

#include "si_shader.h"
+#include "gallivm/lp_bld_flow.h"
#include "gallivm/lp_bld_init.h"
#include "gallivm/lp_bld_tgsi.h"
#include "tgsi/tgsi_parse.h"
#include "ac_llvm_util.h"
#include "ac_llvm_build.h"

#include <llvm-c/Core.h>
#include <llvm-c/TargetMachine.h>

struct pipe_debug_callback;
@@ -98,20 +99,22 @@ struct si_shader_context {
unsigned temps_count;
LLVMValueRef system_values[RADEON_LLVM_MAX_SYSTEM_VALUES];

LLVMValueRef *imms;
unsigned imms_num;

struct si_llvm_flow *flow;
unsigned flow_depth;
unsigned flow_depth_max;

+ struct lp_build_if_state merged_wrap_if_state;
+
struct tgsi_array_info *temp_arrays;
LLVMValueRef *temp_array_allocas;

LLVMValueRef undef_alloca;

LLVMValueRef main_fn;
LLVMTypeRef return_type;

/* Parameter indices for LLVMGetParam. */
int param_rw_buffers;
--
2.9.3
Marek Olšák
2017-07-25 18:47:54 UTC
Reply
Permalink
Raw Message
Post by Nicolai Hähnle
With merged ESGS shaders, the GS part of a wave may be empty, and the
hardware gets confused if any GS messages are sent from that wave. Since
S_SENDMSG is executed even when EXEC = 0, we have to wrap even
non-monolithic GS shaders in an if-block, so that the entire shader and
hence the S_SENDMSG instructions are skipped in empty waves.
This change is not required for TCS/HS, but applying it there as well
simplifies the code a bit.
Fixes GL45-CTS.geometry_shader.rendering.rendering.*
---
src/gallium/drivers/radeonsi/si_shader.c | 74 +++++++++++++----------
src/gallium/drivers/radeonsi/si_shader_internal.h | 3 +
2 files changed, 45 insertions(+), 32 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 7a44e61..9aeda49 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2713,20 +2713,23 @@ si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
}
/* This only writes the tessellation factor levels. */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
struct si_shader_context *ctx = si_shader_context(bld_base);
LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
si_copy_tcs_inputs(bld_base);
+ if (ctx->screen->b.chip_class >= GFX9)
+ lp_build_endif(&ctx->merged_wrap_if_state);
+
rel_patch_id = get_rel_patch_id(ctx);
invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
/* Return epilog parameters from this function. */
LLVMBuilderRef builder = ctx->gallivm.builder;
LLVMValueRef ret = ctx->return_value;
unsigned vgpr;
if (ctx->screen->b.chip_class >= GFX9) {
@@ -2946,20 +2949,23 @@ static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
else
return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
}
static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
{
struct si_shader_context *ctx = si_shader_context(bld_base);
ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
si_get_gs_wave_id(ctx));
+
+ if (ctx->screen->b.chip_class >= GFX9)
+ lp_build_endif(&ctx->merged_wrap_if_state);
}
static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
{
struct si_shader_context *ctx = si_shader_context(bld_base);
struct gallivm_state *gallivm = &ctx->gallivm;
struct tgsi_shader_info *info = &ctx->shader->selector->info;
struct si_shader_output_values *outputs = NULL;
int i,j;
@@ -5523,39 +5529,55 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
break;
assert(!"Unsupported shader type");
return false;
}
create_function(ctx);
preload_ring_buffers(ctx);
- * - Set EXEC. If the prolog is present, set EXEC there instead.
+ * - Set EXEC for the first shader. If the prolog is present, set
+ * EXEC there instead.
* - Add a barrier before the second shader.
+ * - In the second shader, reset EXEC to ~0 and wrap the main part in
+ * an if-statement. This is required for correctness in geometry
+ * shaders, to ensure that empty GS waves do not send GS_EMIT and
+ * GS_CUT messages.
*
- * The same thing for monolithic shaders is done in
- * si_build_wrapper_function.
+ * For monolithic merged shaders, the first shader is wrapped in an
+ * if-block together with its prolog in si_build_wrapper_function.
*/
- if (ctx->screen->b.chip_class >= GFX9 && !is_monolithic) {
- if (sel->info.num_instructions > 1 && /* not empty shader */
+ if (ctx->screen->b.chip_class >= GFX9) {
+ if (!is_monolithic &&
+ sel->info.num_instructions > 1 && /* not empty shader */
(shader->key.as_es || shader->key.as_ls) &&
(ctx->type == PIPE_SHADER_TESS_EVAL ||
(ctx->type == PIPE_SHADER_VERTEX &&
!sel->vs_needs_prolog))) {
si_init_exec_from_input(ctx,
ctx->param_merged_wave_info, 0);
} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
ctx->type == PIPE_SHADER_GEOMETRY) {
- si_init_exec_from_input(ctx,
- ctx->param_merged_wave_info, 8);
+ if (!is_monolithic)
+ si_init_exec_full_mask(ctx);
The TCS epilog will execute with the full EXEC mask, which might cause issues.

Marek
Nicolai Hähnle
2017-07-29 07:22:32 UTC
Reply
Permalink
Raw Message
Thanks. I think I know what the issue is, testing a fix now.

Cheers,
Nicolai
Hi Nicolai,
FYI, this patch breaks 30 piglit tests on GFX9. Non-monolithic and
R600_DEBUG=nooptvariant ../piglit/bin/shader_runner
/home/marek/dev/piglit/generated_tests/spec/arb_tessellation_shader/execution/built-in-functions/tcs-op-selection-bool-mat2-mat2.shader_test
-auto
ATTENTION: default value of option vblank_mode overridden by environment.
LLVM triggered Diagnostic Handler: <unknown>:0:0: in function main <{
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float,
float, float, float }> ([12 x <4 x i32>] addrspace(2)*, i32, i32, i32,
i32, i32, i32, i32, i32, [32 x <4 x i32>] addrspace(2)*, [80 x <8 x
i32>] addrspace(2)*, [16 x <4 x i32>] addrspace(2)*, i32, i32, i32, i32,
i32, i32, i32, i32, i32, i32, [32 x <4 x i32>] addrspace(2)*, [80 x <8 x
i32>] addrspace(2)*, i32, i32): unsupported dynamic alloca
LLVM failed to compile shader
radeonsi: can't compile a main shader part
R600_DEBUG=mono ../piglit/bin/shader_runner
/home/marek/dev/piglit/generated_tests/spec/arb_tessellation_shader/execution/built-in-functions/tcs-op-selection-bool-mat2-mat2.shader_test
-auto
ATTENTION: default value of option vblank_mode overridden by environment.
LLVM ERROR: Cannot select: t5: i32,ch = stacksave t4
In function: wrapper
Segmentation fault
Marek
With merged ESGS shaders, the GS part of a wave may be empty, and the
hardware gets confused if any GS messages are sent from that wave. Since
S_SENDMSG is executed even when EXEC = 0, we have to wrap even
non-monolithic GS shaders in an if-block, so that the entire shader and
hence the S_SENDMSG instructions are skipped in empty waves.
This change is not required for TCS/HS, but applying it there as well
simplifies the logic a bit.
Fixes GL45-CTS.geometry_shader.rendering.rendering.*
v2: ensure that the TCS epilog doesn't run for non-existing patches
---
src/gallium/drivers/radeonsi/si_shader.c | 109
+++++++++++++++-------
src/gallium/drivers/radeonsi/si_shader_internal.h | 3 +
2 files changed, 79 insertions(+), 33 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_shader.c
b/src/gallium/drivers/radeonsi/si_shader.c
index a153cb7..9376d90 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -175,6 +175,20 @@ unsigned si_shader_io_get_unique_index(unsigned
semantic_name, unsigned index)
}
/**
+ * Helper function that builds an LLVM IR PHI node and immediately adds
+ * incoming edges.
+ */
+static LLVMValueRef
+build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
+ unsigned count_incoming, LLVMValueRef *values,
+ LLVMBasicBlockRef *blocks)
+{
+ LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
+ LLVMAddIncoming(phi, values, blocks, count_incoming);
+ return phi;
+}
+
+/**
* Get the value of a shader input parameter and extract a bitfield.
*/
static LLVMValueRef unpack_param(struct si_shader_context *ctx,
@@ -2698,6 +2712,7 @@ si_insert_input_ptr_as_2xi32(struct
si_shader_context *ctx, LLVMValueRef ret,
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
struct si_shader_context *ctx = si_shader_context(bld_base);
+ LLVMBuilderRef builder = ctx->gallivm.builder;
LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
si_copy_tcs_inputs(bld_base);
@@ -2706,8 +2721,29 @@ static void si_llvm_emit_tcs_epilogue(struct
lp_build_tgsi_context *bld_base)
invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
+ if (ctx->screen->b.chip_class >= GFX9) {
+ LLVMBasicBlockRef blocks[2] = {
+ LLVMGetInsertBlock(builder),
+ ctx->merged_wrap_if_state.entry_block
+ };
+ LLVMValueRef values[2];
+
+ lp_build_endif(&ctx->merged_wrap_if_state);
+
+ values[0] = rel_patch_id;
+ values[1] = LLVMGetUndef(ctx->i32);
+ rel_patch_id = build_phi(&ctx->ac, ctx->i32, 2,
values, blocks);
+
+ values[0] = tf_lds_offset;
+ values[1] = LLVMGetUndef(ctx->i32);
+ tf_lds_offset = build_phi(&ctx->ac, ctx->i32, 2,
values, blocks);
+
+ values[0] = invocation_id;
+ values[1] = ctx->i32_1; /* cause the epilog to skip
threads */
+ invocation_id = build_phi(&ctx->ac, ctx->i32, 2,
values, blocks);
+ }
+
/* Return epilog parameters from this function. */
- LLVMBuilderRef builder = ctx->gallivm.builder;
LLVMValueRef ret = ctx->return_value;
unsigned vgpr;
@@ -2935,6 +2971,9 @@ static void si_llvm_emit_gs_epilogue(struct
lp_build_tgsi_context *bld_base)
ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP |
AC_SENDMSG_GS_DONE,
si_get_gs_wave_id(ctx));
+
+ if (ctx->screen->b.chip_class >= GFX9)
+ lp_build_endif(&ctx->merged_wrap_if_state);
}
static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
@@ -5502,14 +5541,20 @@ static bool si_compile_tgsi_main(struct
si_shader_context *ctx,
preload_ring_buffers(ctx);
- * - Set EXEC. If the prolog is present, set EXEC there instead.
+ * - Set EXEC for the first shader. If the prolog is present, set
+ * EXEC there instead.
* - Add a barrier before the second shader.
+ * - In the second shader, reset EXEC to ~0 and wrap the main part in
+ * an if-statement. This is required for correctness in geometry
+ * shaders, to ensure that empty GS waves do not send GS_EMIT and
+ * GS_CUT messages.
*
- * The same thing for monolithic shaders is done in
- * si_build_wrapper_function.
+ * For monolithic merged shaders, the first shader is wrapped in an
+ * if-block together with its prolog in
si_build_wrapper_function.
*/
- if (ctx->screen->b.chip_class >= GFX9 && !is_monolithic) {
- if (sel->info.num_instructions > 1 && /* not empty shader */
+ if (ctx->screen->b.chip_class >= GFX9) {
+ if (!is_monolithic &&
+ sel->info.num_instructions > 1 && /* not empty shader */
(shader->key.as_es || shader->key.as_ls) &&
(ctx->type == PIPE_SHADER_TESS_EVAL ||
(ctx->type == PIPE_SHADER_VERTEX &&
@@ -5518,9 +5563,19 @@ static bool si_compile_tgsi_main(struct
si_shader_context *ctx,
ctx->param_merged_wave_info, 0);
} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
ctx->type == PIPE_SHADER_GEOMETRY) {
- si_init_exec_from_input(ctx,
-
ctx->param_merged_wave_info, 8);
+ if (!is_monolithic)
+ si_init_exec_full_mask(ctx);
+
+ /* The barrier must execute for all shaders in a
+ * threadgroup.
+ */
si_llvm_emit_barrier(NULL, bld_base, NULL);
+
+ LLVMValueRef num_threads = unpack_param(ctx,
ctx->param_merged_wave_info, 8, 8);
+ LLVMValueRef ena =
+ LLVMBuildICmp(ctx->ac.builder,
LLVMIntULT,
+
ac_get_thread_id(&ctx->ac), num_threads, "");
+ lp_build_if(&ctx->merged_wrap_if_state,
&ctx->gallivm, ena);
}
}
@@ -5991,15 +6046,9 @@ static void si_build_wrapper_function(struct
si_shader_context *ctx,
/* Merged shaders are executed conditionally depending
* on the number of enabled threads passed in the
input SGPRs. */
- if (is_merged_shader(ctx->shader) &&
- (part == 0 || part == next_shader_first_part)) {
+ if (is_merged_shader(ctx->shader) && part == 0) {
LLVMValueRef ena, count = initial[3];
- /* The thread count for the 2nd shader is at
bit-offset 8. */
- if (part == next_shader_first_part) {
- count = LLVMBuildLShr(builder, count,
-
LLVMConstInt(ctx->i32, 8, 0), "");
- }
count = LLVMBuildAnd(builder, count,
LLVMConstInt(ctx->i32,
0x7f, 0), "");
ena = LLVMBuildICmp(builder, LLVMIntULT,
@@ -6056,26 +6105,20 @@ static void si_build_wrapper_function(struct
si_shader_context *ctx,
ret = LLVMBuildCall(builder, parts[part], in,
num_params, "");
if (is_merged_shader(ctx->shader) &&
- (part + 1 == next_shader_first_part ||
- part + 1 == num_parts)) {
+ part + 1 == next_shader_first_part) {
lp_build_endif(&if_state);
- if (part + 1 == next_shader_first_part) {
- /* A barrier is required between 2
merged shaders. */
- si_llvm_emit_barrier(NULL,
&ctx->bld_base, NULL);
-
- /* The second half of the merged
shader should use
- * the inputs from the toplevel
(wrapper) function,
- * not the return value from the
last call.
- *
- * That's because the last call was
executed condi-
- * tionally, so we can't consume it
in the main
- * block.
- */
- memcpy(out, initial, sizeof(initial));
- num_out = initial_num_out;
- num_out_sgpr = initial_num_out_sgpr;
- }
+ /* The second half of the merged shader
should use
+ * the inputs from the toplevel (wrapper)
function,
+ * not the return value from the last call.
+ *
+ * That's because the last call was executed
condi-
+ * tionally, so we can't consume it in the main
+ * block.
+ */
+ memcpy(out, initial, sizeof(initial));
+ num_out = initial_num_out;
+ num_out_sgpr = initial_num_out_sgpr;
continue;
}
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h
b/src/gallium/drivers/radeonsi/si_shader_internal.h
index 6e86e0b..6b98bca 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -25,6 +25,7 @@
#define SI_SHADER_PRIVATE_H
#include "si_shader.h"
+#include "gallivm/lp_bld_flow.h"
#include "gallivm/lp_bld_init.h"
#include "gallivm/lp_bld_tgsi.h"
#include "tgsi/tgsi_parse.h"
@@ -105,6 +106,8 @@ struct si_shader_context {
unsigned flow_depth;
unsigned flow_depth_max;
+ struct lp_build_if_state merged_wrap_if_state;
+
struct tgsi_array_info *temp_arrays;
LLVMValueRef *temp_array_allocas;
--
2.9.3
_______________________________________________
mesa-stable mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-stable
<https://lists.freedesktop.org/mailman/listinfo/mesa-stable>
--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
Emil Velikov
2017-08-11 20:40:12 UTC
Reply
Permalink
Raw Message
Hi Nicolai,

It seems that the first two patches [1] in this series landed with the
stable tag.
While this patch alongside the remaining three [2] did not.

Do we want the latter in stable as well or shall we leave them out?

Thanks
Emil

[1]
873789002f5 radeonsi/gfx9: fix vertex idx in ES with multiple waves
per threadgroup
081ac6e5c6d radeonsi/gfx9: always wrap GS and TCS in an if-block (v2)

[2]
760876a7b11 radeonsi: make sure TCS main output VGPRs don't alias inputs
4738dd9546c radeonsi/gfx9: dump previous stage LLVM IR for merged shaders
4dd86631f41 radeonsi: update a comment for merged shaders
06e20c4b8c5 radeonsi: bail out instead of crashing if the main shader
part failed to compile
Marek Olšák
2017-08-11 20:51:52 UTC
Reply
Permalink
Raw Message
Hi Nicolai,
It seems that the first two patches [1] in this series landed with the
stable tag.
While this patch alongside the remaining three [2] did not.
Do we want the latter in stable as well or shall we leave them out?
We can leave it out. It's just a tiny optimization.

Marek

Loading...