Discussion:
[Mesa-dev] [PATCH 00/25] amd/common, radeonsi: misc cleanups, refactorings, etc.
Nicolai Hähnle
2018-12-06 14:00:21 UTC
This is a grab bag of random patches that I've been accumulating, without
any real unifying theme. The main highlights are:

- finally move the perfcounter code into the radeonsi directory
- unify some RW buffer handling
- new helpers for cross-wave scans and reductions

Please review!
Thanks,
Nicolai
--
src/amd/common/ac_debug.c | 2 +
src/amd/common/ac_llvm_build.c | 247 +++++-
src/amd/common/ac_llvm_build.h | 37 +
src/amd/common/ac_nir_to_llvm.c | 2 +-
src/amd/common/ac_surface.c | 8 +-
src/amd/common/gfx9d.h | 12 +-
src/amd/common/sid.h | 13 +-
src/amd/common/sid_tables.py | 2 +-
src/amd/vulkan/radv_image.c | 8 +-
src/gallium/drivers/r600/sb/sb_ir.h | 2 +-
.../drivers/radeon/r600_perfcounter.c | 639 ---------------
.../drivers/radeonsi/Makefile.sources | 1 -
src/gallium/drivers/radeonsi/meson.build | 1 -
src/gallium/drivers/radeonsi/si_blit.c | 2 +-
src/gallium/drivers/radeonsi/si_build_pm4.h | 8 +-
src/gallium/drivers/radeonsi/si_cp_dma.c | 3 +-
src/gallium/drivers/radeonsi/si_debug.c | 13 +-
.../drivers/radeonsi/si_descriptors.c | 112 +--
.../drivers/radeonsi/si_perfcounter.c | 730 +++++++++++++++--
src/gallium/drivers/radeonsi/si_pipe.c | 40 +-
src/gallium/drivers/radeonsi/si_pipe.h | 6 +-
src/gallium/drivers/radeonsi/si_query.c | 254 +++---
src/gallium/drivers/radeonsi/si_query.h | 111 +--
src/gallium/drivers/radeonsi/si_shader.c | 43 +-
.../drivers/radeonsi/si_shader_tgsi_mem.c | 6 +-
src/gallium/drivers/radeonsi/si_state.c | 12 +-
src/gallium/drivers/radeonsi/si_state.h | 12 +-
src/gallium/drivers/radeonsi/si_state_draw.c | 40 +-
.../drivers/radeonsi/si_state_shaders.c | 4 +-
.../drivers/radeonsi/si_state_streamout.c | 61 +-
src/gallium/drivers/radeonsi/si_texture.c | 11 +-
.../winsys/amdgpu/drm/amdgpu_winsys.c | 36 +
32 files changed, 1331 insertions(+), 1147 deletions(-)
Nicolai Hähnle
2018-12-06 14:00:24 UTC
From: Nicolai Hähnle <***@amd.com>

The definition wasn't actually changed in gfx9, so having the suffix
makes no sense.
---
src/amd/common/ac_nir_to_llvm.c | 2 +-
src/amd/common/gfx9d.h | 12 ++++++------
src/amd/common/sid.h | 12 ++++++------
src/amd/vulkan/radv_image.c | 8 ++++----
src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c | 6 +++---
src/gallium/drivers/radeonsi/si_state.c | 10 +++++-----
6 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index fe65dfff8f3..cbb5be4b1a2 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1238,21 +1238,21 @@ static LLVMValueRef lower_gather4_integer(struct ac_llvm_context *ctx,
if (stype == GLSL_TYPE_UINT)
/* Create a NUM FORMAT - 0x2 or 0x4 - USCALED or UINT */
tmp = LLVMBuildSelect(ctx->builder, compare_cube_wa, LLVMConstInt(ctx->i32, 0x8000000, false),
LLVMConstInt(ctx->i32, 0x10000000, false), "");
else
/* Create a NUM FORMAT - 0x3 or 0x5 - SSCALED or SINT */
tmp = LLVMBuildSelect(ctx->builder, compare_cube_wa, LLVMConstInt(ctx->i32, 0xc000000, false),
LLVMConstInt(ctx->i32, 0x14000000, false), "");

/* replace the NUM FORMAT in the descriptor */
- tmp2 = LLVMBuildAnd(ctx->builder, tmp2, LLVMConstInt(ctx->i32, C_008F14_NUM_FORMAT_GFX6, false), "");
+ tmp2 = LLVMBuildAnd(ctx->builder, tmp2, LLVMConstInt(ctx->i32, C_008F14_NUM_FORMAT, false), "");
tmp2 = LLVMBuildOr(ctx->builder, tmp2, tmp, "");

args->resource = LLVMBuildInsertElement(ctx->builder, args->resource, tmp2, ctx->i32_1, "");

/* don't modify the coordinates for this case */
for (unsigned c = 0; c < 2; ++c)
args->coords[c] = LLVMBuildSelect(
ctx->builder, compare_cube_wa,
orig_coords[c], args->coords[c], "");
}
diff --git a/src/amd/common/gfx9d.h b/src/amd/common/gfx9d.h
index 2e790c54699..5d3de5842a1 100644
--- a/src/amd/common/gfx9d.h
+++ b/src/amd/common/gfx9d.h
@@ -1262,23 +1262,23 @@
#define S_030F14_COUNT_HI(x) (((unsigned)(x) & 0x7FFFFFFF) << 0)
#define G_030F14_COUNT_HI(x) (((x) >> 0) & 0x7FFFFFFF)
#define C_030F14_COUNT_HI 0x80000000
#define R_008F14_SQ_IMG_RSRC_WORD1 0x008F14
#define S_008F14_BASE_ADDRESS_HI(x) (((unsigned)(x) & 0xFF) << 0)
#define G_008F14_BASE_ADDRESS_HI(x) (((x) >> 0) & 0xFF)
#define C_008F14_BASE_ADDRESS_HI 0xFFFFFF00
#define S_008F14_MIN_LOD(x) (((unsigned)(x) & 0xFFF) << 8)
#define G_008F14_MIN_LOD(x) (((x) >> 8) & 0xFFF)
#define C_008F14_MIN_LOD 0xFFF000FF
-#define S_008F14_DATA_FORMAT_GFX9(x) (((unsigned)(x) & 0x3F) << 20)
-#define G_008F14_DATA_FORMAT_GFX9(x) (((x) >> 20) & 0x3F)
-#define C_008F14_DATA_FORMAT_GFX9 0xFC0FFFFF
+#define S_008F14_DATA_FORMAT(x) (((unsigned)(x) & 0x3F) << 20)
+#define G_008F14_DATA_FORMAT(x) (((x) >> 20) & 0x3F)
+#define C_008F14_DATA_FORMAT 0xFC0FFFFF
#define V_008F14_IMG_DATA_FORMAT_INVALID 0x00
#define V_008F14_IMG_DATA_FORMAT_8 0x01
#define V_008F14_IMG_DATA_FORMAT_16 0x02
#define V_008F14_IMG_DATA_FORMAT_8_8 0x03
#define V_008F14_IMG_DATA_FORMAT_32 0x04
#define V_008F14_IMG_DATA_FORMAT_16_16 0x05
#define V_008F14_IMG_DATA_FORMAT_10_11_11 0x06
#define V_008F14_IMG_DATA_FORMAT_11_11_10 0x07
#define V_008F14_IMG_DATA_FORMAT_10_10_10_2 0x08
#define V_008F14_IMG_DATA_FORMAT_2_10_10_10 0x09
@@ -1329,23 +1329,23 @@
#define V_008F14_IMG_DATA_FORMAT_N_IN_16_16_16_16 0x36
#define V_008F14_IMG_DATA_FORMAT_N_IN_16_AS_16_16_16_16 0x37
#define V_008F14_IMG_DATA_FORMAT_RESERVED_56 0x38
#define V_008F14_IMG_DATA_FORMAT_4_4 0x39
#define V_008F14_IMG_DATA_FORMAT_6_5_5 0x3A
#define V_008F14_IMG_DATA_FORMAT_S8_16 0x3B
#define V_008F14_IMG_DATA_FORMAT_S8_32 0x3C
#define V_008F14_IMG_DATA_FORMAT_8_AS_32 0x3D
#define V_008F14_IMG_DATA_FORMAT_8_AS_32_32 0x3E
#define V_008F14_IMG_DATA_FORMAT_32_AS_32_32_32_32 0x3F
-#define S_008F14_NUM_FORMAT_GFX9(x) (((unsigned)(x) & 0x0F) << 26)
-#define G_008F14_NUM_FORMAT_GFX9(x) (((x) >> 26) & 0x0F)
-#define C_008F14_NUM_FORMAT_GFX9 0xC3FFFFFF
+#define S_008F14_NUM_FORMAT(x) (((unsigned)(x) & 0x0F) << 26)
+#define G_008F14_NUM_FORMAT(x) (((x) >> 26) & 0x0F)
+#define C_008F14_NUM_FORMAT 0xC3FFFFFF
#define V_008F14_IMG_NUM_FORMAT_UNORM 0x00
#define V_008F14_IMG_NUM_FORMAT_SNORM 0x01
#define V_008F14_IMG_NUM_FORMAT_USCALED 0x02
#define V_008F14_IMG_NUM_FORMAT_SSCALED 0x03
#define V_008F14_IMG_NUM_FORMAT_UINT 0x04
#define V_008F14_IMG_NUM_FORMAT_SINT 0x05
#define V_008F14_IMG_NUM_FORMAT_RESERVED_6 0x06
#define V_008F14_IMG_NUM_FORMAT_FLOAT 0x07
#define V_008F14_IMG_NUM_FORMAT_METADATA 0x08
#define V_008F14_IMG_NUM_FORMAT_SRGB 0x09
diff --git a/src/amd/common/sid.h b/src/amd/common/sid.h
index 49683f1aa5a..a6d0bc2fe42 100644
--- a/src/amd/common/sid.h
+++ b/src/amd/common/sid.h
@@ -2120,23 +2120,23 @@
#define S_030F14_COUNT_HI(x) (((unsigned)(x) & 0x7FFFFFFF) << 0)
#define G_030F14_COUNT_HI(x) (((x) >> 0) & 0x7FFFFFFF)
#define C_030F14_COUNT_HI 0x80000000
#define R_008F14_SQ_IMG_RSRC_WORD1 0x008F14
#define S_008F14_BASE_ADDRESS_HI(x) (((unsigned)(x) & 0xFF) << 0)
#define G_008F14_BASE_ADDRESS_HI(x) (((x) >> 0) & 0xFF)
#define C_008F14_BASE_ADDRESS_HI 0xFFFFFF00
#define S_008F14_MIN_LOD(x) (((unsigned)(x) & 0xFFF) << 8)
#define G_008F14_MIN_LOD(x) (((x) >> 8) & 0xFFF)
#define C_008F14_MIN_LOD 0xFFF000FF
-#define S_008F14_DATA_FORMAT_GFX6(x) (((unsigned)(x) & 0x3F) << 20)
-#define G_008F14_DATA_FORMAT_GFX6(x) (((x) >> 20) & 0x3F)
-#define C_008F14_DATA_FORMAT_GFX6 0xFC0FFFFF
+#define S_008F14_DATA_FORMAT(x) (((unsigned)(x) & 0x3F) << 20)
+#define G_008F14_DATA_FORMAT(x) (((x) >> 20) & 0x3F)
+#define C_008F14_DATA_FORMAT 0xFC0FFFFF
#define V_008F14_IMG_DATA_FORMAT_INVALID 0x00
#define V_008F14_IMG_DATA_FORMAT_8 0x01
#define V_008F14_IMG_DATA_FORMAT_16 0x02
#define V_008F14_IMG_DATA_FORMAT_8_8 0x03
#define V_008F14_IMG_DATA_FORMAT_32 0x04
#define V_008F14_IMG_DATA_FORMAT_16_16 0x05
#define V_008F14_IMG_DATA_FORMAT_10_11_11 0x06
#define V_008F14_IMG_DATA_FORMAT_11_11_10 0x07
#define V_008F14_IMG_DATA_FORMAT_10_10_10_2 0x08
#define V_008F14_IMG_DATA_FORMAT_2_10_10_10 0x09
@@ -2187,23 +2187,23 @@
#define V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8 0x36
#define V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F4 0x37
#define V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F8 0x38
#define V_008F14_IMG_DATA_FORMAT_4_4 0x39
#define V_008F14_IMG_DATA_FORMAT_6_5_5 0x3A
#define V_008F14_IMG_DATA_FORMAT_1 0x3B
#define V_008F14_IMG_DATA_FORMAT_1_REVERSED 0x3C
#define V_008F14_IMG_DATA_FORMAT_32_AS_8 0x3D /* not on stoney */
#define V_008F14_IMG_DATA_FORMAT_32_AS_8_8 0x3E /* not on stoney */
#define V_008F14_IMG_DATA_FORMAT_32_AS_32_32_32_32 0x3F
-#define S_008F14_NUM_FORMAT_GFX6(x) (((unsigned)(x) & 0x0F) << 26)
-#define G_008F14_NUM_FORMAT_GFX6(x) (((x) >> 26) & 0x0F)
-#define C_008F14_NUM_FORMAT_GFX6 0xC3FFFFFF
+#define S_008F14_NUM_FORMAT(x) (((unsigned)(x) & 0x0F) << 26)
+#define G_008F14_NUM_FORMAT(x) (((x) >> 26) & 0x0F)
+#define C_008F14_NUM_FORMAT 0xC3FFFFFF
#define V_008F14_IMG_NUM_FORMAT_UNORM 0x00
#define V_008F14_IMG_NUM_FORMAT_SNORM 0x01
#define V_008F14_IMG_NUM_FORMAT_USCALED 0x02
#define V_008F14_IMG_NUM_FORMAT_SSCALED 0x03
#define V_008F14_IMG_NUM_FORMAT_UINT 0x04
#define V_008F14_IMG_NUM_FORMAT_SINT 0x05
#define V_008F14_IMG_NUM_FORMAT_SNORM_OGL 0x06
#define V_008F14_IMG_NUM_FORMAT_FLOAT 0x07
#define V_008F14_IMG_NUM_FORMAT_RESERVED_8 0x08
#define V_008F14_IMG_NUM_FORMAT_SRGB 0x09
diff --git a/src/amd/vulkan/radv_image.c b/src/amd/vulkan/radv_image.c
index 090ca70a327..94cde4d19f2 100644
--- a/src/amd/vulkan/radv_image.c
+++ b/src/amd/vulkan/radv_image.c
@@ -528,22 +528,22 @@ si_make_texture_descriptor(struct radv_device *device,
height = 1;
depth = image->info.array_size;
} else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY ||
type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
if (view_type != VK_IMAGE_VIEW_TYPE_3D)
depth = image->info.array_size;
} else if (type == V_008F1C_SQ_RSRC_IMG_CUBE)
depth = image->info.array_size / 6;

state[0] = 0;
- state[1] = (S_008F14_DATA_FORMAT_GFX6(data_format) |
- S_008F14_NUM_FORMAT_GFX6(num_format));
+ state[1] = (S_008F14_DATA_FORMAT(data_format) |
+ S_008F14_NUM_FORMAT(num_format));
state[2] = (S_008F18_WIDTH(width - 1) |
S_008F18_HEIGHT(height - 1) |
S_008F18_PERF_MOD(4));
state[3] = (S_008F1C_DST_SEL_X(radv_map_swizzle(swizzle[0])) |
S_008F1C_DST_SEL_Y(radv_map_swizzle(swizzle[1])) |
S_008F1C_DST_SEL_Z(radv_map_swizzle(swizzle[2])) |
S_008F1C_DST_SEL_W(radv_map_swizzle(swizzle[3])) |
S_008F1C_BASE_LEVEL(image->info.samples > 1 ?
0 : first_level) |
S_008F1C_LAST_LEVEL(image->info.samples > 1 ?
@@ -628,22 +628,22 @@ si_make_texture_descriptor(struct radv_device *device,
default:
assert(0);
fmask_format = V_008F14_IMG_DATA_FORMAT_INVALID;
}
num_format = V_008F14_IMG_NUM_FORMAT_UINT;
}

fmask_state[0] = va >> 8;
fmask_state[0] |= image->fmask.tile_swizzle;
fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) |
- S_008F14_DATA_FORMAT_GFX6(fmask_format) |
- S_008F14_NUM_FORMAT_GFX6(num_format);
+ S_008F14_DATA_FORMAT(fmask_format) |
+ S_008F14_NUM_FORMAT(num_format);
fmask_state[2] = S_008F18_WIDTH(width - 1) |
S_008F18_HEIGHT(height - 1);
fmask_state[3] = S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) |
S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) |
S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) |
S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) |
S_008F1C_TYPE(radv_tex_dim(image->type, view_type, image->info.array_size, 0, false, false));
fmask_state[4] = 0;
fmask_state[5] = S_008F24_BASE_ARRAY(first_layer);
fmask_state[6] = 0;
diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
index 6decedc4cce..1cb0f9d1c60 100644
--- a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
+++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
@@ -1223,24 +1223,24 @@ si_lower_gather4_integer(struct si_shader_context *ctx,
LLVMConstInt(ctx->i32, 20, false), "");
data_format = LLVMBuildAnd(builder, data_format,
LLVMConstInt(ctx->i32, (1u << 6) - 1, false), "");
wa_8888 = LLVMBuildICmp(
builder, LLVMIntEQ, data_format,
LLVMConstInt(ctx->i32, V_008F14_IMG_DATA_FORMAT_8_8_8_8, false),
"");

uint32_t wa_num_format =
return_type == TGSI_RETURN_TYPE_UINT ?
- S_008F14_NUM_FORMAT_GFX6(V_008F14_IMG_NUM_FORMAT_USCALED) :
- S_008F14_NUM_FORMAT_GFX6(V_008F14_IMG_NUM_FORMAT_SSCALED);
+ S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_USCALED) :
+ S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_SSCALED);
wa_formats = LLVMBuildAnd(builder, formats,
- LLVMConstInt(ctx->i32, C_008F14_NUM_FORMAT_GFX6, false),
+ LLVMConstInt(ctx->i32, C_008F14_NUM_FORMAT, false),
"");
wa_formats = LLVMBuildOr(builder, wa_formats,
LLVMConstInt(ctx->i32, wa_num_format, false), "");

formats = LLVMBuildSelect(builder, wa_8888, wa_formats, formats, "");
args->resource = LLVMBuildInsertElement(
builder, args->resource, formats, ctx->i32_1, "");
}

if (target == TGSI_TEXTURE_RECT ||
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 41aa4ef3336..0960f379c4f 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -3821,22 +3821,22 @@ si_make_texture_descriptor(struct si_screen *screen,
height = 1;
depth = res->array_size;
} else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY ||
type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
if (sampler || res->target != PIPE_TEXTURE_3D)
depth = res->array_size;
} else if (type == V_008F1C_SQ_RSRC_IMG_CUBE)
depth = res->array_size / 6;

state[0] = 0;
- state[1] = (S_008F14_DATA_FORMAT_GFX6(data_format) |
- S_008F14_NUM_FORMAT_GFX6(num_format));
+ state[1] = (S_008F14_DATA_FORMAT(data_format) |
+ S_008F14_NUM_FORMAT(num_format));
state[2] = (S_008F18_WIDTH(width - 1) |
S_008F18_HEIGHT(height - 1) |
S_008F18_PERF_MOD(4));
state[3] = (S_008F1C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
S_008F1C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
S_008F1C_BASE_LEVEL(num_samples > 1 ? 0 : first_level) |
S_008F1C_LAST_LEVEL(num_samples > 1 ?
util_logbase2(num_samples) :
@@ -3977,22 +3977,22 @@ si_make_texture_descriptor(struct si_screen *screen,
break;
default:
unreachable("invalid nr_samples");
}
num_format = V_008F14_IMG_NUM_FORMAT_UINT;
}
#undef FMASK

fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle;
fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) |
- S_008F14_DATA_FORMAT_GFX6(data_format) |
- S_008F14_NUM_FORMAT_GFX6(num_format);
+ S_008F14_DATA_FORMAT(data_format) |
+ S_008F14_NUM_FORMAT(num_format);
fmask_state[2] = S_008F18_WIDTH(width - 1) |
S_008F18_HEIGHT(height - 1);
fmask_state[3] = S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) |
S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) |
S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) |
S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) |
S_008F1C_TYPE(si_tex_dim(screen, tex, target, 0));
fmask_state[4] = 0;
fmask_state[5] = S_008F24_BASE_ARRAY(first_layer);
fmask_state[6] = 0;
@@ -4154,21 +4154,21 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
state->u.tex.first_level,
state->format);

si_make_texture_descriptor(sctx->screen, tex, true,
state->target, pipe_format, state_swizzle,
first_level, last_level,
state->u.tex.first_layer, last_layer,
width, height, depth,
view->state, view->fmask_state);

- unsigned num_format = G_008F14_NUM_FORMAT_GFX6(view->state[1]);
+ unsigned num_format = G_008F14_NUM_FORMAT(view->state[1]);
view->is_integer =
num_format == V_008F14_IMG_NUM_FORMAT_USCALED ||
num_format == V_008F14_IMG_NUM_FORMAT_SSCALED ||
num_format == V_008F14_IMG_NUM_FORMAT_UINT ||
num_format == V_008F14_IMG_NUM_FORMAT_SINT;
view->base_level_info = &surflevel[base_level];
view->base_level = base_level;
view->block_width = util_format_get_blockwidth(pipe_format);
return &view->base;
}
--
2.19.1
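As a quick illustration of what the rename touches: the S_/G_/C_ macros
respectively set, get, and clear a register field, and the C_* complement
mask is what lower_gather4_integer ANDs with before ORing in a new value.
A minimal sketch with hypothetical values (not part of the patch):

    unsigned word1 = 0;
    /* pack a value into the NUM_FORMAT field of SQ_IMG_RSRC_WORD1 */
    word1 |= S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_UINT);
    /* extract the field again */
    unsigned num_format = G_008F14_NUM_FORMAT(word1);
    /* clear the field with the complement mask, then replace it */
    word1 = (word1 & C_008F14_NUM_FORMAT) |
            S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_SRGB);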
Nicolai Hähnle
2018-12-06 14:00:22 UTC
From: Nicolai Hähnle <***@amd.com>

---
src/gallium/drivers/r600/sb/sb_ir.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/r600/sb/sb_ir.h b/src/gallium/drivers/r600/sb/sb_ir.h
index c7a94fcb930..ef0fbd4e68f 100644
--- a/src/gallium/drivers/r600/sb/sb_ir.h
+++ b/src/gallium/drivers/r600/sb/sb_ir.h
@@ -1005,21 +1005,21 @@ public:
virtual bool fold_dispatch(expr_handler *ex);

void jump(cf_node *c) { jump_target = c; jump_after_target = false; }
void jump_after(cf_node *c) { jump_target = c; jump_after_target = true; }

friend class shader;
};

class alu_node : public node {
protected:
- alu_node() : node(NT_OP, NST_ALU_INST) { memset(&bc, 0, sizeof(bc_alu)); };
+ alu_node() : node(NT_OP, NST_ALU_INST) { memset(&bc, 0, sizeof(bc_alu)); }
public:
bc_alu bc;

virtual bool is_valid() { return subtype == NST_ALU_INST; }
virtual bool accept(vpass &p, bool enter);
virtual bool fold_dispatch(expr_handler *ex);

unsigned forced_bank_swizzle() {
return ((bc.op_ptr->flags & AF_INTERP) && (bc.slot_flags == AF_4V)) ?
VEC_210 : 0;
--
2.19.1
Nicolai Hähnle
2018-12-06 14:00:23 UTC
From: Nicolai Hähnle <***@amd.com>

This happened to bite me while doing some experiments.
---
src/amd/common/sid_tables.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/amd/common/sid_tables.py b/src/amd/common/sid_tables.py
index 7b5e626e3e1..f12bed4b209 100644
--- a/src/amd/common/sid_tables.py
+++ b/src/amd/common/sid_tables.py
@@ -1,11 +1,11 @@
-from __future__ import print_function
+from __future__ import print_function, division, unicode_literals

CopyRight = '''
/*
* Copyright 2015 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
--
2.19.1
Nicolai Hähnle
2018-12-06 14:00:26 UTC
From: Nicolai Hähnle <***@amd.com>

---
src/amd/common/ac_llvm_build.c | 7 +++----
src/amd/common/ac_llvm_build.h | 1 +
2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index fba90205a2e..68c8bad9e83 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -2861,48 +2861,47 @@ void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)

assert(current_loop->loop_entry_block);

emit_default_branch(ctx->builder, current_loop->loop_entry_block);

LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
set_basicblock_name(current_loop->next_block, "endloop", label_id);
ctx->flow_depth--;
}

-static void if_cond_emit(struct ac_llvm_context *ctx, LLVMValueRef cond,
- int label_id)
+void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
{
struct ac_llvm_flow *flow = push_flow(ctx);
LLVMBasicBlockRef if_block;

if_block = append_basic_block(ctx, "IF");
flow->next_block = append_basic_block(ctx, "ELSE");
set_basicblock_name(if_block, "if", label_id);
LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
LLVMPositionBuilderAtEnd(ctx->builder, if_block);
}

void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value,
int label_id)
{
LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
value, ctx->f32_0, "");
- if_cond_emit(ctx, cond, label_id);
+ ac_build_ifcc(ctx, cond, label_id);
}

void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value,
int label_id)
{
LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE,
ac_to_integer(ctx, value),
ctx->i32_0, "");
- if_cond_emit(ctx, cond, label_id);
+ ac_build_ifcc(ctx, cond, label_id);
}

LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type,
const char *name)
{
LLVMBuilderRef builder = ac->builder;
LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index e90c8c21ad4..cf3e3cedf65 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -475,20 +475,21 @@ LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,

LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type);
LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type);

void ac_build_bgnloop(struct ac_llvm_context *ctx, int lable_id);
void ac_build_break(struct ac_llvm_context *ctx);
void ac_build_continue(struct ac_llvm_context *ctx);
void ac_build_else(struct ac_llvm_context *ctx, int lable_id);
void ac_build_endif(struct ac_llvm_context *ctx, int lable_id);
void ac_build_endloop(struct ac_llvm_context *ctx, int lable_id);
+void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id);
void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value,
int lable_id);
void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value,
int lable_id);

LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type,
const char *name);
LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type,
const char *name);
--
2.19.1
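The newly exported helper pairs with the existing ac_build_endif, so a
caller can branch on an integer condition directly instead of going
through ac_build_if's float compare. A minimal sketch (ctx and tid are
assumed to come from the surrounding compiler code; the label value is
arbitrary and only needs to be unique):

    /* emit "if (tid == 0) { ... }" */
    LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                                      tid, ctx->i32_0, "");
    ac_build_ifcc(ctx, cond, 4000);
    /* ... single-lane work ... */
    ac_build_endif(ctx, 4000);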
Nicolai Hähnle
2018-12-06 14:00:25 UTC
From: Nicolai Hähnle <***@amd.com>

---
src/amd/common/ac_llvm_build.c | 18 ++++++++----------
1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index abc18da13db..fba90205a2e 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -3374,40 +3374,38 @@ ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValu
tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
return result;
}

LLVMValueRef
ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
ac_build_optimization_barrier(ctx, &src);
LLVMValueRef result;
- LLVMValueRef identity = get_reduction_identity(ctx, op,
- ac_get_type_size(LLVMTypeOf(src)));
- result = LLVMBuildBitCast(ctx->builder,
- ac_build_set_inactive(ctx, src, identity),
- LLVMTypeOf(identity), "");
+ LLVMValueRef identity =
+ get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
+ result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
+ LLVMTypeOf(identity), "");
result = ac_build_scan(ctx, op, result, identity);

return ac_build_wwm(ctx, result);
}

LLVMValueRef
ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
ac_build_optimization_barrier(ctx, &src);
LLVMValueRef result;
- LLVMValueRef identity = get_reduction_identity(ctx, op,
- ac_get_type_size(LLVMTypeOf(src)));
- result = LLVMBuildBitCast(ctx->builder,
- ac_build_set_inactive(ctx, src, identity),
- LLVMTypeOf(identity), "");
+ LLVMValueRef identity =
+ get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
+ result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
+ LLVMTypeOf(identity), "");
result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, false);
result = ac_build_scan(ctx, op, result, identity);

return ac_build_wwm(ctx, result);
}

LLVMValueRef
ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size)
{
if (cluster_size == 1) return src;
--
2.19.1
Nicolai Hähnle
2018-12-06 14:00:27 UTC
From: Nicolai Hähnle <***@amd.com>

Order-aware scan/reduce can trade off LDS traffic for external atomic
memory traffic in producer/consumer compute shaders.
---
src/amd/common/ac_llvm_build.c | 195 ++++++++++++++++++++++++++++++++-
src/amd/common/ac_llvm_build.h | 36 ++++++
2 files changed, 227 insertions(+), 4 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 68c8bad9e83..932f4bbdeef 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -3345,68 +3345,88 @@ ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,
_64bit ? ctx->f64 : ctx->f32,
(LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, "");
case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, "");
default:
unreachable("bad reduction intrinsic");
}
}

-/* TODO: add inclusive and excluse scan functions for SI chip class. */
+/**
+ * \param maxprefix specifies that the result only needs to be correct for a
+ * prefix of this many threads
+ *
+ * TODO: add inclusive and exclusive scan functions for SI chip class.
+ */
static LLVMValueRef
-ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity)
+ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity,
+ unsigned maxprefix)
{
LLVMValueRef result, tmp;
result = src;
+ if (maxprefix <= 1)
+ return result;
tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 2)
+ return result;
tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 3)
+ return result;
tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 4)
+ return result;
tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 8)
+ return result;
tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 16)
+ return result;
tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 32)
+ return result;
tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
return result;
}

LLVMValueRef
ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
ac_build_optimization_barrier(ctx, &src);
LLVMValueRef result;
LLVMValueRef identity =
get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
LLVMTypeOf(identity), "");
- result = ac_build_scan(ctx, op, result, identity);
+ result = ac_build_scan(ctx, op, result, identity, 64);

return ac_build_wwm(ctx, result);
}

LLVMValueRef
ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
ac_build_optimization_barrier(ctx, &src);
LLVMValueRef result;
LLVMValueRef identity =
get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
LLVMTypeOf(identity), "");
result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, false);
- result = ac_build_scan(ctx, op, result, identity);
+ result = ac_build_scan(ctx, op, result, identity, 64);

return ac_build_wwm(ctx, result);
}

LLVMValueRef
ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size)
{
if (cluster_size == 1) return src;
ac_build_optimization_barrier(ctx, &src);
LLVMValueRef result, swap;
@@ -3450,20 +3470,187 @@ ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsign
result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
return ac_build_wwm(ctx, result);
} else {
swap = ac_build_readlane(ctx, result, ctx->i32_0);
result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
result = ac_build_alu_op(ctx, result, swap, op);
return ac_build_wwm(ctx, result);
}
}

+/**
+ * "Top half" of a scan that reduces per-wave values across an entire
+ * workgroup.
+ *
+ * The source value must be present in the highest lane of the wave, and the
+ * highest lane must be live.
+ */
+void
+ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ if (ws->maxwaves <= 1)
+ return;
+
+ const LLVMValueRef i32_63 = LLVMConstInt(ctx->i32, 63, false);
+ LLVMBuilderRef builder = ctx->builder;
+ LLVMValueRef tid = ac_get_thread_id(ctx);
+ LLVMValueRef tmp;
+
+ tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, i32_63, "");
+ ac_build_ifcc(ctx, tmp, 1000);
+ LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, ""));
+ ac_build_endif(ctx, 1000);
+}
+
+/**
+ * "Bottom half" of a scan that reduces per-wave values across an entire
+ * workgroup.
+ *
+ * The caller must place a barrier between the top and bottom halves.
+ */
+void
+ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ const LLVMTypeRef type = LLVMTypeOf(ws->src);
+ const LLVMValueRef identity =
+ get_reduction_identity(ctx, ws->op, ac_get_type_size(type));
+
+ if (ws->maxwaves <= 1) {
+ ws->result_reduce = ws->src;
+ ws->result_inclusive = ws->src;
+ ws->result_exclusive = identity;
+ return;
+ }
+ assert(ws->maxwaves <= 32);
+
+ LLVMBuilderRef builder = ctx->builder;
+ LLVMValueRef tid = ac_get_thread_id(ctx);
+ LLVMBasicBlockRef bbs[2];
+ LLVMValueRef phivalues_scan[2];
+ LLVMValueRef tmp, tmp2;
+
+ bbs[0] = LLVMGetInsertBlock(builder);
+ phivalues_scan[0] = LLVMGetUndef(type);
+
+ if (ws->enable_reduce)
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
+ else if (ws->enable_inclusive)
+ tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
+ else
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
+ ac_build_ifcc(ctx, tmp, 1001);
+ {
+ tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), "");
+
+ ac_build_optimization_barrier(ctx, &tmp);
+
+ bbs[1] = LLVMGetInsertBlock(builder);
+ phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves);
+ }
+ ac_build_endif(ctx, 1001);
+
+ const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);
+
+ if (ws->enable_reduce) {
+ tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
+ ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
+ }
+ if (ws->enable_inclusive)
+ ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
+ if (ws->enable_exclusive) {
+ tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
+ tmp = ac_build_readlane(ctx, scan, tmp);
+ tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
+ ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
+ }
+}
+
+/**
+ * Inclusive scan of a per-wave value across an entire workgroup.
+ *
+ * This implies an s_barrier instruction.
+ *
+ * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
+ * of the workgroup are live. (This requirement cannot easily be relaxed in a
+ * useful manner because of the barrier in the algorithm.)
+ */
+void
+ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ ac_build_wg_wavescan_top(ctx, ws);
+ ac_build_s_barrier(ctx);
+ ac_build_wg_wavescan_bottom(ctx, ws);
+}
+
+/**
+ * "Top half" of a scan that reduces per-thread values across an entire
+ * workgroup.
+ *
+ * All lanes must be active when this code runs.
+ */
+void
+ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ if (ws->enable_exclusive) {
+ ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
+ ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
+ } else {
+ ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
+ }
+
+ bool enable_inclusive = ws->enable_inclusive;
+ bool enable_exclusive = ws->enable_exclusive;
+ ws->enable_inclusive = false;
+ ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
+ ac_build_wg_wavescan_top(ctx, ws);
+ ws->enable_inclusive = enable_inclusive;
+ ws->enable_exclusive = enable_exclusive;
+}
+
+/**
+ * "Bottom half" of a scan that reduces per-thread values across an entire
+ * workgroup.
+ *
+ * The caller must place a barrier between the top and bottom halves.
+ */
+void
+ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ bool enable_inclusive = ws->enable_inclusive;
+ bool enable_exclusive = ws->enable_exclusive;
+ ws->enable_inclusive = false;
+ ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
+ ac_build_wg_wavescan_bottom(ctx, ws);
+ ws->enable_inclusive = enable_inclusive;
+ ws->enable_exclusive = enable_exclusive;
+
+ /* ws->result_reduce is already the correct value */
+ if (ws->enable_inclusive)
+ ws->result_inclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->src, ws->op);
+ if (ws->enable_exclusive)
+ ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
+}
+
+/**
+ * A scan that reduces per-thread values across an entire workgroup.
+ *
+ * The caller must ensure that all lanes are active when this code runs
+ * (WWM is insufficient!), because there is an implied barrier.
+ */
+void
+ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ ac_build_wg_scan_top(ctx, ws);
+ ac_build_s_barrier(ctx);
+ ac_build_wg_scan_bottom(ctx, ws);
+}
+
LLVMValueRef
ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
{
unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
if (ctx->chip_class >= VI) {
return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
} else {
return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
}
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index cf3e3cedf65..cad131768d2 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -519,20 +519,56 @@ ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask);

LLVMValueRef
ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op);

LLVMValueRef
ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op);

LLVMValueRef
ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size);

+/**
+ * Common arguments for a scan/reduce operation that accumulates per-wave
+ * values across an entire workgroup, while respecting the order of waves.
+ */
+struct ac_wg_scan {
+ bool enable_reduce;
+ bool enable_exclusive;
+ bool enable_inclusive;
+ nir_op op;
+ LLVMValueRef src; /* clobbered! */
+ LLVMValueRef result_reduce;
+ LLVMValueRef result_exclusive;
+ LLVMValueRef result_inclusive;
+ LLVMValueRef extra;
+ LLVMValueRef waveidx;
+ LLVMValueRef numwaves; /* only needed for "reduce" operations */
+
+ /* T addrspace(LDS) pointer to the same type as value, at least maxwaves entries */
+ LLVMValueRef scratch;
+ unsigned maxwaves;
+};
+
+void
+ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws);
+void
+ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws);
+void
+ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws);
+
+void
+ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws);
+void
+ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws);
+void
+ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws);
+
LLVMValueRef
ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3);

LLVMValueRef
ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index);

#ifdef __cplusplus
}
#endif
--
2.19.1
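To make the intended use concrete, here is a hedged sketch of how a
backend might drive the new interface, e.g. to compute ordered output
offsets for an append-style producer/consumer shader. All inputs (ctx,
per_thread_count, wave_id, num_waves, lds_scratch) are assumed to be
provided by the surrounding compiler code; this is not code from the
patch:

    struct ac_wg_scan ws = {0};
    ws.op = nir_op_iadd;            /* sum 32-bit per-thread counts */
    ws.enable_exclusive = true;     /* each thread wants its write offset */
    ws.enable_reduce = true;        /* and the workgroup-wide total */
    ws.src = per_thread_count;      /* LLVMValueRef; clobbered by the scan */
    ws.waveidx = wave_id;           /* i32 index of this wave in the group */
    ws.numwaves = num_waves;        /* needed because enable_reduce is set */
    ws.scratch = lds_scratch;       /* LDS pointer, >= maxwaves entries */
    ws.maxwaves = 8;                /* e.g. 512-thread workgroups, wave64 */

    ac_build_wg_scan(ctx, &ws);     /* all lanes active; implies s_barrier */

    LLVMValueRef offset = ws.result_exclusive; /* ordered write position */
    LLVMValueRef total  = ws.result_reduce;    /* workgroup-wide sum */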
Connor Abbott
2018-12-06 14:20:19 UTC
Is this going to be used by an extension? If you don't have a use for
it yet, it would probably be better to wait.
Post by Nicolai Hähnle
Order-aware scan/reduce can trade off LDS traffic for external atomic
memory traffic in producer/consumer compute shaders.
Haehnle, Nicolai
2018-12-07 14:32:24 UTC
Post by Connor Abbott
Is this going to be used by an extension? If you don't have a use for
it yet, it would probably be better to wait.
Well, I have been using it quite extensively in a branch I've been
working on, but that's not quite ready yet.

Cheers,
Nicolai
Post by Connor Abbott
Post by Nicolai Hähnle
Order-aware scan/reduce can trade-off LDS traffic for external atomics
memory traffic in producer/consumer compute shaders.
---
src/amd/common/ac_llvm_build.c | 195 ++++++++++++++++++++++++++++++++-
src/amd/common/ac_llvm_build.h | 36 ++++++
2 files changed, 227 insertions(+), 4 deletions(-)
diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 68c8bad9e83..932f4bbdeef 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -3345,68 +3345,88 @@ ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,
_64bit ? ctx->f64 : ctx->f32,
(LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, "");
case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, "");
unreachable("bad reduction intrinsic");
}
}
-/* TODO: add inclusive and excluse scan functions for SI chip class. */
+/**
+ * \param maxprefix specifies that the result only needs to be correct for a
+ * prefix of this many threads
+ *
+ * TODO: add inclusive and excluse scan functions for SI chip class.
+ */
static LLVMValueRef
-ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity)
+ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity,
+ unsigned maxprefix)
{
LLVMValueRef result, tmp;
result = src;
+ if (maxprefix <= 1)
+ return result;
tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 2)
+ return result;
tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 3)
+ return result;
tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 4)
+ return result;
tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 8)
+ return result;
tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 16)
+ return result;
tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 32)
+ return result;
tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
return result;
}
LLVMValueRef
ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
ac_build_optimization_barrier(ctx, &src);
LLVMValueRef result;
LLVMValueRef identity =
get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
LLVMTypeOf(identity), "");
- result = ac_build_scan(ctx, op, result, identity);
+ result = ac_build_scan(ctx, op, result, identity, 64);
return ac_build_wwm(ctx, result);
}
LLVMValueRef
ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
ac_build_optimization_barrier(ctx, &src);
LLVMValueRef result;
LLVMValueRef identity =
get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
LLVMTypeOf(identity), "");
result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, false);
- result = ac_build_scan(ctx, op, result, identity);
+ result = ac_build_scan(ctx, op, result, identity, 64);
return ac_build_wwm(ctx, result);
}
LLVMValueRef
ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size)
{
if (cluster_size == 1) return src;
ac_build_optimization_barrier(ctx, &src);
LLVMValueRef result, swap;
@@ -3450,20 +3470,187 @@ ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsign
result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
return ac_build_wwm(ctx, result);
} else {
swap = ac_build_readlane(ctx, result, ctx->i32_0);
result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
result = ac_build_alu_op(ctx, result, swap, op);
return ac_build_wwm(ctx, result);
}
}
+/**
+ * "Top half" of a scan that reduces per-wave values across an entire
+ * workgroup.
+ *
+ * The source value must be present in the highest lane of the wave, and the
+ * highest lane must be live.
+ */
+void
+ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ if (ws->maxwaves <= 1)
+ return;
+
+ const LLVMValueRef i32_63 = LLVMConstInt(ctx->i32, 63, false);
+ LLVMBuilderRef builder = ctx->builder;
+ LLVMValueRef tid = ac_get_thread_id(ctx);
+ LLVMValueRef tmp;
+
+ tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, i32_63, "");
+ ac_build_ifcc(ctx, tmp, 1000);
+ LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, ""));
+ ac_build_endif(ctx, 1000);
+}
+
+/**
+ * "Bottom half" of a scan that reduces per-wave values across an entire
+ * workgroup.
+ *
+ * The caller must place a barrier between the top and bottom halves.
+ */
+void
+ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ const LLVMTypeRef type = LLVMTypeOf(ws->src);
+ const LLVMValueRef identity =
+ get_reduction_identity(ctx, ws->op, ac_get_type_size(type));
+
+ if (ws->maxwaves <= 1) {
+ ws->result_reduce = ws->src;
+ ws->result_inclusive = ws->src;
+ ws->result_exclusive = identity;
+ return;
+ }
+ assert(ws->maxwaves <= 32);
+
+ LLVMBuilderRef builder = ctx->builder;
+ LLVMValueRef tid = ac_get_thread_id(ctx);
+ LLVMBasicBlockRef bbs[2];
+ LLVMValueRef phivalues_scan[2];
+ LLVMValueRef tmp, tmp2;
+
+ bbs[0] = LLVMGetInsertBlock(builder);
+ phivalues_scan[0] = LLVMGetUndef(type);
+
+ if (ws->enable_reduce)
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
+ else if (ws->enable_inclusive)
+ tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
+ else
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
+ ac_build_ifcc(ctx, tmp, 1001);
+ {
+ tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), "");
+
+ ac_build_optimization_barrier(ctx, &tmp);
+
+ bbs[1] = LLVMGetInsertBlock(builder);
+ phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves);
+ }
+ ac_build_endif(ctx, 1001);
+
+ const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);
+
+ if (ws->enable_reduce) {
+ tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
+ ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
+ }
+ if (ws->enable_inclusive)
+ ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
+ if (ws->enable_exclusive) {
+ tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
+ tmp = ac_build_readlane(ctx, scan, tmp);
+ tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
+ ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
+ }
+}
+
+/**
+ * Inclusive scan of a per-wave value across an entire workgroup.
+ *
+ * This implies an s_barrier instruction.
+ *
+ * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
+ * of the workgroup are live. (This requirement cannot easily be relaxed in a
+ * useful manner because of the barrier in the algorithm.)
+ */
+void
+ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ ac_build_wg_wavescan_top(ctx, ws);
+ ac_build_s_barrier(ctx);
+ ac_build_wg_wavescan_bottom(ctx, ws);
+}
+
+/**
+ * "Top half" of a scan that reduces per-thread values across an entire
+ * workgroup.
+ *
+ * All lanes must be active when this code runs.
+ */
+void
+ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ if (ws->enable_exclusive) {
+ ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
+ ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
+ } else {
+ ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
+ }
+
+ bool enable_inclusive = ws->enable_inclusive;
+ bool enable_exclusive = ws->enable_exclusive;
+ ws->enable_inclusive = false;
+ ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
+ ac_build_wg_wavescan_top(ctx, ws);
+ ws->enable_inclusive = enable_inclusive;
+ ws->enable_exclusive = enable_exclusive;
+}
+
+/**
+ * "Bottom half" of a scan that reduces per-thread values across an entire
+ * workgroup.
+ *
+ * The caller must place a barrier between the top and bottom halves.
+ */
+void
+ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ bool enable_inclusive = ws->enable_inclusive;
+ bool enable_exclusive = ws->enable_exclusive;
+ ws->enable_inclusive = false;
+ ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
+ ac_build_wg_wavescan_bottom(ctx, ws);
+ ws->enable_inclusive = enable_inclusive;
+ ws->enable_exclusive = enable_exclusive;
+
+ /* ws->result_reduce is already the correct value */
+ if (ws->enable_inclusive)
+ ws->result_inclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->src, ws->op);
+ if (ws->enable_exclusive)
+ ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
+}
+
+/**
+ * A scan that reduces per-thread values across an entire workgroup.
+ *
+ * The caller must ensure that all lanes are active when this code runs
+ * (WWM is insufficient!), because there is an implied barrier.
+ */
+void
+ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ ac_build_wg_scan_top(ctx, ws);
+ ac_build_s_barrier(ctx);
+ ac_build_wg_scan_bottom(ctx, ws);
+}
+
LLVMValueRef
ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
{
unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
if (ctx->chip_class >= VI) {
return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
} else {
return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
}
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index cf3e3cedf65..cad131768d2 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -519,20 +519,56 @@ ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask);
LLVMValueRef
ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op);
LLVMValueRef
ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op);
LLVMValueRef
ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size);
+/**
+ * Common arguments for a scan/reduce operation that accumulates per-wave
+ * values across an entire workgroup, while respecting the order of waves.
+ */
+struct ac_wg_scan {
+ bool enable_reduce;
+ bool enable_exclusive;
+ bool enable_inclusive;
+ nir_op op;
+ LLVMValueRef src; /* clobbered! */
+ LLVMValueRef result_reduce;
+ LLVMValueRef result_exclusive;
+ LLVMValueRef result_inclusive;
+ LLVMValueRef extra;
+ LLVMValueRef waveidx;
+ LLVMValueRef numwaves; /* only needed for "reduce" operations */
+
+	/* T addrspace(LDS) pointer to the same type as src, at least maxwaves entries */
+ LLVMValueRef scratch;
+ unsigned maxwaves;
+};
+
+void
+ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws);
+void
+ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws);
+void
+ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws);
+
+void
+ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws);
+void
+ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws);
+void
+ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws);
+
LLVMValueRef
ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3);
LLVMValueRef
ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index);
#ifdef __cplusplus
}
#endif
--
2.19.1
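
For illustration, a minimal caller-side sketch of the new interface (not
part of the patch; value, lds_scratch and wave_id stand in for LLVM
values the caller already has in hand):

	struct ac_wg_scan ws = {};
	ws.op = nir_op_iadd;
	ws.src = value;               /* one i32 per wave; clobbered */
	ws.enable_inclusive = true;
	ws.scratch = lds_scratch;     /* LDS array, at least maxwaves entries */
	ws.waveidx = wave_id;         /* this wave's index in the workgroup */
	ws.maxwaves = 32;             /* upper bound asserted by the helpers */
	ac_build_wg_wavescan(ctx, &ws);   /* emits the implied s_barrier */
	/* ws.result_inclusive now holds the ordered inclusive scan */
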
Nicolai Hähnle
2018-12-06 14:00:32 UTC
Permalink
From: Nicolai Hähnle <***@amd.com>

---
src/gallium/drivers/radeonsi/si_pipe.c | 4 +--
src/gallium/drivers/radeonsi/si_state.c | 2 --
src/gallium/drivers/radeonsi/si_state.h | 10 +------
src/gallium/drivers/radeonsi/si_state_draw.c | 28 +++++++++++++-------
4 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 7943af4d86e..fd8ff5fa202 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -494,44 +494,44 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
ws->buffer_map(sctx->border_color_buffer->buf,
NULL, PIPE_TRANSFER_WRITE);
if (!sctx->border_color_map)
goto fail;

si_init_all_descriptors(sctx);
si_init_fence_functions(sctx);
si_init_state_functions(sctx);
si_init_shader_functions(sctx);
si_init_viewport_functions(sctx);
- si_init_ia_multi_vgt_param_table(sctx);

if (sctx->chip_class >= CIK)
cik_init_sdma_functions(sctx);
else
si_init_dma_functions(sctx);

if (sscreen->debug_flags & DBG(FORCE_DMA))
sctx->b.resource_copy_region = sctx->dma_copy;

bool dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU;
sctx->cs_clear_buffer = si_create_dma_compute_shader(&sctx->b,
SI_COMPUTE_CLEAR_DW_PER_THREAD,
dst_stream_policy, false);
sctx->cs_copy_buffer = si_create_dma_compute_shader(&sctx->b,
SI_COMPUTE_COPY_DW_PER_THREAD,
dst_stream_policy, true);

sctx->blitter = util_blitter_create(&sctx->b);
if (sctx->blitter == NULL)
goto fail;
- sctx->blitter->draw_rectangle = si_draw_rectangle;
sctx->blitter->skip_viewport_restore = true;

+ si_init_draw_functions(sctx);
+
sctx->sample_mask = 0xffff;

if (sctx->chip_class >= GFX9) {
sctx->wait_mem_scratch = r600_resource(
pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 4));
if (!sctx->wait_mem_scratch)
goto fail;

/* Initialize the memory. */
struct radeon_cmdbuf *cs = sctx->gfx_cs;
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 0960f379c4f..86d7b3a16f9 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -4818,22 +4818,20 @@ void si_init_state_functions(struct si_context *sctx)
sctx->b.delete_vertex_elements_state = si_delete_vertex_element;
sctx->b.set_vertex_buffers = si_set_vertex_buffers;

sctx->b.texture_barrier = si_texture_barrier;
sctx->b.memory_barrier = si_memory_barrier;
sctx->b.set_min_samples = si_set_min_samples;
sctx->b.set_tess_state = si_set_tess_state;

sctx->b.set_active_query_state = si_set_active_query_state;

- sctx->b.draw_vbo = si_draw_vbo;
-
si_init_config(sctx);
}

void si_init_screen_state_functions(struct si_screen *sscreen)
{
sscreen->b.is_format_supported = si_is_format_supported;
}

static void si_set_grbm_gfx_index(struct si_context *sctx,
struct si_pm4_state *pm4, unsigned value)
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 83589e6918c..bb186f530f0 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -534,31 +534,23 @@ bool si_init_shader_cache(struct si_screen *sscreen);
void si_destroy_shader_cache(struct si_screen *sscreen);
void si_schedule_initial_compile(struct si_context *sctx, unsigned processor,
struct util_queue_fence *ready_fence,
struct si_compiler_ctx_state *compiler_ctx_state,
void *job, util_queue_execute_func execute);
void si_get_active_slot_masks(const struct tgsi_shader_info *info,
uint32_t *const_and_shader_buffers,
uint64_t *samplers_and_images);

/* si_state_draw.c */
-void si_init_ia_multi_vgt_param_table(struct si_context *sctx);
void si_emit_cache_flush(struct si_context *sctx);
-void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo);
-void si_draw_rectangle(struct blitter_context *blitter,
- void *vertex_elements_cso,
- blitter_get_vs_func get_vs,
- int x1, int y1, int x2, int y2,
- float depth, unsigned num_instances,
- enum blitter_attrib_type type,
- const union blitter_attrib *attrib);
void si_trace_emit(struct si_context *sctx);
+void si_init_draw_functions(struct si_context *sctx);

/* si_state_msaa.c */
void si_init_msaa_functions(struct si_context *sctx);
void si_emit_sample_locations(struct radeon_cmdbuf *cs, int nr_samples);

/* si_state_streamout.c */
void si_streamout_buffers_dirty(struct si_context *sctx);
void si_emit_streamout_end(struct si_context *sctx);
void si_update_prims_generated_query_state(struct si_context *sctx,
unsigned type, int diff);
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 612ca910cb9..254f9edeb75 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -448,21 +448,21 @@ si_get_init_multi_vgt_param(struct si_screen *sscreen,
S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) |
S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) |
S_028AA8_WD_SWITCH_ON_EOP(sscreen->info.chip_class >= CIK ? wd_switch_on_eop : 0) |
/* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */
S_028AA8_MAX_PRIMGRP_IN_WAVE(sscreen->info.chip_class == VI ?
max_primgroup_in_wave : 0) |
S_030960_EN_INST_OPT_BASIC(sscreen->info.chip_class >= GFX9) |
S_030960_EN_INST_OPT_ADV(sscreen->info.chip_class >= GFX9);
}

-void si_init_ia_multi_vgt_param_table(struct si_context *sctx)
+static void si_init_ia_multi_vgt_param_table(struct si_context *sctx)
{
for (int prim = 0; prim <= SI_PRIM_RECTANGLE_LIST; prim++)
for (int uses_instancing = 0; uses_instancing < 2; uses_instancing++)
for (int multi_instances = 0; multi_instances < 2; multi_instances++)
for (int primitive_restart = 0; primitive_restart < 2; primitive_restart++)
for (int count_from_so = 0; count_from_so < 2; count_from_so++)
for (int line_stipple = 0; line_stipple < 2; line_stipple++)
for (int uses_tess = 0; uses_tess < 2; uses_tess++)
for (int tess_uses_primid = 0; tess_uses_primid < 2; tess_uses_primid++)
for (int uses_gs = 0; uses_gs < 2; uses_gs++) {
@@ -1241,21 +1241,21 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
(context_roll || sctx->context_roll_counter)) {
sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
sctx->atoms.s.scissors.emit(sctx);
}

/* Emit draw states. */
si_emit_vs_state(sctx, info);
si_emit_draw_registers(sctx, info, num_patches);
}

-void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
+static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
{
struct si_context *sctx = (struct si_context *)ctx;
struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
struct pipe_resource *indexbuf = info->index.resource;
unsigned dirty_tex_counter;
enum pipe_prim_type rast_prim;
unsigned index_size = info->index_size;
unsigned index_offset = info->indirect ? info->start * index_size : 0;

if (likely(!info->indirect)) {
@@ -1521,27 +1521,28 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
sctx->num_mrt_draw_calls++;
if (info->primitive_restart)
sctx->num_prim_restart_calls++;
if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size))
sctx->num_spill_draw_calls++;
}
if (index_size && indexbuf != info->index.resource)
pipe_resource_reference(&indexbuf, NULL);
}

-void si_draw_rectangle(struct blitter_context *blitter,
- void *vertex_elements_cso,
- blitter_get_vs_func get_vs,
- int x1, int y1, int x2, int y2,
- float depth, unsigned num_instances,
- enum blitter_attrib_type type,
- const union blitter_attrib *attrib)
+static void
+si_draw_rectangle(struct blitter_context *blitter,
+ void *vertex_elements_cso,
+ blitter_get_vs_func get_vs,
+ int x1, int y1, int x2, int y2,
+ float depth, unsigned num_instances,
+ enum blitter_attrib_type type,
+ const union blitter_attrib *attrib)
{
struct pipe_context *pipe = util_blitter_get_pipe(blitter);
struct si_context *sctx = (struct si_context*)pipe;

/* Pack position coordinates as signed int16. */
sctx->vs_blit_sh_data[0] = (uint32_t)(x1 & 0xffff) |
((uint32_t)(y1 & 0xffff) << 16);
sctx->vs_blit_sh_data[1] = (uint32_t)(x2 & 0xffff) |
((uint32_t)(y2 & 0xffff) << 16);
sctx->vs_blit_sh_data[2] = fui(depth);
@@ -1585,10 +1586,19 @@ void si_trace_emit(struct si_context *sctx)
S_370_ENGINE_SEL(V_370_ME));
radeon_emit(cs, va);
radeon_emit(cs, va >> 32);
radeon_emit(cs, trace_id);
radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
radeon_emit(cs, AC_ENCODE_TRACE_POINT(trace_id));

if (sctx->log)
u_log_flush(sctx->log);
}
+
+void si_init_draw_functions(struct si_context *sctx)
+{
+ sctx->b.draw_vbo = si_draw_vbo;
+
+ sctx->blitter->draw_rectangle = si_draw_rectangle;
+
+ si_init_ia_multi_vgt_param_table(sctx);
+}
--
2.19.1
Nicolai Hähnle
2018-12-06 14:00:30 UTC
Permalink
From: Nicolai Hähnle <***@amd.com>

Applying SI_FORCE_FAMILY in the winsys, before addrlib is created,
helps some debugging cases by initializing addrlib with the forced
family's settings instead of the real GPU's.
---
src/gallium/drivers/radeonsi/si_pipe.c | 34 ------------------
src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c | 36 +++++++++++++++++++
2 files changed, 36 insertions(+), 34 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 503d8331906..7943af4d86e 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -718,53 +718,20 @@ static void si_destroy_screen(struct pipe_screen* pscreen)
sscreen->ws->destroy(sscreen->ws);
FREE(sscreen);
}

static void si_init_gs_info(struct si_screen *sscreen)
{
sscreen->gs_table_depth = ac_get_gs_table_depth(sscreen->info.chip_class,
sscreen->info.family);
}

-static void si_handle_env_var_force_family(struct si_screen *sscreen)
-{
- const char *family = debug_get_option("SI_FORCE_FAMILY", NULL);
- unsigned i;
-
- if (!family)
- return;
-
- for (i = CHIP_TAHITI; i < CHIP_LAST; i++) {
- if (!strcmp(family, ac_get_llvm_processor_name(i))) {
- /* Override family and chip_class. */
- sscreen->info.family = i;
- sscreen->info.name = "GCN-NOOP";
-
- if (i >= CHIP_VEGA10)
- sscreen->info.chip_class = GFX9;
- else if (i >= CHIP_TONGA)
- sscreen->info.chip_class = VI;
- else if (i >= CHIP_BONAIRE)
- sscreen->info.chip_class = CIK;
- else
- sscreen->info.chip_class = SI;
-
- /* Don't submit any IBs. */
- setenv("RADEON_NOOP", "1", 1);
- return;
- }
- }
-
- fprintf(stderr, "radeonsi: Unknown family: %s\n", family);
- exit(1);
-}
-
static void si_test_vmfault(struct si_screen *sscreen)
{
struct pipe_context *ctx = sscreen->aux_context;
struct si_context *sctx = (struct si_context *)ctx;
struct pipe_resource *buf =
pipe_buffer_create_const0(&sscreen->b, 0, PIPE_USAGE_DEFAULT, 64);

if (!buf) {
puts("Buffer allocation failed.");
exit(1);
@@ -871,21 +838,20 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
{
struct si_screen *sscreen = CALLOC_STRUCT(si_screen);
unsigned hw_threads, num_comp_hi_threads, num_comp_lo_threads, i;

if (!sscreen) {
return NULL;
}

sscreen->ws = ws;
ws->query_info(ws, &sscreen->info);
- si_handle_env_var_force_family(sscreen);

if (sscreen->info.chip_class >= GFX9) {
sscreen->se_tile_repeat = 32 * sscreen->info.max_se;
} else {
ac_get_raster_config(&sscreen->info,
&sscreen->pa_sc_raster_config,
&sscreen->pa_sc_raster_config_1,
&sscreen->se_tile_repeat);
}

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index 6b7f484f239..79d2c1345ef 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -31,40 +31,76 @@
#include "amdgpu_public.h"

#include "util/u_cpu_detect.h"
#include "util/u_hash_table.h"
#include "util/hash_table.h"
#include "util/xmlconfig.h"
#include <amdgpu_drm.h>
#include <xf86drm.h>
#include <stdio.h>
#include <sys/stat.h>
+#include "amd/common/ac_llvm_util.h"
#include "amd/common/sid.h"
#include "amd/common/gfx9d.h"

#ifndef AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS
#define AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS 0x1E
#endif

static struct util_hash_table *dev_tab = NULL;
static simple_mtx_t dev_tab_mutex = _SIMPLE_MTX_INITIALIZER_NP;

DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", false)

+static void handle_env_var_force_family(struct amdgpu_winsys *ws)
+{
+ const char *family = debug_get_option("SI_FORCE_FAMILY", NULL);
+ unsigned i;
+
+ if (!family)
+ return;
+
+ for (i = CHIP_TAHITI; i < CHIP_LAST; i++) {
+ if (!strcmp(family, ac_get_llvm_processor_name(i))) {
+ /* Override family and chip_class. */
+ ws->info.family = i;
+ ws->info.name = "GCN-NOOP";
+
+ if (i >= CHIP_VEGA10)
+ ws->info.chip_class = GFX9;
+ else if (i >= CHIP_TONGA)
+ ws->info.chip_class = VI;
+ else if (i >= CHIP_BONAIRE)
+ ws->info.chip_class = CIK;
+ else
+ ws->info.chip_class = SI;
+
+ /* Don't submit any IBs. */
+ setenv("RADEON_NOOP", "1", 1);
+ return;
+ }
+ }
+
+ fprintf(stderr, "radeonsi: Unknown family: %s\n", family);
+ exit(1);
+}
+
/* Helper function to do the ioctls needed for setup and init. */
static bool do_winsys_init(struct amdgpu_winsys *ws,
const struct pipe_screen_config *config,
int fd)
{
if (!ac_query_gpu_info(fd, ws->dev, &ws->info, &ws->amdinfo))
goto fail;

+ handle_env_var_force_family(ws);
+
ws->addrlib = amdgpu_addr_create(&ws->info, &ws->amdinfo, &ws->info.max_alignment);
if (!ws->addrlib) {
fprintf(stderr, "amdgpu: Cannot create addrlib.\n");
goto fail;
}

ws->check_vm = strstr(debug_get_option("R600_DEBUG", ""), "check_vm") != NULL;
ws->debug_all_bos = debug_get_option_all_bos();
ws->reserve_vmid = strstr(debug_get_option("R600_DEBUG", ""), "reserve_vmid") != NULL;
ws->zero_all_vram_allocs = strstr(debug_get_option("R600_DEBUG", ""), "zerovram") != NULL ||
--
2.19.1
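
For reference, the override is driven the same way as before; the
accepted names are whatever ac_get_llvm_processor_name() returns for
each chip (presumably "gfx900" for Vega10, going by LLVM's processor
names):

	SI_FORCE_FAMILY=gfx900 <some GL app>

The behavioral difference is that handle_env_var_force_family() now
runs before amdgpu_addr_create(), so addrlib is created with the
forced family's info.
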
Nicolai Hähnle
2018-12-06 14:00:33 UTC
Permalink
From: Nicolai Hähnle <***@amd.com>

---
src/amd/common/ac_debug.c | 2 ++
src/amd/common/sid.h | 1 +
src/gallium/drivers/radeonsi/si_build_pm4.h | 8 +++++++-
src/gallium/drivers/radeonsi/si_state_draw.c | 12 ++++++++----
4 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/src/amd/common/ac_debug.c b/src/amd/common/ac_debug.c
index 3b15398a2a2..e5463b66616 100644
--- a/src/amd/common/ac_debug.c
+++ b/src/amd/common/ac_debug.c
@@ -226,39 +226,41 @@ static void ac_parse_packet3(FILE *f, uint32_t header, struct ac_ib_parser *ib,
for (i = 0; i < ARRAY_SIZE(packet3_table); i++)
if (packet3_table[i].op == op)
break;

if (i < ARRAY_SIZE(packet3_table)) {
const char *name = sid_strings + packet3_table[i].name_offset;

if (op == PKT3_SET_CONTEXT_REG ||
op == PKT3_SET_CONFIG_REG ||
op == PKT3_SET_UCONFIG_REG ||
+ op == PKT3_SET_UCONFIG_REG_INDEX ||
op == PKT3_SET_SH_REG)
fprintf(f, COLOR_CYAN "%s%s" COLOR_CYAN ":\n",
name, predicate);
else
fprintf(f, COLOR_GREEN "%s%s" COLOR_RESET ":\n",
name, predicate);
} else
fprintf(f, COLOR_RED "PKT3_UNKNOWN 0x%x%s" COLOR_RESET ":\n",
op, predicate);

/* Print the contents. */
switch (op) {
case PKT3_SET_CONTEXT_REG:
ac_parse_set_reg_packet(f, count, SI_CONTEXT_REG_OFFSET, ib);
break;
case PKT3_SET_CONFIG_REG:
ac_parse_set_reg_packet(f, count, SI_CONFIG_REG_OFFSET, ib);
break;
case PKT3_SET_UCONFIG_REG:
+ case PKT3_SET_UCONFIG_REG_INDEX:
ac_parse_set_reg_packet(f, count, CIK_UCONFIG_REG_OFFSET, ib);
break;
case PKT3_SET_SH_REG:
ac_parse_set_reg_packet(f, count, SI_SH_REG_OFFSET, ib);
break;
case PKT3_ACQUIRE_MEM:
ac_dump_reg(f, ib->chip_class, R_0301F0_CP_COHER_CNTL, ac_ib_get(ib), ~0);
ac_dump_reg(f, ib->chip_class, R_0301F4_CP_COHER_SIZE, ac_ib_get(ib), ~0);
ac_dump_reg(f, ib->chip_class, R_030230_CP_COHER_SIZE_HI, ac_ib_get(ib), ~0);
ac_dump_reg(f, ib->chip_class, R_0301F8_CP_COHER_BASE, ac_ib_get(ib), ~0);
diff --git a/src/amd/common/sid.h b/src/amd/common/sid.h
index a6d0bc2fe42..94709b486d0 100644
--- a/src/amd/common/sid.h
+++ b/src/amd/common/sid.h
@@ -204,20 +204,21 @@
/* fix CP DMA before uncommenting: */
/*#define PKT3_EVENT_WRITE_EOS 0x48*/ /* not on GFX9 */
#define PKT3_RELEASE_MEM 0x49 /* GFX9+ [any ring] or GFX8 [compute ring only] */
#define PKT3_ONE_REG_WRITE 0x57 /* not on CIK */
#define PKT3_ACQUIRE_MEM 0x58 /* new for CIK */
#define PKT3_SET_CONFIG_REG 0x68
#define PKT3_SET_CONTEXT_REG 0x69
#define PKT3_SET_SH_REG 0x76
#define PKT3_SET_SH_REG_OFFSET 0x77
#define PKT3_SET_UCONFIG_REG 0x79 /* new for CIK */
+#define PKT3_SET_UCONFIG_REG_INDEX 0x7A /* new for GFX9, CP ucode version >= 26 */
#define PKT3_LOAD_CONST_RAM 0x80
#define PKT3_WRITE_CONST_RAM 0x81
#define PKT3_DUMP_CONST_RAM 0x83
#define PKT3_INCREMENT_CE_COUNTER 0x84
#define PKT3_INCREMENT_DE_COUNTER 0x85
#define PKT3_WAIT_ON_CE_COUNTER 0x86
#define PKT3_LOAD_CONTEXT_REG 0x9F /* new for VI */

#define PKT_TYPE_S(x) (((unsigned)(x) & 0x3) << 30)
#define PKT_TYPE_G(x) (((x) >> 30) & 0x3)
diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h
index 796adda0963..4e8890a5f97 100644
--- a/src/gallium/drivers/radeonsi/si_build_pm4.h
+++ b/src/gallium/drivers/radeonsi/si_build_pm4.h
@@ -93,26 +93,32 @@ static inline void radeon_set_uconfig_reg_seq(struct radeon_cmdbuf *cs, unsigned
radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2);
}

static inline void radeon_set_uconfig_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
{
radeon_set_uconfig_reg_seq(cs, reg, 1);
radeon_emit(cs, value);
}

static inline void radeon_set_uconfig_reg_idx(struct radeon_cmdbuf *cs,
+ struct si_screen *screen,
unsigned reg, unsigned idx,
unsigned value)
{
assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
assert(cs->current.cdw + 3 <= cs->current.max_dw);
- radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, 1, 0));
+ assert(idx != 0);
+ unsigned opcode = PKT3_SET_UCONFIG_REG_INDEX;
+ if (screen->info.chip_class < GFX9 ||
+ (screen->info.chip_class == GFX9 && screen->info.me_fw_version < 26))
+ opcode = PKT3_SET_UCONFIG_REG;
+ radeon_emit(cs, PKT3(opcode, 1, 0));
radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2 | (idx << 28));
radeon_emit(cs, value);
}

/* Emit PKT3_SET_CONTEXT_REG if the register value is different. */
static inline void radeon_opt_set_context_reg(struct si_context *sctx, unsigned offset,
enum si_tracked_reg reg, unsigned value)
{
struct radeon_cmdbuf *cs = sctx->gfx_cs;

diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 254f9edeb75..d011adb2cad 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -611,31 +611,34 @@ static void si_emit_draw_registers(struct si_context *sctx,
{
struct radeon_cmdbuf *cs = sctx->gfx_cs;
unsigned prim = si_conv_pipe_prim(info->mode);
unsigned ia_multi_vgt_param;

ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, num_patches);

/* Draw state. */
if (ia_multi_vgt_param != sctx->last_multi_vgt_param) {
if (sctx->chip_class >= GFX9)
- radeon_set_uconfig_reg_idx(cs, R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param);
+ radeon_set_uconfig_reg_idx(cs, sctx->screen,
+ R_030960_IA_MULTI_VGT_PARAM, 4,
+ ia_multi_vgt_param);
else if (sctx->chip_class >= CIK)
radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
else
radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);

sctx->last_multi_vgt_param = ia_multi_vgt_param;
}
if (prim != sctx->last_prim) {
if (sctx->chip_class >= CIK)
- radeon_set_uconfig_reg_idx(cs, R_030908_VGT_PRIMITIVE_TYPE, 1, prim);
+ radeon_set_uconfig_reg_idx(cs, sctx->screen,
+ R_030908_VGT_PRIMITIVE_TYPE, 1, prim);
else
radeon_set_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, prim);

sctx->last_prim = prim;
}

/* Primitive restart. */
if (info->primitive_restart != sctx->last_primitive_restart_en) {
if (sctx->chip_class >= GFX9)
radeon_set_uconfig_reg(cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN,
@@ -709,22 +712,23 @@ static void si_emit_draw_packets(struct si_context *sctx,
index_type = V_028A7C_VGT_INDEX_32 |
(SI_BIG_ENDIAN && sctx->chip_class <= CIK ?
V_028A7C_VGT_DMA_SWAP_32_BIT : 0);
break;
default:
assert(!"unreachable");
return;
}

if (sctx->chip_class >= GFX9) {
- radeon_set_uconfig_reg_idx(cs, R_03090C_VGT_INDEX_TYPE,
- 2, index_type);
+ radeon_set_uconfig_reg_idx(cs, sctx->screen,
+ R_03090C_VGT_INDEX_TYPE, 2,
+ index_type);
} else {
radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
radeon_emit(cs, index_type);
}

sctx->last_index_size = index_size;
}

index_max_size = (indexbuf->width0 - index_offset) /
index_size;
--
2.19.1
Nicolai Hähnle
2018-12-06 14:00:31 UTC
Permalink
From: Nicolai Hähnle <***@amd.com>

Prepare for some later refactoring.
---
src/gallium/drivers/radeonsi/si_shader.c | 43 ++++++++++++++----------
1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index d455fb5db6a..1bc32f31020 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -4577,20 +4577,44 @@ static void declare_vs_input_vgprs(struct si_shader_context *ctx,

if (!shader->is_gs_copy_shader) {
/* Vertex load indices. */
ctx->param_vertex_index0 = fninfo->num_params;
for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
add_arg(fninfo, ARG_VGPR, ctx->i32);
*num_prolog_vgprs += shader->selector->info.num_inputs;
}
}

+static void declare_vs_blit_inputs(struct si_shader_context *ctx,
+ struct si_function_info *fninfo,
+ unsigned vs_blit_property)
+{
+ ctx->param_vs_blit_inputs = fninfo->num_params;
+ add_arg(fninfo, ARG_SGPR, ctx->i32); /* i16 x1, y1 */
+ add_arg(fninfo, ARG_SGPR, ctx->i32); /* i16 x2, y2 */
+ add_arg(fninfo, ARG_SGPR, ctx->f32); /* depth */
+
+ if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
+ add_arg(fninfo, ARG_SGPR, ctx->f32); /* color0 */
+ add_arg(fninfo, ARG_SGPR, ctx->f32); /* color1 */
+ add_arg(fninfo, ARG_SGPR, ctx->f32); /* color2 */
+ add_arg(fninfo, ARG_SGPR, ctx->f32); /* color3 */
+ } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
+ add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.x1 */
+ add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.y1 */
+ add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.x2 */
+ add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.y2 */
+ add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.z */
+ add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.w */
+ }
+}
+
static void declare_tes_input_vgprs(struct si_shader_context *ctx,
struct si_function_info *fninfo)
{
ctx->param_tes_u = add_arg(fninfo, ARG_VGPR, ctx->f32);
ctx->param_tes_v = add_arg(fninfo, ARG_VGPR, ctx->f32);
ctx->param_tes_rel_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tes_patch_id);
}

enum {
@@ -4621,38 +4645,21 @@ static void create_function(struct si_shader_context *ctx)
type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
}

LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);

switch (type) {
case PIPE_SHADER_VERTEX:
declare_global_desc_pointers(ctx, &fninfo);

if (vs_blit_property) {
- ctx->param_vs_blit_inputs = fninfo.num_params;
- add_arg(&fninfo, ARG_SGPR, ctx->i32); /* i16 x1, y1 */
- add_arg(&fninfo, ARG_SGPR, ctx->i32); /* i16 x2, y2 */
- add_arg(&fninfo, ARG_SGPR, ctx->f32); /* depth */
-
- if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
- add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color0 */
- add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color1 */
- add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color2 */
- add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color3 */
- } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
- add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.x1 */
- add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.y1 */
- add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.x2 */
- add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.y2 */
- add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.z */
- add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.w */
- }
+ declare_vs_blit_inputs(ctx, &fninfo, vs_blit_property);

/* VGPRs */
declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
break;
}

declare_per_stage_desc_pointers(ctx, &fninfo, true);
declare_vs_specific_input_sgprs(ctx, &fninfo);
ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR,
ac_array_in_const32_addr_space(ctx->v4i32));
--
2.19.1
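
For context, the SGPRs declared by the new helper are fed from
sctx->vs_blit_sh_data, whose producer side appears in si_draw_rectangle
earlier in the series. A short sketch of the packing for the three
always-present inputs, copied from the draw code so the pairing with
the declarations above is visible:

	/* two pairs of signed 16-bit coordinates plus a float depth */
	sh_data[0] = (uint32_t)(x1 & 0xffff) | ((uint32_t)(y1 & 0xffff) << 16); /* i16 x1, y1 */
	sh_data[1] = (uint32_t)(x2 & 0xffff) | ((uint32_t)(y2 & 0xffff) << 16); /* i16 x2, y2 */
	sh_data[2] = fui(depth);                                                /* depth */
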
Nicolai Hähnle
2018-12-06 14:00:34 UTC
Permalink
From: Nicolai Hähnle <***@amd.com>

There is never a read-after-write hazard for a clear because the packet
doesn't read from memory, so the RAW_WAIT can be skipped.
---
src/gallium/drivers/radeonsi/si_cp_dma.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 33220d9f0fa..80673f3f5f2 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -182,21 +182,22 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst
r600_resource(src),
RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
}

/* Flush the caches for the first copy only.
* Also wait for the previous CP DMA operations.
*/
if (!(user_flags & SI_CPDMA_SKIP_GFX_SYNC) && sctx->flags)
si_emit_cache_flush(sctx);

- if (!(user_flags & SI_CPDMA_SKIP_SYNC_BEFORE) && *is_first)
+ if (!(user_flags & SI_CPDMA_SKIP_SYNC_BEFORE) && *is_first &&
+ !(*packet_flags & CP_DMA_CLEAR))
*packet_flags |= CP_DMA_RAW_WAIT;

*is_first = false;

/* Do the synchronization after the last dma, so that all data
* is written to memory.
*/
if (!(user_flags & SI_CPDMA_SKIP_SYNC_AFTER) &&
byte_count == remaining_size) {
*packet_flags |= CP_DMA_SYNC;
--
2.19.1
Nicolai Hähnle
2018-12-06 14:00:35 UTC
Permalink
From: Nicolai Hähnle <***@amd.com>

---
src/gallium/drivers/radeonsi/si_blit.c | 2 +-
src/gallium/drivers/radeonsi/si_pipe.h | 2 +-
src/gallium/drivers/radeonsi/si_texture.c | 4 ++--
3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 8f7aa0815b9..69b1af02db0 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -1186,21 +1186,21 @@ resolve_to_temp:
* a temporary texture and blit.
*/
memset(&templ, 0, sizeof(templ));
templ.target = PIPE_TEXTURE_2D;
templ.format = info->src.resource->format;
templ.width0 = info->src.resource->width0;
templ.height0 = info->src.resource->height0;
templ.depth0 = 1;
templ.array_size = 1;
templ.usage = PIPE_USAGE_DEFAULT;
- templ.flags = SI_RESOURCE_FLAG_FORCE_TILING |
+ templ.flags = SI_RESOURCE_FLAG_FORCE_MSAA_TILING |
SI_RESOURCE_FLAG_DISABLE_DCC;

/* The src and dst microtile modes must be the same. */
if (src->surface.micro_tile_mode == RADEON_MICRO_MODE_DISPLAY)
templ.bind = PIPE_BIND_SCANOUT;
else
templ.bind = 0;

tmp = ctx->screen->resource_create(ctx->screen, &templ);
if (!tmp)
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 1d677d29e88..179671e8871 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -96,21 +96,21 @@
#define SI_PREFETCH_PS (1 << 6)

#define SI_MAX_BORDER_COLORS 4096
#define SI_MAX_VIEWPORTS 16
#define SIX_BITS 0x3F
#define SI_MAP_BUFFER_ALIGNMENT 64
#define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024

#define SI_RESOURCE_FLAG_TRANSFER (PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
#define SI_RESOURCE_FLAG_FLUSHED_DEPTH (PIPE_RESOURCE_FLAG_DRV_PRIV << 1)
-#define SI_RESOURCE_FLAG_FORCE_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2)
+#define SI_RESOURCE_FLAG_FORCE_MSAA_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2)
#define SI_RESOURCE_FLAG_DISABLE_DCC (PIPE_RESOURCE_FLAG_DRV_PRIV << 3)
#define SI_RESOURCE_FLAG_UNMAPPABLE (PIPE_RESOURCE_FLAG_DRV_PRIV << 4)
#define SI_RESOURCE_FLAG_READ_ONLY (PIPE_RESOURCE_FLAG_DRV_PRIV << 5)
#define SI_RESOURCE_FLAG_32BIT (PIPE_RESOURCE_FLAG_DRV_PRIV << 6)
#define SI_RESOURCE_FLAG_SO_FILLED_SIZE (PIPE_RESOURCE_FLAG_DRV_PRIV << 7)

/* Debug flags. */
enum {
/* Shader logging options: */
DBG_VS = PIPE_SHADER_VERTEX,
diff --git a/src/gallium/drivers/radeonsi/si_texture.c b/src/gallium/drivers/radeonsi/si_texture.c
index 95f1e8c9693..ac1a0aa6097 100644
--- a/src/gallium/drivers/radeonsi/si_texture.c
+++ b/src/gallium/drivers/radeonsi/si_texture.c
@@ -296,21 +296,21 @@ static int si_init_surface(struct si_screen *sscreen,
ptex->last_level == 0 &&
!(flags & RADEON_SURF_Z_OR_SBUFFER));

flags |= RADEON_SURF_SCANOUT;
}

if (ptex->bind & PIPE_BIND_SHARED)
flags |= RADEON_SURF_SHAREABLE;
if (is_imported)
flags |= RADEON_SURF_IMPORTED | RADEON_SURF_SHAREABLE;
- if (!(ptex->flags & SI_RESOURCE_FLAG_FORCE_TILING))
+ if (!(ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING))
flags |= RADEON_SURF_OPTIMIZE_FOR_SPACE;

r = sscreen->ws->surface_init(sscreen->ws, ptex, flags, bpe,
array_mode, surface);
if (r) {
return r;
}

unsigned pitch = pitch_in_bytes_override / bpe;

@@ -1286,21 +1286,21 @@ si_texture_create_object(struct pipe_screen *screen,
}

return tex;
}

static enum radeon_surf_mode
si_choose_tiling(struct si_screen *sscreen,
const struct pipe_resource *templ, bool tc_compatible_htile)
{
const struct util_format_description *desc = util_format_description(templ->format);
- bool force_tiling = templ->flags & SI_RESOURCE_FLAG_FORCE_TILING;
+ bool force_tiling = templ->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING;
bool is_depth_stencil = util_format_is_depth_or_stencil(templ->format) &&
!(templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH);

/* MSAA resources must be 2D tiled. */
if (templ->nr_samples > 1)
return RADEON_SURF_MODE_2D;

/* Transfer resources should be linear. */
if (templ->flags & SI_RESOURCE_FLAG_TRANSFER)
return RADEON_SURF_MODE_LINEAR_ALIGNED;
--
2.19.1
Nicolai Hähnle
2018-12-06 14:00:28 UTC
Permalink
From: Nicolai Hähnle <***@amd.com>

Allow a unified but still efficient treatment of summing a bitmask (i1
values) across a wave or an entire threadgroup: in that case the
generic DPP-based scan is replaced by a ballot plus mbcnt.
---
src/amd/common/ac_llvm_build.c | 27 +++++++++++++++++++++++++--
1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 932f4bbdeef..eb840369d07 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -3391,36 +3391,57 @@ ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValu
if (maxprefix <= 32)
return result;
tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
return result;
}

LLVMValueRef
ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
- ac_build_optimization_barrier(ctx, &src);
LLVMValueRef result;
+
+ if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
+ LLVMBuilderRef builder = ctx->builder;
+ src = LLVMBuildZExt(builder, src, ctx->i32, "");
+ result = ac_build_ballot(ctx, src);
+ result = ac_build_mbcnt(ctx, result);
+ result = LLVMBuildAdd(builder, result, src, "");
+ return result;
+ }
+
+ ac_build_optimization_barrier(ctx, &src);
+
LLVMValueRef identity =
get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
LLVMTypeOf(identity), "");
result = ac_build_scan(ctx, op, result, identity, 64);

return ac_build_wwm(ctx, result);
}

LLVMValueRef
ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
- ac_build_optimization_barrier(ctx, &src);
LLVMValueRef result;
+
+ if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
+ LLVMBuilderRef builder = ctx->builder;
+ src = LLVMBuildZExt(builder, src, ctx->i32, "");
+ result = ac_build_ballot(ctx, src);
+ result = ac_build_mbcnt(ctx, result);
+ return result;
+ }
+
+ ac_build_optimization_barrier(ctx, &src);
+
LLVMValueRef identity =
get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
LLVMTypeOf(identity), "");
result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, false);
result = ac_build_scan(ctx, op, result, identity, 64);

return ac_build_wwm(ctx, result);
}

@@ -3585,20 +3606,22 @@ ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
* "Top half" of a scan that reduces per-thread values across an entire
* workgroup.
*
* All lanes must be active when this code runs.
*/
void
ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
if (ws->enable_exclusive) {
ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
+ if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
+ ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
} else {
ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
}

bool enable_inclusive = ws->enable_inclusive;
bool enable_exclusive = ws->enable_exclusive;
ws->enable_inclusive = false;
ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
ac_build_wg_wavescan_top(ctx, ws);
--
2.19.1
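
The fast path uses a classic trick: when the addends are single bits,
the exclusive prefix sum within a wave is the number of set ballot bits
below the current lane, which is exactly what mbcnt computes. A scalar
model of the idea (an explanatory sketch, not driver code; assumes a
64-lane wave):

	/* ballot packs one source bit per lane; lane < 64 */
	static unsigned scan_i1_iadd(uint64_t ballot, unsigned lane, bool inclusive)
	{
		/* mbcnt: count of set bits strictly below this lane */
		unsigned excl = __builtin_popcountll(ballot & ((1ull << lane) - 1));
		return inclusive ? excl + (unsigned)((ballot >> lane) & 1) : excl;
	}
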
Nicolai Hähnle
2018-12-06 14:00:36 UTC
Permalink
From: Nicolai Hähnle <***@amd.com>

---
src/gallium/drivers/radeonsi/si_state_shaders.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index ad7d21e7816..0d4e1956037 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -330,24 +330,24 @@ void si_destroy_shader_cache(struct si_screen *sscreen)
{
if (sscreen->shader_cache)
_mesa_hash_table_destroy(sscreen->shader_cache,
si_destroy_shader_cache_entry);
mtx_destroy(&sscreen->shader_cache_mutex);
}

/* SHADER STATES */

static void si_set_tesseval_regs(struct si_screen *sscreen,
- struct si_shader_selector *tes,
+ const struct si_shader_selector *tes,
struct si_pm4_state *pm4)
{
- struct tgsi_shader_info *info = &tes->info;
+ const struct tgsi_shader_info *info = &tes->info;
unsigned tes_prim_mode = info->properties[TGSI_PROPERTY_TES_PRIM_MODE];
unsigned tes_spacing = info->properties[TGSI_PROPERTY_TES_SPACING];
bool tes_vertex_order_cw = info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW];
bool tes_point_mode = info->properties[TGSI_PROPERTY_TES_POINT_MODE];
unsigned type, partitioning, topology, distribution_mode;

switch (tes_prim_mode) {
case PIPE_PRIM_LINES:
type = V_028B6C_TESS_ISOLINE;
break;
--
2.19.1
Nicolai Hähnle
2018-12-06 14:00:29 UTC
Permalink
From: Nicolai Hähnle <***@amd.com>

---
src/amd/common/ac_surface.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/amd/common/ac_surface.c b/src/amd/common/ac_surface.c
index d8d927ee1c5..aeba5e161c9 100644
--- a/src/amd/common/ac_surface.c
+++ b/src/amd/common/ac_surface.c
@@ -1509,24 +1509,26 @@ static int gfx9_compute_surface(ADDR_HANDLE addrlib,
r = gfx9_compute_miptree(addrlib, config, surf, compressed,
&AddrSurfInfoIn);
if (r)
return r;
}

surf->is_linear = surf->u.gfx9.surf.swizzle_mode == ADDR_SW_LINEAR;

/* Query whether the surface is displayable. */
bool displayable = false;
- r = Addr2IsValidDisplaySwizzleMode(addrlib, surf->u.gfx9.surf.swizzle_mode,
+ if (!config->is_3d && !config->is_cube) {
+ r = Addr2IsValidDisplaySwizzleMode(addrlib, surf->u.gfx9.surf.swizzle_mode,
surf->bpe * 8, &displayable);
- if (r)
- return r;
+ if (r)
+ return r;
+ }
surf->is_displayable = displayable;

switch (surf->u.gfx9.surf.swizzle_mode) {
/* S = standard. */
case ADDR_SW_256B_S:
case ADDR_SW_4KB_S:
case ADDR_SW_64KB_S:
case ADDR_SW_VAR_S:
case ADDR_SW_64KB_S_T:
case ADDR_SW_4KB_S_X:
--
2.19.1
Nicolai Hähnle
2018-12-06 14:00:37 UTC
Permalink
From: Nicolai Hähnle <***@amd.com>

This is rather important for VS/TCS merged into LSHS shaders: when
tessellation is enabled without a user-supplied TCS, the fixed-function
TCS is the one whose state matters.
---
src/gallium/drivers/radeonsi/si_debug.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c
index ec4bd03c9a5..22019741d80 100644
--- a/src/gallium/drivers/radeonsi/si_debug.c
+++ b/src/gallium/drivers/radeonsi/si_debug.c
@@ -1045,37 +1045,43 @@ static void si_dump_debug_state(struct pipe_context *ctx, FILE *f,
si_dump_debug_registers(sctx, f);

si_dump_annotated_shaders(sctx, f);
si_dump_command("Active waves (raw data)", "umr -O halt_waves -wa | column -t", f);
si_dump_command("Wave information", "umr -O halt_waves,bits -wa", f);
}
}

void si_log_draw_state(struct si_context *sctx, struct u_log_context *log)
{
+ struct si_shader_ctx_state *tcs_shader;
+
if (!log)
return;

+ tcs_shader = &sctx->tcs_shader;
+ if (sctx->tes_shader.cso && !sctx->tcs_shader.cso)
+ tcs_shader = &sctx->fixed_func_tcs_shader;
+
si_dump_framebuffer(sctx, log);

si_dump_gfx_shader(sctx, &sctx->vs_shader, log);
- si_dump_gfx_shader(sctx, &sctx->tcs_shader, log);
+ si_dump_gfx_shader(sctx, tcs_shader, log);
si_dump_gfx_shader(sctx, &sctx->tes_shader, log);
si_dump_gfx_shader(sctx, &sctx->gs_shader, log);
si_dump_gfx_shader(sctx, &sctx->ps_shader, log);

si_dump_descriptor_list(sctx->screen,
&sctx->descriptors[SI_DESCS_RW_BUFFERS],
"", "RW buffers", 4, SI_NUM_RW_BUFFERS,
si_identity, log);
si_dump_gfx_descriptors(sctx, &sctx->vs_shader, log);
- si_dump_gfx_descriptors(sctx, &sctx->tcs_shader, log);
+ si_dump_gfx_descriptors(sctx, tcs_shader, log);
si_dump_gfx_descriptors(sctx, &sctx->tes_shader, log);
si_dump_gfx_descriptors(sctx, &sctx->gs_shader, log);
si_dump_gfx_descriptors(sctx, &sctx->ps_shader, log);
}

void si_log_compute_state(struct si_context *sctx, struct u_log_context *log)
{
if (!log)
return;
--
2.19.1
Nicolai Hähnle
2018-12-06 14:00:38 UTC
Permalink
From: Nicolai Hähnle <***@amd.com>

---
src/gallium/drivers/radeonsi/si_debug.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c
index 22019741d80..fe2970a0ea3 100644
--- a/src/gallium/drivers/radeonsi/si_debug.c
+++ b/src/gallium/drivers/radeonsi/si_debug.c
@@ -1064,21 +1064,22 @@ void si_log_draw_state(struct si_context *sctx, struct u_log_context *log)
si_dump_framebuffer(sctx, log);

si_dump_gfx_shader(sctx, &sctx->vs_shader, log);
si_dump_gfx_shader(sctx, tcs_shader, log);
si_dump_gfx_shader(sctx, &sctx->tes_shader, log);
si_dump_gfx_shader(sctx, &sctx->gs_shader, log);
si_dump_gfx_shader(sctx, &sctx->ps_shader, log);

si_dump_descriptor_list(sctx->screen,
&sctx->descriptors[SI_DESCS_RW_BUFFERS],
- "", "RW buffers", 4, SI_NUM_RW_BUFFERS,
+ "", "RW buffers", 4,
+ sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots,
si_identity, log);
si_dump_gfx_descriptors(sctx, &sctx->vs_shader, log);
si_dump_gfx_descriptors(sctx, tcs_shader, log);
si_dump_gfx_descriptors(sctx, &sctx->tes_shader, log);
si_dump_gfx_descriptors(sctx, &sctx->gs_shader, log);
si_dump_gfx_descriptors(sctx, &sctx->ps_shader, log);
}

void si_log_compute_state(struct si_context *sctx, struct u_log_context *log)
{
--
2.19.1
Nicolai Hähnle
2018-12-06 14:00:40 UTC
Permalink
From: Nicolai Hähnle <***@amd.com>

Reduce the number of places that encode buffer descriptors.
---
.../drivers/radeonsi/si_state_streamout.c | 61 ++++---------------
1 file changed, 11 insertions(+), 50 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c
index fd7e843bc48..83ca23a8bf2 100644
--- a/src/gallium/drivers/radeonsi/si_state_streamout.c
+++ b/src/gallium/drivers/radeonsi/si_state_streamout.c
@@ -86,24 +86,22 @@ void si_streamout_buffers_dirty(struct si_context *sctx)
si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
si_set_streamout_enable(sctx, true);
}

static void si_set_streamout_targets(struct pipe_context *ctx,
unsigned num_targets,
struct pipe_stream_output_target **targets,
const unsigned *offsets)
{
struct si_context *sctx = (struct si_context *)ctx;
- struct si_buffer_resources *buffers = &sctx->rw_buffers;
- struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS];
unsigned old_num_targets = sctx->streamout.num_targets;
- unsigned i, bufidx;
+ unsigned i;

/* We are going to unbind the buffers. Mark which caches need to be flushed. */
if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) {
/* Since streamout uses vector writes which go through TC L2
* and most other clients can use TC L2 as well, we don't need
* to flush it.
*
* The only cases which requires flushing it is VGT DMA index
* fetching (on <= CIK) and indirect draw data, which are rare
* cases. Thus, flag the TC L2 dirtiness in the resource and
@@ -168,71 +166,34 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
/* Update dirty state bits. */
if (num_targets) {
si_streamout_buffers_dirty(sctx);
} else {
si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
si_set_streamout_enable(sctx, false);
}

/* Set the shader resources.*/
for (i = 0; i < num_targets; i++) {
- bufidx = SI_VS_STREAMOUT_BUF0 + i;
-
if (targets[i]) {
- struct pipe_resource *buffer = targets[i]->buffer;
- uint64_t va = r600_resource(buffer)->gpu_address;
-
- /* Set the descriptor.
- *
- * On VI, the format must be non-INVALID, otherwise
- * the buffer will be considered not bound and store
- * instructions will be no-ops.
- */
- uint32_t *desc = descs->list + bufidx*4;
- desc[0] = va;
- desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
- desc[2] = 0xffffffff;
- desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
- S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
- S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
- S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
- S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
-
- /* Set the resource. */
- pipe_resource_reference(&buffers->buffers[bufidx],
- buffer);
- radeon_add_to_gfx_buffer_list_check_mem(sctx,
- r600_resource(buffer),
- buffers->shader_usage,
- RADEON_PRIO_SHADER_RW_BUFFER,
- true);
- r600_resource(buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT;
-
- buffers->enabled_mask |= 1u << bufidx;
+ struct pipe_shader_buffer sbuf;
+ sbuf.buffer = targets[i]->buffer;
+ sbuf.buffer_offset = 0;
+ sbuf.buffer_size = targets[i]->buffer_offset +
+ targets[i]->buffer_size;
+ si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
+ r600_resource(targets[i]->buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT;
} else {
- /* Clear the descriptor and unset the resource. */
- memset(descs->list + bufidx*4, 0,
- sizeof(uint32_t) * 4);
- pipe_resource_reference(&buffers->buffers[bufidx],
- NULL);
- buffers->enabled_mask &= ~(1u << bufidx);
+ si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
}
}
- for (; i < old_num_targets; i++) {
- bufidx = SI_VS_STREAMOUT_BUF0 + i;
- /* Clear the descriptor and unset the resource. */
- memset(descs->list + bufidx*4, 0, sizeof(uint32_t) * 4);
- pipe_resource_reference(&buffers->buffers[bufidx], NULL);
- buffers->enabled_mask &= ~(1u << bufidx);
- }
-
- sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
+ for (; i < old_num_targets; i++)
+ si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
}

static void si_flush_vgt_streamout(struct si_context *sctx)
{
struct radeon_cmdbuf *cs = sctx->gfx_cs;
unsigned reg_strmout_cntl;

/* The register is at different places on different ASICs. */
if (sctx->chip_class >= CIK) {
reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
--
2.19.1
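
One detail worth spelling out, since the old open-coded path hid it
behind num_records = 0xffffffff: streamout writes are addressed
relative to the start of the buffer, not of the target, so the
descriptor still has to cover the whole range from the buffer base. A
reader's annotation of the call above (the reasoning in the comment is
an inference, not stated in the patch):

	struct pipe_shader_buffer sbuf;
	sbuf.buffer = targets[i]->buffer;
	sbuf.buffer_offset = 0;
	/* span [0, offset + size): the hardware write offset already
	 * includes the target's buffer_offset */
	sbuf.buffer_size = targets[i]->buffer_offset + targets[i]->buffer_size;
	si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
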
Nicolai Hähnle
2018-12-06 14:00:46 UTC
Permalink
From: Nicolai Hähnle <***@amd.com>

---
src/gallium/drivers/radeonsi/si_perfcounter.c | 2 +-
src/gallium/drivers/radeonsi/si_query.c | 6 +++---
src/gallium/drivers/radeonsi/si_query.h | 2 +-
3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c
index 65197c0daa4..fc2c58854bc 100644
--- a/src/gallium/drivers/radeonsi/si_perfcounter.c
+++ b/src/gallium/drivers/radeonsi/si_perfcounter.c
@@ -908,21 +908,21 @@ static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *rqu

while (results_base != qbuf->results_end) {
si_pc_query_add_result(query, map + results_base, result);
results_base += query->result_size;
}
}

return true;
}

-static struct si_query_ops batch_query_ops = {
+static const struct si_query_ops batch_query_ops = {
.destroy = si_pc_query_destroy,
.begin = si_pc_query_begin,
.end = si_pc_query_end,
.get_result = si_pc_query_get_result,

.suspend = si_pc_query_suspend,
.resume = si_pc_query_resume,
};

static struct si_query_group *get_group_state(struct si_screen *screen,
diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c
index 5b0fba0ed92..093643bf684 100644
--- a/src/gallium/drivers/radeonsi/si_query.c
+++ b/src/gallium/drivers/radeonsi/si_query.c
@@ -27,21 +27,21 @@
#include "si_pipe.h"
#include "si_query.h"
#include "util/u_memory.h"
#include "util/u_upload_mgr.h"
#include "util/os_time.h"
#include "util/u_suballoc.h"
#include "amd/common/sid.h"

#define SI_MAX_STREAMS 4

-static struct si_query_ops query_hw_ops;
+static const struct si_query_ops query_hw_ops;

struct si_hw_query_params {
unsigned start_offset;
unsigned end_offset;
unsigned fence_offset;
unsigned pair_stride;
unsigned pair_count;
};

/* Queries without buffer handling or suspend/resume. */
@@ -492,21 +492,21 @@ static bool si_query_sw_get_result(struct si_context *sctx,
case SI_QUERY_CURRENT_GPU_SCLK:
case SI_QUERY_CURRENT_GPU_MCLK:
result->u64 *= 1000000;
break;
}

return true;
}


-static struct si_query_ops sw_query_ops = {
+static const struct si_query_ops sw_query_ops = {
.destroy = si_query_sw_destroy,
.begin = si_query_sw_begin,
.end = si_query_sw_end,
.get_result = si_query_sw_get_result,
.get_result_resource = NULL
};

static struct pipe_query *si_query_sw_create(unsigned query_type)
{
struct si_query_sw *query;
@@ -1336,21 +1336,21 @@ static void si_query_hw_add_result(struct si_screen *sscreen,
void si_query_hw_suspend(struct si_context *sctx, struct si_query *query)
{
si_query_hw_emit_stop(sctx, (struct si_query_hw *)query);
}

void si_query_hw_resume(struct si_context *sctx, struct si_query *query)
{
si_query_hw_emit_start(sctx, (struct si_query_hw *)query);
}

-static struct si_query_ops query_hw_ops = {
+static const struct si_query_ops query_hw_ops = {
.destroy = si_query_hw_destroy,
.begin = si_query_hw_begin,
.end = si_query_hw_end,
.get_result = si_query_hw_get_result,
.get_result_resource = si_query_hw_get_result_resource,

.suspend = si_query_hw_suspend,
.resume = si_query_hw_resume,
};

diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h
index 63af760a271..0bc1d56f78a 100644
--- a/src/gallium/drivers/radeonsi/si_query.h
+++ b/src/gallium/drivers/radeonsi/si_query.h
@@ -134,21 +134,21 @@ struct si_query_ops {
int index,
struct pipe_resource *resource,
unsigned offset);

void (*suspend)(struct si_context *, struct si_query *);
void (*resume)(struct si_context *, struct si_query *);
};

struct si_query {
struct threaded_query b;
- struct si_query_ops *ops;
+ const struct si_query_ops *ops;

/* The PIPE_QUERY_xxx type of query */
unsigned type;

/* The number of dwords for suspend. */
unsigned num_cs_dw_suspend;

/* Linked list of queries that must be suspended at end of CS. */
struct list_head active_list;
};
--
2.19.1
Nicolai Hähnle
2018-12-06 14:00:44 UTC
Permalink
From: Nicolai Hähnle <***@amd.com>

This is a move towards using composition instead of inheritance for
different query types.

This change weakens out-of-memory error reporting somewhat, though this
should be acceptable since we didn't consistently report such errors in
the first place.
---
src/gallium/drivers/radeonsi/si_perfcounter.c | 8 +-
src/gallium/drivers/radeonsi/si_query.c | 177 +++++++++---------
src/gallium/drivers/radeonsi/si_query.h | 17 +-
src/gallium/drivers/radeonsi/si_texture.c | 7 +-
4 files changed, 99 insertions(+), 110 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c
index 0b3d8f89273..f0d10c054c4 100644
--- a/src/gallium/drivers/radeonsi/si_perfcounter.c
+++ b/src/gallium/drivers/radeonsi/si_perfcounter.c
@@ -761,23 +761,22 @@ static void si_pc_query_destroy(struct si_screen *sscreen,
struct si_query_group *group = query->groups;
query->groups = group->next;
FREE(group);
}

FREE(query->counters);

si_query_hw_destroy(sscreen, rquery);
}

-static bool si_pc_query_prepare_buffer(struct si_screen *screen,
- struct si_query_hw *hwquery,
- struct r600_resource *buffer)
+static bool si_pc_query_prepare_buffer(struct si_context *ctx,
+ struct si_query_buffer *qbuf)
{
/* no-op */
return true;
}

static void si_pc_query_emit_start(struct si_context *sctx,
struct si_query_hw *hwquery,
struct r600_resource *buffer, uint64_t va)
{
struct si_query_pc *query = (struct si_query_pc *)hwquery;
@@ -1055,23 +1054,20 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx,
counter->base = group->result_base + j;
counter->stride = group->num_counters;

counter->qwords = 1;
if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
counter->qwords = screen->info.max_se;
if (group->instance < 0)
counter->qwords *= block->num_instances;
}

- if (!si_query_hw_init(screen, &query->b))
- goto error;
-
return (struct pipe_query *)query;

error:
si_pc_query_destroy(screen, &query->b.b);
return NULL;
}

static bool si_init_block_names(struct si_screen *screen,
struct si_pc_block *block)
{
diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c
index 479a1bbf2c4..5b0fba0ed92 100644
--- a/src/gallium/drivers/radeonsi/si_query.c
+++ b/src/gallium/drivers/radeonsi/si_query.c
@@ -514,86 +514,129 @@ static struct pipe_query *si_query_sw_create(unsigned query_type)
query = CALLOC_STRUCT(si_query_sw);
if (!query)
return NULL;

query->b.type = query_type;
query->b.ops = &sw_query_ops;

return (struct pipe_query *)query;
}

-void si_query_hw_destroy(struct si_screen *sscreen,
- struct si_query *rquery)
+void si_query_buffer_destroy(struct si_screen *sscreen, struct si_query_buffer *buffer)
{
- struct si_query_hw *query = (struct si_query_hw *)rquery;
- struct si_query_buffer *prev = query->buffer.previous;
+ struct si_query_buffer *prev = buffer->previous;

/* Release all query buffers. */
while (prev) {
struct si_query_buffer *qbuf = prev;
prev = prev->previous;
r600_resource_reference(&qbuf->buf, NULL);
FREE(qbuf);
}

- r600_resource_reference(&query->buffer.buf, NULL);
- r600_resource_reference(&query->workaround_buf, NULL);
- FREE(rquery);
+ r600_resource_reference(&buffer->buf, NULL);
+}
+
+void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer)
+{
+ /* Discard all query buffers except for the oldest. */
+ while (buffer->previous) {
+ struct si_query_buffer *qbuf = buffer->previous;
+ buffer->previous = qbuf->previous;
+
+ r600_resource_reference(&buffer->buf, NULL);
+ buffer->buf = qbuf->buf; /* move ownership */
+ FREE(qbuf);
+ }
+ buffer->results_end = 0;
+
+ /* Discard even the oldest buffer if it can't be mapped without a stall. */
+ if (buffer->buf &&
+ (si_rings_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) ||
+ !sctx->ws->buffer_wait(buffer->buf->buf, 0, RADEON_USAGE_READWRITE))) {
+ r600_resource_reference(&buffer->buf, NULL);
+ }
}

-static struct r600_resource *si_new_query_buffer(struct si_screen *sscreen,
- struct si_query_hw *query)
+bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer,
+ bool (*prepare_buffer)(struct si_context *, struct si_query_buffer*),
+ unsigned size)
{
- unsigned buf_size = MAX2(query->result_size,
- sscreen->info.min_alloc_size);
+	if (buffer->buf && buffer->results_end + size <= buffer->buf->b.b.width0)
+ return true;
+
+ if (buffer->buf) {
+ struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer);
+ memcpy(qbuf, buffer, sizeof(*qbuf));
+ buffer->previous = qbuf;
+ }
+
+ buffer->results_end = 0;

/* Queries are normally read by the CPU after
* being written by the gpu, hence staging is probably a good
* usage pattern.
*/
- struct r600_resource *buf = r600_resource(
- pipe_buffer_create(&sscreen->b, 0,
- PIPE_USAGE_STAGING, buf_size));
- if (!buf)
- return NULL;
+ struct si_screen *screen = sctx->screen;
+ unsigned buf_size = MAX2(size, screen->info.min_alloc_size);
+ buffer->buf = r600_resource(
+ pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
+ if (unlikely(!buffer->buf))
+ return false;

- if (!query->ops->prepare_buffer(sscreen, query, buf)) {
- r600_resource_reference(&buf, NULL);
- return NULL;
+ if (prepare_buffer) {
+ if (unlikely(!prepare_buffer(sctx, buffer))) {
+ r600_resource_reference(&buffer->buf, NULL);
+ return false;
+ }
}

- return buf;
+ return true;
}

-static bool si_query_hw_prepare_buffer(struct si_screen *sscreen,
- struct si_query_hw *query,
- struct r600_resource *buffer)
+
+void si_query_hw_destroy(struct si_screen *sscreen,
+ struct si_query *rquery)
+{
+ struct si_query_hw *query = (struct si_query_hw *)rquery;
+
+ si_query_buffer_destroy(sscreen, &query->buffer);
+ r600_resource_reference(&query->workaround_buf, NULL);
+ FREE(rquery);
+}
+
+static bool si_query_hw_prepare_buffer(struct si_context *sctx,
+ struct si_query_buffer *qbuf)
{
- /* Callers ensure that the buffer is currently unused by the GPU. */
- uint32_t *results = sscreen->ws->buffer_map(buffer->buf, NULL,
+ static const struct si_query_hw si_query_hw_s;
+ struct si_query_hw *query = container_of(qbuf, &si_query_hw_s, buffer);
+ struct si_screen *screen = sctx->screen;
+
+ /* The caller ensures that the buffer is currently unused by the GPU. */
+ uint32_t *results = screen->ws->buffer_map(qbuf->buf->buf, NULL,
PIPE_TRANSFER_WRITE |
PIPE_TRANSFER_UNSYNCHRONIZED);
if (!results)
return false;

- memset(results, 0, buffer->b.b.width0);
+ memset(results, 0, qbuf->buf->b.b.width0);

if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
- unsigned max_rbs = sscreen->info.num_render_backends;
- unsigned enabled_rb_mask = sscreen->info.enabled_rb_mask;
+ unsigned max_rbs = screen->info.num_render_backends;
+ unsigned enabled_rb_mask = screen->info.enabled_rb_mask;
unsigned num_results;
unsigned i, j;

/* Set top bits for unused backends. */
- num_results = buffer->b.b.width0 / query->result_size;
+ num_results = qbuf->buf->b.b.width0 / query->result_size;
for (j = 0; j < num_results; j++) {
for (i = 0; i < max_rbs; i++) {
if (!(enabled_rb_mask & (1<<i))) {
results[(i * 4)+1] = 0x80000000;
results[(i * 4)+3] = 0x80000000;
}
}
results += 4 * max_rbs;
}
}
@@ -624,30 +667,20 @@ static void si_query_hw_clear_result(struct si_query_hw *,
union pipe_query_result *);

static struct si_query_hw_ops query_hw_default_hw_ops = {
.prepare_buffer = si_query_hw_prepare_buffer,
.emit_start = si_query_hw_do_emit_start,
.emit_stop = si_query_hw_do_emit_stop,
.clear_result = si_query_hw_clear_result,
.add_result = si_query_hw_add_result,
};

-bool si_query_hw_init(struct si_screen *sscreen,
- struct si_query_hw *query)
-{
- query->buffer.buf = si_new_query_buffer(sscreen, query);
- if (!query->buffer.buf)
- return false;
-
- return true;
-}
-
static struct pipe_query *si_query_hw_create(struct si_screen *sscreen,
unsigned query_type,
unsigned index)
{
struct si_query_hw *query = CALLOC_STRUCT(si_query_hw);
if (!query)
return NULL;

query->b.type = query_type;
query->b.ops = &query_hw_ops;
@@ -693,25 +726,20 @@ static struct pipe_query *si_query_hw_create(struct si_screen *sscreen,
query->result_size = 11 * 16;
query->result_size += 8; /* for the fence + alignment */
query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
break;
default:
assert(0);
FREE(query);
return NULL;
}

- if (!si_query_hw_init(sscreen, query)) {
- FREE(query);
- return NULL;
- }
-
return (struct pipe_query *)query;
}

static void si_update_occlusion_query_state(struct si_context *sctx,
unsigned type, int diff)
{
if (type == PIPE_QUERY_OCCLUSION_COUNTER ||
type == PIPE_QUERY_OCCLUSION_PREDICATE ||
type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
bool old_enable = sctx->num_occlusion_queries != 0;
@@ -802,43 +830,31 @@ static void si_query_hw_do_emit_start(struct si_context *sctx,
}
radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE,
RADEON_PRIO_QUERY);
}

static void si_query_hw_emit_start(struct si_context *sctx,
struct si_query_hw *query)
{
uint64_t va;

- if (!query->buffer.buf)
- return; // previous buffer allocation failure
+ if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer,
+ query->result_size))
+ return;

si_update_occlusion_query_state(sctx, query->b.type, 1);
si_update_prims_generated_query_state(sctx, query->b.type, 1);

if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA)
si_need_gfx_cs_space(sctx);

- /* Get a new query buffer if needed. */
- if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) {
- struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer);
- *qbuf = query->buffer;
- query->buffer.results_end = 0;
- query->buffer.previous = qbuf;
- query->buffer.buf = si_new_query_buffer(sctx->screen, query);
- if (!query->buffer.buf)
- return;
- }
-
- /* emit begin query */
va = query->buffer.buf->gpu_address + query->buffer.results_end;
-
query->ops->emit_start(sctx, query, query->buffer.buf, va);
}

static void si_query_hw_do_emit_stop(struct si_context *sctx,
struct si_query_hw *query,
struct r600_resource *buffer,
uint64_t va)
{
struct radeon_cmdbuf *cs = sctx->gfx_cs;
uint64_t fence_va = 0;
@@ -905,26 +921,30 @@ static void si_query_hw_do_emit_stop(struct si_context *sctx,
query->buffer.buf, fence_va, 0x80000000,
query->b.type);
}
}

static void si_query_hw_emit_stop(struct si_context *sctx,
struct si_query_hw *query)
{
uint64_t va;

- if (!query->buffer.buf)
- return; // previous buffer allocation failure
-
/* The queries which need begin already called this in begin_query. */
- if (query->flags & SI_QUERY_HW_FLAG_NO_START)
+ if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
si_need_gfx_cs_space(sctx);
+ if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer,
+ query->result_size))
+ return;
+ }
+
+ if (!query->buffer.buf)
+ return; // previous buffer allocation failure

/* emit end query */
va = query->buffer.buf->gpu_address + query->buffer.results_end;

query->ops->emit_stop(sctx, query, query->buffer.buf, va);

query->buffer.results_end += query->result_size;

si_update_occlusion_query_state(sctx, query->b.type, -1);
si_update_prims_generated_query_state(sctx, query->b.type, -1);
@@ -1054,59 +1074,32 @@ static void si_destroy_query(struct pipe_context *ctx, struct pipe_query *query)

static boolean si_begin_query(struct pipe_context *ctx,
struct pipe_query *query)
{
struct si_context *sctx = (struct si_context *)ctx;
struct si_query *rquery = (struct si_query *)query;

return rquery->ops->begin(sctx, rquery);
}

-void si_query_hw_reset_buffers(struct si_context *sctx,
- struct si_query_hw *query)
-{
- struct si_query_buffer *prev = query->buffer.previous;
-
- /* Discard the old query buffers. */
- while (prev) {
- struct si_query_buffer *qbuf = prev;
- prev = prev->previous;
- r600_resource_reference(&qbuf->buf, NULL);
- FREE(qbuf);
- }
-
- query->buffer.results_end = 0;
- query->buffer.previous = NULL;
-
- /* Obtain a new buffer if the current one can't be mapped without a stall. */
- if (si_rings_is_buffer_referenced(sctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) ||
- !sctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
- r600_resource_reference(&query->buffer.buf, NULL);
- query->buffer.buf = si_new_query_buffer(sctx->screen, query);
- } else {
- if (!query->ops->prepare_buffer(sctx->screen, query, query->buffer.buf))
- r600_resource_reference(&query->buffer.buf, NULL);
- }
-}
-
bool si_query_hw_begin(struct si_context *sctx,
struct si_query *rquery)
{
struct si_query_hw *query = (struct si_query_hw *)rquery;

if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
assert(0);
return false;
}

if (!(query->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES))
- si_query_hw_reset_buffers(sctx, query);
+ si_query_buffer_reset(sctx, &query->buffer);

r600_resource_reference(&query->workaround_buf, NULL);

si_query_hw_emit_start(sctx, query);
if (!query->buffer.buf)
return false;

LIST_ADDTAIL(&query->b.active_list, &sctx->active_queries);
sctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
return true;
@@ -1119,21 +1112,21 @@ static bool si_end_query(struct pipe_context *ctx, struct pipe_query *query)

return rquery->ops->end(sctx, rquery);
}

bool si_query_hw_end(struct si_context *sctx,
struct si_query *rquery)
{
struct si_query_hw *query = (struct si_query_hw *)rquery;

if (query->flags & SI_QUERY_HW_FLAG_NO_START)
- si_query_hw_reset_buffers(sctx, query);
+ si_query_buffer_reset(sctx, &query->buffer);

si_query_hw_emit_stop(sctx, query);

if (!(query->flags & SI_QUERY_HW_FLAG_NO_START)) {
LIST_DELINIT(&query->b.active_list);
sctx->num_cs_dw_queries_suspend -= query->b.num_cs_dw_suspend;
}

if (!query->buffer.buf)
return false;
diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h
index ebd965a004f..63af760a271 100644
--- a/src/gallium/drivers/radeonsi/si_query.h
+++ b/src/gallium/drivers/radeonsi/si_query.h
@@ -27,20 +27,21 @@

#include "util/u_threaded_context.h"

struct pipe_context;
struct pipe_query;
struct pipe_resource;

struct si_screen;
struct si_context;
struct si_query;
+struct si_query_buffer;
struct si_query_hw;
struct r600_resource;

enum {
SI_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC,
SI_QUERY_DECOMPRESS_CALLS,
SI_QUERY_MRT_DRAW_CALLS,
SI_QUERY_PRIM_RESTART_CALLS,
SI_QUERY_SPILL_DRAW_CALLS,
SI_QUERY_COMPUTE_CALLS,
@@ -153,23 +154,21 @@ struct si_query {
};

enum {
SI_QUERY_HW_FLAG_NO_START = (1 << 0),
/* gap */
/* whether begin_query doesn't clear the result */
SI_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2),
};

struct si_query_hw_ops {
- bool (*prepare_buffer)(struct si_screen *,
- struct si_query_hw *,
- struct r600_resource *);
+ bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *);
void (*emit_start)(struct si_context *,
struct si_query_hw *,
struct r600_resource *buffer, uint64_t va);
void (*emit_stop)(struct si_context *,
struct si_query_hw *,
struct r600_resource *buffer, uint64_t va);
void (*clear_result)(struct si_query_hw *, union pipe_query_result *);
void (*add_result)(struct si_screen *screen,
struct si_query_hw *, void *buffer,
union pipe_query_result *result);
@@ -179,40 +178,45 @@ struct si_query_buffer {
/* The buffer where query results are stored. */
struct r600_resource *buf;
/* Offset of the next free result after current query data */
unsigned results_end;
/* If a query buffer is full, a new buffer is created and the old one
* is put in here. When we calculate the result, we sum up the samples
* from all buffers. */
struct si_query_buffer *previous;
};

+void si_query_buffer_destroy(struct si_screen *sscreen, struct si_query_buffer *buffer);
+void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer);
+bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer,
+ bool (*prepare_buffer)(struct si_context *, struct si_query_buffer*),
+ unsigned size);
+
+
struct si_query_hw {
struct si_query b;
struct si_query_hw_ops *ops;
unsigned flags;

/* The query buffer and how many results are in it. */
struct si_query_buffer buffer;
/* Size of the result in memory for both begin_query and end_query,
* this can be one or two numbers, or it could even be a size of a structure. */
unsigned result_size;
/* For transform feedback: which stream the query is for */
unsigned stream;

/* Workaround via compute shader */
struct r600_resource *workaround_buf;
unsigned workaround_offset;
};

-bool si_query_hw_init(struct si_screen *sscreen,
- struct si_query_hw *query);
void si_query_hw_destroy(struct si_screen *sscreen,
struct si_query *rquery);
bool si_query_hw_begin(struct si_context *sctx,
struct si_query *rquery);
bool si_query_hw_end(struct si_context *sctx,
struct si_query *rquery);
bool si_query_hw_get_result(struct si_context *sctx,
struct si_query *rquery,
bool wait,
union pipe_query_result *result);
@@ -237,20 +241,17 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx,
unsigned num_queries,
unsigned *query_types);

int si_get_perfcounter_info(struct si_screen *,
unsigned index,
struct pipe_driver_query_info *info);
int si_get_perfcounter_group_info(struct si_screen *,
unsigned index,
struct pipe_driver_query_group_info *info);

-void si_query_hw_reset_buffers(struct si_context *sctx,
- struct si_query_hw *query);
-
struct si_qbo_state {
void *saved_compute;
struct pipe_constant_buffer saved_const0;
struct pipe_shader_buffer saved_ssbo[3];
};

#endif /* SI_QUERY_H */
diff --git a/src/gallium/drivers/radeonsi/si_texture.c b/src/gallium/drivers/radeonsi/si_texture.c
index ac1a0aa6097..9df12e0f5bd 100644
--- a/src/gallium/drivers/radeonsi/si_texture.c
+++ b/src/gallium/drivers/radeonsi/si_texture.c
@@ -2276,25 +2276,24 @@ void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx,
struct si_context *sctx = (struct si_context*)ctx;
struct pipe_query *tmp;
unsigned i = vi_get_context_dcc_stats_index(sctx, tex);
bool query_active = sctx->dcc_stats[i].query_active;
bool disable = false;

if (sctx->dcc_stats[i].ps_stats[2]) {
union pipe_query_result result;

/* Read the results. */
- ctx->get_query_result(ctx, sctx->dcc_stats[i].ps_stats[2],
+ struct pipe_query *query = sctx->dcc_stats[i].ps_stats[2];
+ ctx->get_query_result(ctx, query,
true, &result);
- si_query_hw_reset_buffers(sctx,
- (struct si_query_hw*)
- sctx->dcc_stats[i].ps_stats[2]);
+ si_query_buffer_reset(sctx, &((struct si_query_hw*)query)->buffer);

/* Compute the approximate number of fullscreen draws. */
tex->ps_draw_ratio =
result.pipeline_statistics.ps_invocations /
(tex->buffer.b.b.width0 * tex->buffer.b.b.height0);
sctx->last_tex_ps_draw_ratio = tex->ps_draw_ratio;

disable = tex->dcc_separate_buffer &&
!vi_should_enable_separate_dcc(tex);
}
--
2.19.1
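
For illustration, here is how a consumer is meant to drive the new
si_query_buffer helpers. Only si_query_buffer_alloc/reset/destroy come
from this patch; the my_counter type, its result size and the packet
emission are hypothetical placeholders:

struct my_counter {
	struct si_query_buffer buffer; /* must start zero-initialized */
	unsigned result_size;          /* bytes appended per emit */
};

static bool my_counter_emit(struct si_context *sctx, struct my_counter *c)
{
	/* Grows the buffer chain on demand; fails only on allocation error.
	 * No prepare_buffer callback is needed for plain GPU-written data. */
	if (!si_query_buffer_alloc(sctx, &c->buffer, NULL, c->result_size))
		return false;

	uint64_t va = c->buffer.buf->gpu_address + c->buffer.results_end;
	/* ... emit packets that write c->result_size bytes at va ... */
	c->buffer.results_end += c->result_size;
	return true;
}

static void my_counter_begin(struct si_context *sctx, struct my_counter *c)
{
	/* Keeps the oldest buffer only if it is idle; drops the rest.
	 * At teardown, si_query_buffer_destroy(sctx->screen, &c->buffer)
	 * releases the whole chain. */
	si_query_buffer_reset(sctx, &c->buffer);
}

Note also the container_of() trick in si_query_hw_prepare_buffer(): the
dummy si_query_hw_s instance exists only so container_of() can compute
the offset of the buffer member, letting the callback recover the owning
si_query_hw from a si_query_buffer pointer.
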
Nicolai Hähnle
2018-12-06 14:00:43 UTC
From: Nicolai Hähnle <***@amd.com>

---
src/gallium/drivers/radeonsi/si_perfcounter.c | 13 ++--
src/gallium/drivers/radeonsi/si_query.c | 75 ++++++++++---------
src/gallium/drivers/radeonsi/si_query.h | 18 +++--
3 files changed, 62 insertions(+), 44 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c
index 69e149c76b6..0b3d8f89273 100644
--- a/src/gallium/drivers/radeonsi/si_perfcounter.c
+++ b/src/gallium/drivers/radeonsi/si_perfcounter.c
@@ -861,21 +861,24 @@ static void si_pc_query_add_result(struct si_screen *screen,
uint32_t value = results[counter->base + j * counter->stride];
result->batch[i].u64 += value;
}
}
}

static struct si_query_ops batch_query_ops = {
.destroy = si_pc_query_destroy,
.begin = si_query_hw_begin,
.end = si_query_hw_end,
- .get_result = si_query_hw_get_result
+ .get_result = si_query_hw_get_result,
+
+ .suspend = si_query_hw_suspend,
+ .resume = si_query_hw_resume,
};

static struct si_query_hw_ops batch_query_hw_ops = {
.prepare_buffer = si_pc_query_prepare_buffer,
.emit_start = si_pc_query_emit_start,
.emit_stop = si_pc_query_emit_stop,
.clear_result = si_pc_query_clear_result,
.add_result = si_pc_query_add_result,
};

@@ -994,41 +997,41 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx,
fprintf(stderr,
"perfcounter group %s: too many selected\n",
block->b->b->name);
goto error;
}
group->selectors[group->num_counters] = sub_index;
++group->num_counters;
}

/* Compute result bases and CS size per group */
- query->b.num_cs_dw_end = pc->num_stop_cs_dwords;
- query->b.num_cs_dw_end += pc->num_instance_cs_dwords;
+ query->b.b.num_cs_dw_suspend = pc->num_stop_cs_dwords;
+ query->b.b.num_cs_dw_suspend += pc->num_instance_cs_dwords;

i = 0;
for (group = query->groups; group; group = group->next) {
struct si_pc_block *block = group->block;
unsigned read_dw;
unsigned instances = 1;

if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
instances = screen->info.max_se;
if (group->instance < 0)
instances *= block->num_instances;

group->result_base = i;
query->b.result_size += sizeof(uint64_t) * instances * group->num_counters;
i += instances * group->num_counters;

read_dw = 6 * group->num_counters;
- query->b.num_cs_dw_end += instances * read_dw;
- query->b.num_cs_dw_end += instances * pc->num_instance_cs_dwords;
+ query->b.b.num_cs_dw_suspend += instances * read_dw;
+ query->b.b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords;
}

if (query->shaders) {
if (query->shaders == SI_PC_SHADERS_WINDOWING)
query->shaders = 0xffffffff;
}

/* Map user-supplied query array to result indices */
query->counters = CALLOC(num_queries, sizeof(*query->counters));
for (i = 0; i < num_queries; ++i) {
diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c
index aed3e1e80c1..479a1bbf2c4 100644
--- a/src/gallium/drivers/radeonsi/si_query.c
+++ b/src/gallium/drivers/radeonsi/si_query.c
@@ -27,20 +27,22 @@
#include "si_pipe.h"
#include "si_query.h"
#include "util/u_memory.h"
#include "util/u_upload_mgr.h"
#include "util/os_time.h"
#include "util/u_suballoc.h"
#include "amd/common/sid.h"

#define SI_MAX_STREAMS 4

+static struct si_query_ops query_hw_ops;
+
struct si_hw_query_params {
unsigned start_offset;
unsigned end_offset;
unsigned fence_offset;
unsigned pair_stride;
unsigned pair_count;
};

/* Queries without buffer handling or suspend/resume. */
struct si_query_sw {
@@ -600,28 +602,20 @@ static bool si_query_hw_prepare_buffer(struct si_screen *sscreen,
}

static void si_query_hw_get_result_resource(struct si_context *sctx,
struct si_query *rquery,
bool wait,
enum pipe_query_value_type result_type,
int index,
struct pipe_resource *resource,
unsigned offset);

-static struct si_query_ops query_hw_ops = {
- .destroy = si_query_hw_destroy,
- .begin = si_query_hw_begin,
- .end = si_query_hw_end,
- .get_result = si_query_hw_get_result,
- .get_result_resource = si_query_hw_get_result_resource,
-};
-
static void si_query_hw_do_emit_start(struct si_context *sctx,
struct si_query_hw *query,
struct r600_resource *buffer,
uint64_t va);
static void si_query_hw_do_emit_stop(struct si_context *sctx,
struct si_query_hw *query,
struct r600_resource *buffer,
uint64_t va);
static void si_query_hw_add_result(struct si_screen *sscreen,
struct si_query_hw *, void *buffer,
@@ -658,55 +652,54 @@ static struct pipe_query *si_query_hw_create(struct si_screen *sscreen,
query->b.type = query_type;
query->b.ops = &query_hw_ops;
query->ops = &query_hw_default_hw_ops;

switch (query_type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
query->result_size = 16 * sscreen->info.num_render_backends;
query->result_size += 16; /* for the fence + alignment */
- query->num_cs_dw_end = 6 + si_cp_write_fence_dwords(sscreen);
+ query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
break;
case SI_QUERY_TIME_ELAPSED_SDMA:
/* GET_GLOBAL_TIMESTAMP only works if the offset is a multiple of 32. */
query->result_size = 64;
- query->num_cs_dw_end = 0;
break;
case PIPE_QUERY_TIME_ELAPSED:
query->result_size = 24;
- query->num_cs_dw_end = 8 + si_cp_write_fence_dwords(sscreen);
+ query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
break;
case PIPE_QUERY_TIMESTAMP:
query->result_size = 16;
- query->num_cs_dw_end = 8 + si_cp_write_fence_dwords(sscreen);
+ query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
query->flags = SI_QUERY_HW_FLAG_NO_START;
break;
case PIPE_QUERY_PRIMITIVES_EMITTED:
case PIPE_QUERY_PRIMITIVES_GENERATED:
case PIPE_QUERY_SO_STATISTICS:
case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
query->result_size = 32;
- query->num_cs_dw_end = 6;
+ query->b.num_cs_dw_suspend = 6;
query->stream = index;
break;
case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
query->result_size = 32 * SI_MAX_STREAMS;
- query->num_cs_dw_end = 6 * SI_MAX_STREAMS;
+ query->b.num_cs_dw_suspend = 6 * SI_MAX_STREAMS;
break;
case PIPE_QUERY_PIPELINE_STATISTICS:
/* 11 values on GCN. */
query->result_size = 11 * 16;
query->result_size += 8; /* for the fence + alignment */
- query->num_cs_dw_end = 6 + si_cp_write_fence_dwords(sscreen);
+ query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
break;
default:
assert(0);
FREE(query);
return NULL;
}

if (!si_query_hw_init(sscreen, query)) {
FREE(query);
return NULL;
@@ -833,22 +826,20 @@ static void si_query_hw_emit_start(struct si_context *sctx,
query->buffer.previous = qbuf;
query->buffer.buf = si_new_query_buffer(sctx->screen, query);
if (!query->buffer.buf)
return;
}

/* emit begin query */
va = query->buffer.buf->gpu_address + query->buffer.results_end;

query->ops->emit_start(sctx, query, query->buffer.buf, va);
-
- sctx->num_cs_dw_queries_suspend += query->num_cs_dw_end;
}

static void si_query_hw_do_emit_stop(struct si_context *sctx,
struct si_query_hw *query,
struct r600_resource *buffer,
uint64_t va)
{
struct radeon_cmdbuf *cs = sctx->gfx_cs;
uint64_t fence_va = 0;

@@ -928,23 +919,20 @@ static void si_query_hw_emit_stop(struct si_context *sctx,
if (query->flags & SI_QUERY_HW_FLAG_NO_START)
si_need_gfx_cs_space(sctx);

/* emit end query */
va = query->buffer.buf->gpu_address + query->buffer.results_end;

query->ops->emit_stop(sctx, query, query->buffer.buf, va);

query->buffer.results_end += query->result_size;

- if (!(query->flags & SI_QUERY_HW_FLAG_NO_START))
- sctx->num_cs_dw_queries_suspend -= query->num_cs_dw_end;
-
si_update_occlusion_query_state(sctx, query->b.type, -1);
si_update_prims_generated_query_state(sctx, query->b.type, -1);
}

static void emit_set_predicate(struct si_context *ctx,
struct r600_resource *buf, uint64_t va,
uint32_t op)
{
struct radeon_cmdbuf *cs = ctx->gfx_cs;

@@ -1112,21 +1100,22 @@ bool si_query_hw_begin(struct si_context *sctx,

if (!(query->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES))
si_query_hw_reset_buffers(sctx, query);

r600_resource_reference(&query->workaround_buf, NULL);

si_query_hw_emit_start(sctx, query);
if (!query->buffer.buf)
return false;

- LIST_ADDTAIL(&query->list, &sctx->active_queries);
+ LIST_ADDTAIL(&query->b.active_list, &sctx->active_queries);
+ sctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
return true;
}

static bool si_end_query(struct pipe_context *ctx, struct pipe_query *query)
{
struct si_context *sctx = (struct si_context *)ctx;
struct si_query *rquery = (struct si_query *)query;

return rquery->ops->end(sctx, rquery);
}
@@ -1134,22 +1123,24 @@ static bool si_end_query(struct pipe_context *ctx, struct pipe_query *query)
bool si_query_hw_end(struct si_context *sctx,
struct si_query *rquery)
{
struct si_query_hw *query = (struct si_query_hw *)rquery;

if (query->flags & SI_QUERY_HW_FLAG_NO_START)
si_query_hw_reset_buffers(sctx, query);

si_query_hw_emit_stop(sctx, query);

- if (!(query->flags & SI_QUERY_HW_FLAG_NO_START))
- LIST_DELINIT(&query->list);
+ if (!(query->flags & SI_QUERY_HW_FLAG_NO_START)) {
+ LIST_DELINIT(&query->b.active_list);
+ sctx->num_cs_dw_queries_suspend -= query->b.num_cs_dw_suspend;
+ }

if (!query->buffer.buf)
return false;

return true;
}

static void si_get_hw_query_params(struct si_context *sctx,
struct si_query_hw *rquery, int index,
struct si_hw_query_params *params)
@@ -1342,20 +1333,41 @@ static void si_query_hw_add_result(struct si_screen *sscreen,
result->pipeline_statistics.c_primitives,
result->pipeline_statistics.ps_invocations,
result->pipeline_statistics.cs_invocations);
#endif
break;
default:
assert(0);
}
}

+void si_query_hw_suspend(struct si_context *sctx, struct si_query *query)
+{
+ si_query_hw_emit_stop(sctx, (struct si_query_hw *)query);
+}
+
+void si_query_hw_resume(struct si_context *sctx, struct si_query *query)
+{
+ si_query_hw_emit_start(sctx, (struct si_query_hw *)query);
+}
+
+static struct si_query_ops query_hw_ops = {
+ .destroy = si_query_hw_destroy,
+ .begin = si_query_hw_begin,
+ .end = si_query_hw_end,
+ .get_result = si_query_hw_get_result,
+ .get_result_resource = si_query_hw_get_result_resource,
+
+ .suspend = si_query_hw_suspend,
+ .resume = si_query_hw_resume,
+};
+
static boolean si_get_query_result(struct pipe_context *ctx,
struct pipe_query *query, boolean wait,
union pipe_query_result *result)
{
struct si_context *sctx = (struct si_context *)ctx;
struct si_query *rquery = (struct si_query *)query;

return rquery->ops->get_result(sctx, rquery, wait, result);
}

@@ -1637,40 +1649,35 @@ static void si_render_condition(struct pipe_context *ctx,

sctx->render_cond = query;
sctx->render_cond_invert = condition;
sctx->render_cond_mode = mode;

si_set_atom_dirty(sctx, atom, query != NULL);
}

void si_suspend_queries(struct si_context *sctx)
{
- struct si_query_hw *query;
+ struct si_query *query;

- LIST_FOR_EACH_ENTRY(query, &sctx->active_queries, list) {
- si_query_hw_emit_stop(sctx, query);
- }
- assert(sctx->num_cs_dw_queries_suspend == 0);
+ LIST_FOR_EACH_ENTRY(query, &sctx->active_queries, active_list)
+ query->ops->suspend(sctx, query);
}

void si_resume_queries(struct si_context *sctx)
{
- struct si_query_hw *query;
-
- assert(sctx->num_cs_dw_queries_suspend == 0);
+ struct si_query *query;

/* Check CS space here. Resuming must not be interrupted by flushes. */
si_need_gfx_cs_space(sctx);

- LIST_FOR_EACH_ENTRY(query, &sctx->active_queries, list) {
- si_query_hw_emit_start(sctx, query);
- }
+ LIST_FOR_EACH_ENTRY(query, &sctx->active_queries, active_list)
+ query->ops->resume(sctx, query);
}

#define XFULL(name_, query_type_, type_, result_type_, group_id_) \
{ \
.name = name_, \
.query_type = SI_QUERY_##query_type_, \
.type = PIPE_DRIVER_QUERY_TYPE_##type_, \
.result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \
.group_id = group_id_ \
}
diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h
index 032946edf4d..ebd965a004f 100644
--- a/src/gallium/drivers/radeonsi/si_query.h
+++ b/src/gallium/drivers/radeonsi/si_query.h
@@ -126,28 +126,37 @@ struct si_query_ops {
bool (*end)(struct si_context *, struct si_query *);
bool (*get_result)(struct si_context *,
struct si_query *, bool wait,
union pipe_query_result *result);
void (*get_result_resource)(struct si_context *,
struct si_query *, bool wait,
enum pipe_query_value_type result_type,
int index,
struct pipe_resource *resource,
unsigned offset);
+
+ void (*suspend)(struct si_context *, struct si_query *);
+ void (*resume)(struct si_context *, struct si_query *);
};

struct si_query {
struct threaded_query b;
struct si_query_ops *ops;

- /* The type of query */
+ /* The PIPE_QUERY_xxx type of query */
unsigned type;
+
+ /* The number of dwords for suspend. */
+ unsigned num_cs_dw_suspend;
+
+ /* Linked list of queries that must be suspended at end of CS. */
+ struct list_head active_list;
};

enum {
SI_QUERY_HW_FLAG_NO_START = (1 << 0),
/* gap */
/* whether begin_query doesn't clear the result */
SI_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2),
};

struct si_query_hw_ops {
@@ -180,44 +189,43 @@ struct si_query_buffer {
struct si_query_hw {
struct si_query b;
struct si_query_hw_ops *ops;
unsigned flags;

/* The query buffer and how many results are in it. */
struct si_query_buffer buffer;
/* Size of the result in memory for both begin_query and end_query,
* this can be one or two numbers, or it could even be a size of a structure. */
unsigned result_size;
- /* The number of dwords for end_query. */
- unsigned num_cs_dw_end;
- /* Linked list of queries */
- struct list_head list;
/* For transform feedback: which stream the query is for */
unsigned stream;

/* Workaround via compute shader */
struct r600_resource *workaround_buf;
unsigned workaround_offset;
};

bool si_query_hw_init(struct si_screen *sscreen,
struct si_query_hw *query);
void si_query_hw_destroy(struct si_screen *sscreen,
struct si_query *rquery);
bool si_query_hw_begin(struct si_context *sctx,
struct si_query *rquery);
bool si_query_hw_end(struct si_context *sctx,
struct si_query *rquery);
bool si_query_hw_get_result(struct si_context *sctx,
struct si_query *rquery,
bool wait,
union pipe_query_result *result);
+void si_query_hw_suspend(struct si_context *sctx, struct si_query *query);
+void si_query_hw_resume(struct si_context *sctx, struct si_query *query);
+

/* Performance counters */
struct si_perfcounters {
unsigned num_groups;
unsigned num_blocks;
struct si_pc_block *blocks;

unsigned num_stop_cs_dwords;
unsigned num_instance_cs_dwords;
--
2.19.1
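
With suspend and resume dispatched through si_query_ops, a query type
that is not based on si_query_hw can now take part in the end-of-CS
suspend/resume dance as well. A minimal sketch (the my_query_* names
are invented; only the si_query_ops fields come from this patch):

static void my_query_suspend(struct si_context *sctx, struct si_query *squery)
{
	/* Emit whatever stops counting before the current CS is flushed. */
}

static void my_query_resume(struct si_context *sctx, struct si_query *squery)
{
	/* Re-emit the start of the query in the new CS. */
}

static struct si_query_ops my_query_ops = {
	/* .destroy, .begin, .end, .get_result as for any si_query ... */
	.suspend = my_query_suspend,
	.resume = my_query_resume,
};

The only contract is that begin adds the query to sctx->active_queries
and accounts its num_cs_dw_suspend, and that end removes it again;
si_suspend_queries() and si_resume_queries() just walk that list and
call through the ops.
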
Nicolai Hähnle
2018-12-06 14:00:39 UTC
From: Nicolai Hähnle <***@amd.com>

---
src/gallium/drivers/radeonsi/si_descriptors.c | 107 ++++++++++--------
src/gallium/drivers/radeonsi/si_state.h | 2 +
2 files changed, 64 insertions(+), 45 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 06e95e863eb..111169a8c3d 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -1262,27 +1262,20 @@ static void si_set_constant_buffer(struct si_context *sctx,
buffers->enabled_mask |= 1u << slot;
} else {
/* Clear the descriptor. */
memset(descs->list + slot*4, 0, sizeof(uint32_t) * 4);
buffers->enabled_mask &= ~(1u << slot);
}

sctx->descriptors_dirty |= 1u << descriptors_idx;
}

-void si_set_rw_buffer(struct si_context *sctx,
- uint slot, const struct pipe_constant_buffer *input)
-{
- si_set_constant_buffer(sctx, &sctx->rw_buffers,
- SI_DESCS_RW_BUFFERS, slot, input);
-}
-
static void si_pipe_set_constant_buffer(struct pipe_context *ctx,
enum pipe_shader_type shader, uint slot,
const struct pipe_constant_buffer *input)
{
struct si_context *sctx = (struct si_context *)ctx;

if (shader >= SI_NUM_SHADERS)
return;

if (slot == 0 && input && input->buffer &&
@@ -1303,74 +1296,84 @@ void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader,
cbuf->user_buffer = NULL;
si_get_buffer_from_descriptors(
&sctx->const_and_shader_buffers[shader],
si_const_and_shader_buffer_descriptors(sctx, shader),
si_get_constbuf_slot(slot),
&cbuf->buffer, &cbuf->buffer_offset, &cbuf->buffer_size);
}

/* SHADER BUFFERS */

+static void si_set_shader_buffer(struct si_context *sctx,
+ struct si_buffer_resources *buffers,
+ unsigned descriptors_idx,
+ uint slot, const struct pipe_shader_buffer *sbuffer,
+ enum radeon_bo_priority priority)
+{
+ struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
+ uint32_t *desc = descs->list + slot * 4;
+
+ if (!sbuffer || !sbuffer->buffer) {
+ pipe_resource_reference(&buffers->buffers[slot], NULL);
+ memset(desc, 0, sizeof(uint32_t) * 4);
+ buffers->enabled_mask &= ~(1u << slot);
+ sctx->descriptors_dirty |= 1u << descriptors_idx;
+ return;
+ }
+
+ struct r600_resource *buf = r600_resource(sbuffer->buffer);
+ uint64_t va = buf->gpu_address + sbuffer->buffer_offset;
+
+ desc[0] = va;
+ desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
+ S_008F04_STRIDE(0);
+ desc[2] = sbuffer->buffer_size;
+ desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+ S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+ S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+ S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+ S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+
+ pipe_resource_reference(&buffers->buffers[slot], &buf->b.b);
+ radeon_add_to_gfx_buffer_list_check_mem(sctx, buf,
+ buffers->shader_usage,
+ priority, true);
+
+ buffers->enabled_mask |= 1u << slot;
+ sctx->descriptors_dirty |= 1u << descriptors_idx;
+
+ util_range_add(&buf->valid_buffer_range, sbuffer->buffer_offset,
+ sbuffer->buffer_offset + sbuffer->buffer_size);
+}
+
static void si_set_shader_buffers(struct pipe_context *ctx,
enum pipe_shader_type shader,
unsigned start_slot, unsigned count,
const struct pipe_shader_buffer *sbuffers)
{
struct si_context *sctx = (struct si_context *)ctx;
struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader];
- struct si_descriptors *descs = si_const_and_shader_buffer_descriptors(sctx, shader);
+ unsigned descriptors_idx = si_const_and_shader_buffer_descriptors_idx(shader);
unsigned i;

assert(start_slot + count <= SI_NUM_SHADER_BUFFERS);

for (i = 0; i < count; ++i) {
const struct pipe_shader_buffer *sbuffer = sbuffers ? &sbuffers[i] : NULL;
- struct r600_resource *buf;
unsigned slot = si_get_shaderbuf_slot(start_slot + i);
- uint32_t *desc = descs->list + slot * 4;
- uint64_t va;

- if (!sbuffer || !sbuffer->buffer) {
- pipe_resource_reference(&buffers->buffers[slot], NULL);
- memset(desc, 0, sizeof(uint32_t) * 4);
- buffers->enabled_mask &= ~(1u << slot);
- sctx->descriptors_dirty |=
- 1u << si_const_and_shader_buffer_descriptors_idx(shader);
- continue;
- }
+ if (sbuffer && sbuffer->buffer)
+ r600_resource(sbuffer->buffer)->bind_history |= PIPE_BIND_SHADER_BUFFER;

- buf = r600_resource(sbuffer->buffer);
- va = buf->gpu_address + sbuffer->buffer_offset;
-
- desc[0] = va;
- desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
- S_008F04_STRIDE(0);
- desc[2] = sbuffer->buffer_size;
- desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
- S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
- S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
- S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
- S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
- S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
-
- pipe_resource_reference(&buffers->buffers[slot], &buf->b.b);
- radeon_add_to_gfx_buffer_list_check_mem(sctx, buf,
- buffers->shader_usage,
- buffers->priority, true);
- buf->bind_history |= PIPE_BIND_SHADER_BUFFER;
-
- buffers->enabled_mask |= 1u << slot;
- sctx->descriptors_dirty |=
- 1u << si_const_and_shader_buffer_descriptors_idx(shader);
-
- util_range_add(&buf->valid_buffer_range, sbuffer->buffer_offset,
- sbuffer->buffer_offset + sbuffer->buffer_size);
+ si_set_shader_buffer(sctx, buffers, descriptors_idx, slot, sbuffer,
+ buffers->priority);
}
}

void si_get_shader_buffers(struct si_context *sctx,
enum pipe_shader_type shader,
uint start_slot, uint count,
struct pipe_shader_buffer *sbuf)
{
struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader];
struct si_descriptors *descs = si_const_and_shader_buffer_descriptors(sctx, shader);
@@ -1379,20 +1382,34 @@ void si_get_shader_buffers(struct si_context *sctx,
si_get_buffer_from_descriptors(
buffers, descs,
si_get_shaderbuf_slot(start_slot + i),
&sbuf[i].buffer, &sbuf[i].buffer_offset,
&sbuf[i].buffer_size);
}
}

/* RING BUFFERS */

+void si_set_rw_buffer(struct si_context *sctx,
+ uint slot, const struct pipe_constant_buffer *input)
+{
+ si_set_constant_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS,
+ slot, input);
+}
+
+void si_set_rw_shader_buffer(struct si_context *sctx, uint slot,
+ const struct pipe_shader_buffer *sbuffer)
+{
+ si_set_shader_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS,
+ slot, sbuffer, RADEON_PRIO_SHADER_RW_BUFFER);
+}
+
void si_set_ring_buffer(struct si_context *sctx, uint slot,
struct pipe_resource *buffer,
unsigned stride, unsigned num_records,
bool add_tid, bool swizzle,
unsigned element_size, unsigned index_stride, uint64_t offset)
{
struct si_buffer_resources *buffers = &sctx->rw_buffers;
struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS];

/* The stride field in the resource descriptor has 14 bits */
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index bb186f530f0..fdc83e0c2be 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -467,20 +467,22 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx);
void si_all_resident_buffers_begin_new_cs(struct si_context *sctx);
void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
const uint8_t *ptr, unsigned size, uint32_t *const_offset);
void si_update_all_texture_descriptors(struct si_context *sctx);
void si_shader_change_notify(struct si_context *sctx);
void si_update_needs_color_decompress_masks(struct si_context *sctx);
void si_emit_graphics_shader_pointers(struct si_context *sctx);
void si_emit_compute_shader_pointers(struct si_context *sctx);
void si_set_rw_buffer(struct si_context *sctx,
uint slot, const struct pipe_constant_buffer *input);
+void si_set_rw_shader_buffer(struct si_context *sctx, uint slot,
+ const struct pipe_shader_buffer *sbuffer);
void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
uint64_t new_active_mask);
void si_set_active_descriptors_for_shader(struct si_context *sctx,
struct si_shader_selector *sel);
bool si_bindless_descriptor_can_reclaim_slab(void *priv,
struct pb_slab_entry *entry);
struct pb_slab *si_bindless_descriptor_slab_alloc(void *priv, unsigned heap,
unsigned entry_size,
unsigned group_index);
void si_bindless_descriptor_slab_free(void *priv, struct pb_slab *pslab);
--
2.19.1
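
The new si_set_rw_shader_buffer() entry point lets internal users bind a
read-write buffer descriptor in the RW slots without going through the
pipe_context interface. A hedged usage sketch (slot, buffer and size are
placeholders):

static void bind_internal_rw_buffer(struct si_context *sctx, unsigned slot,
				    struct pipe_resource *buf, unsigned size)
{
	struct pipe_shader_buffer sbuf = {
		.buffer = buf,
		.buffer_offset = 0,
		.buffer_size = size,
	};
	si_set_rw_shader_buffer(sctx, slot, &sbuf);
}

Passing NULL instead of &sbuf clears the descriptor and drops the buffer
reference, mirroring what si_set_rw_buffer() does for constant buffers.
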
Nicolai Hähnle
2018-12-06 14:00:41 UTC
From: Nicolai Hähnle <***@amd.com>

Only the gallium entry point (si_pipe_set_constant_buffer) needs to track
bind_history; the other callers of si_set_constant_buffer don't.
---
src/gallium/drivers/radeonsi/si_descriptors.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 111169a8c3d..81f21f2cfc1 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -1230,22 +1230,20 @@ static void si_set_constant_buffer(struct si_context *sctx,
input->buffer_size, &buffer_offset);
if (!buffer) {
/* Just unbind on failure. */
si_set_constant_buffer(sctx, buffers, descriptors_idx, slot, NULL);
return;
}
va = r600_resource(buffer)->gpu_address + buffer_offset;
} else {
pipe_resource_reference(&buffer, input->buffer);
va = r600_resource(buffer)->gpu_address + input->buffer_offset;
- /* Only track usage for non-user buffers. */
- r600_resource(buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
}

/* Set the descriptor. */
uint32_t *desc = descs->list + slot*4;
desc[0] = va;
desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
S_008F04_STRIDE(0);
desc[2] = input->buffer_size;
desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
@@ -1277,20 +1275,23 @@ static void si_pipe_set_constant_buffer(struct pipe_context *ctx,

if (shader >= SI_NUM_SHADERS)
return;

if (slot == 0 && input && input->buffer &&
!(r600_resource(input->buffer)->flags & RADEON_FLAG_32BIT)) {
assert(!"constant buffer 0 must have a 32-bit VM address, use const_uploader");
return;
}

+ if (input && input->buffer)
+ r600_resource(input->buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
+
slot = si_get_constbuf_slot(slot);
si_set_constant_buffer(sctx, &sctx->const_and_shader_buffers[shader],
si_const_and_shader_buffer_descriptors_idx(shader),
slot, input);
}

void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader,
uint slot, struct pipe_constant_buffer *cbuf)
{
cbuf->user_buffer = NULL;
--
2.19.1
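
The practical effect: bind_history is what the driver consults when a
buffer's backing storage is replaced (e.g. in si_rebind_buffer), so only
buffers that were ever bound through the gallium entry point pay the
rebind-scan cost, while driver-internal bindings skip the flag entirely.
Condensed sketch of that consumer, not the literal driver code:

	/* On buffer invalidation: */
	if (buf->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
		/* ... scan the per-shader constant-buffer slots and
		 * update descriptors still pointing at old storage ... */
	}
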
Nicolai Hähnle
2018-12-06 14:00:42 UTC
From: Nicolai Hähnle <***@amd.com>

---
src/gallium/drivers/radeon/r600_perfcounter.c | 639 ----------------
src/gallium/drivers/radeonsi/Makefile.sources | 1 -
src/gallium/drivers/radeonsi/meson.build | 1 -
src/gallium/drivers/radeonsi/si_perfcounter.c | 688 ++++++++++++++++--
src/gallium/drivers/radeonsi/si_pipe.c | 2 +-
src/gallium/drivers/radeonsi/si_pipe.h | 4 +-
src/gallium/drivers/radeonsi/si_query.h | 74 +-
7 files changed, 643 insertions(+), 766 deletions(-)
delete mode 100644 src/gallium/drivers/radeon/r600_perfcounter.c
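
The main structural change in this move: the old si_perfcounter_block
mixed the immutable per-ASIC description with state computed at runtime
(instance counts, generated group/selector names). It is now split into
a read-only si_pc_block_gfxdescr table entry and a per-screen
si_pc_block. Roughly how a descriptor entry looks under the new scheme
(the block name, register values and counts are placeholders, and only
the commonly used si_pc_block_base fields are shown):

static struct si_pc_block_base cik_MYBLOCK = {
	.name = "MYBLOCK",
	.num_counters = 4,
	.select0 = R_PLACEHOLDER_SELECT0,
	.counter0_lo = R_PLACEHOLDER_COUNTER0_LO,
	.layout = SI_PC_MULTI_ALTERNATE,
};

static const struct si_pc_block_gfxdescr groups_CIK[] = {
	{ &cik_MYBLOCK, 16, 1 }, /* 16 selectors, 1 instance */
};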

diff --git a/src/gallium/drivers/radeon/r600_perfcounter.c b/src/gallium/drivers/radeon/r600_perfcounter.c
deleted file mode 100644
index 57c3246898a..00000000000
--- a/src/gallium/drivers/radeon/r600_perfcounter.c
+++ /dev/null
@@ -1,639 +0,0 @@
-/*
- * Copyright 2015 Advanced Micro Devices, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "util/u_memory.h"
-#include "radeonsi/si_query.h"
-#include "radeonsi/si_pipe.h"
-#include "amd/common/sid.h"
-
-/* Max counters per HW block */
-#define SI_QUERY_MAX_COUNTERS 16
-
-static struct si_perfcounter_block *
-lookup_counter(struct si_perfcounters *pc, unsigned index,
- unsigned *base_gid, unsigned *sub_index)
-{
- struct si_perfcounter_block *block = pc->blocks;
- unsigned bid;
-
- *base_gid = 0;
- for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
- unsigned total = block->num_groups * block->num_selectors;
-
- if (index < total) {
- *sub_index = index;
- return block;
- }
-
- index -= total;
- *base_gid += block->num_groups;
- }
-
- return NULL;
-}
-
-static struct si_perfcounter_block *
-lookup_group(struct si_perfcounters *pc, unsigned *index)
-{
- unsigned bid;
- struct si_perfcounter_block *block = pc->blocks;
-
- for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
- if (*index < block->num_groups)
- return block;
- *index -= block->num_groups;
- }
-
- return NULL;
-}
-
-struct si_pc_group {
- struct si_pc_group *next;
- struct si_perfcounter_block *block;
- unsigned sub_gid; /* only used during init */
- unsigned result_base; /* only used during init */
- int se;
- int instance;
- unsigned num_counters;
- unsigned selectors[SI_QUERY_MAX_COUNTERS];
-};
-
-struct si_pc_counter {
- unsigned base;
- unsigned qwords;
- unsigned stride; /* in uint64s */
-};
-
-#define SI_PC_SHADERS_WINDOWING (1 << 31)
-
-struct si_query_pc {
- struct si_query_hw b;
-
- unsigned shaders;
- unsigned num_counters;
- struct si_pc_counter *counters;
- struct si_pc_group *groups;
-};
-
-static void si_pc_query_destroy(struct si_screen *sscreen,
- struct si_query *rquery)
-{
- struct si_query_pc *query = (struct si_query_pc *)rquery;
-
- while (query->groups) {
- struct si_pc_group *group = query->groups;
- query->groups = group->next;
- FREE(group);
- }
-
- FREE(query->counters);
-
- si_query_hw_destroy(sscreen, rquery);
-}
-
-static bool si_pc_query_prepare_buffer(struct si_screen *screen,
- struct si_query_hw *hwquery,
- struct r600_resource *buffer)
-{
- /* no-op */
- return true;
-}
-
-static void si_pc_query_emit_start(struct si_context *sctx,
- struct si_query_hw *hwquery,
- struct r600_resource *buffer, uint64_t va)
-{
- struct si_perfcounters *pc = sctx->screen->perfcounters;
- struct si_query_pc *query = (struct si_query_pc *)hwquery;
- struct si_pc_group *group;
- int current_se = -1;
- int current_instance = -1;
-
- if (query->shaders)
- pc->emit_shaders(sctx, query->shaders);
-
- for (group = query->groups; group; group = group->next) {
- struct si_perfcounter_block *block = group->block;
-
- if (group->se != current_se || group->instance != current_instance) {
- current_se = group->se;
- current_instance = group->instance;
- pc->emit_instance(sctx, group->se, group->instance);
- }
-
- pc->emit_select(sctx, block, group->num_counters, group->selectors);
- }
-
- if (current_se != -1 || current_instance != -1)
- pc->emit_instance(sctx, -1, -1);
-
- pc->emit_start(sctx, buffer, va);
-}
-
-static void si_pc_query_emit_stop(struct si_context *sctx,
- struct si_query_hw *hwquery,
- struct r600_resource *buffer, uint64_t va)
-{
- struct si_perfcounters *pc = sctx->screen->perfcounters;
- struct si_query_pc *query = (struct si_query_pc *)hwquery;
- struct si_pc_group *group;
-
- pc->emit_stop(sctx, buffer, va);
-
- for (group = query->groups; group; group = group->next) {
- struct si_perfcounter_block *block = group->block;
- unsigned se = group->se >= 0 ? group->se : 0;
- unsigned se_end = se + 1;
-
- if ((block->flags & SI_PC_BLOCK_SE) && (group->se < 0))
- se_end = sctx->screen->info.max_se;
-
- do {
- unsigned instance = group->instance >= 0 ? group->instance : 0;
-
- do {
- pc->emit_instance(sctx, se, instance);
- pc->emit_read(sctx, block,
- group->num_counters, group->selectors,
- buffer, va);
- va += sizeof(uint64_t) * group->num_counters;
- } while (group->instance < 0 && ++instance < block->num_instances);
- } while (++se < se_end);
- }
-
- pc->emit_instance(sctx, -1, -1);
-}
-
-static void si_pc_query_clear_result(struct si_query_hw *hwquery,
- union pipe_query_result *result)
-{
- struct si_query_pc *query = (struct si_query_pc *)hwquery;
-
- memset(result, 0, sizeof(result->batch[0]) * query->num_counters);
-}
-
-static void si_pc_query_add_result(struct si_screen *sscreen,
- struct si_query_hw *hwquery,
- void *buffer,
- union pipe_query_result *result)
-{
- struct si_query_pc *query = (struct si_query_pc *)hwquery;
- uint64_t *results = buffer;
- unsigned i, j;
-
- for (i = 0; i < query->num_counters; ++i) {
- struct si_pc_counter *counter = &query->counters[i];
-
- for (j = 0; j < counter->qwords; ++j) {
- uint32_t value = results[counter->base + j * counter->stride];
- result->batch[i].u64 += value;
- }
- }
-}
-
-static struct si_query_ops batch_query_ops = {
- .destroy = si_pc_query_destroy,
- .begin = si_query_hw_begin,
- .end = si_query_hw_end,
- .get_result = si_query_hw_get_result
-};
-
-static struct si_query_hw_ops batch_query_hw_ops = {
- .prepare_buffer = si_pc_query_prepare_buffer,
- .emit_start = si_pc_query_emit_start,
- .emit_stop = si_pc_query_emit_stop,
- .clear_result = si_pc_query_clear_result,
- .add_result = si_pc_query_add_result,
-};
-
-static struct si_pc_group *get_group_state(struct si_screen *screen,
- struct si_query_pc *query,
- struct si_perfcounter_block *block,
- unsigned sub_gid)
-{
- struct si_pc_group *group = query->groups;
-
- while (group) {
- if (group->block == block && group->sub_gid == sub_gid)
- return group;
- group = group->next;
- }
-
- group = CALLOC_STRUCT(si_pc_group);
- if (!group)
- return NULL;
-
- group->block = block;
- group->sub_gid = sub_gid;
-
- if (block->flags & SI_PC_BLOCK_SHADER) {
- unsigned sub_gids = block->num_instances;
- unsigned shader_id;
- unsigned shaders;
- unsigned query_shaders;
-
- if (block->flags & SI_PC_BLOCK_SE_GROUPS)
- sub_gids = sub_gids * screen->info.max_se;
- shader_id = sub_gid / sub_gids;
- sub_gid = sub_gid % sub_gids;
-
- shaders = screen->perfcounters->shader_type_bits[shader_id];
-
- query_shaders = query->shaders & ~SI_PC_SHADERS_WINDOWING;
- if (query_shaders && query_shaders != shaders) {
- fprintf(stderr, "si_perfcounter: incompatible shader groups\n");
- FREE(group);
- return NULL;
- }
- query->shaders = shaders;
- }
-
- if (block->flags & SI_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
- // A non-zero value in query->shaders ensures that the shader
- // masking is reset unless the user explicitly requests one.
- query->shaders = SI_PC_SHADERS_WINDOWING;
- }
-
- if (block->flags & SI_PC_BLOCK_SE_GROUPS) {
- group->se = sub_gid / block->num_instances;
- sub_gid = sub_gid % block->num_instances;
- } else {
- group->se = -1;
- }
-
- if (block->flags & SI_PC_BLOCK_INSTANCE_GROUPS) {
- group->instance = sub_gid;
- } else {
- group->instance = -1;
- }
-
- group->next = query->groups;
- query->groups = group;
-
- return group;
-}
-
-struct pipe_query *si_create_batch_query(struct pipe_context *ctx,
- unsigned num_queries,
- unsigned *query_types)
-{
- struct si_screen *screen =
- (struct si_screen *)ctx->screen;
- struct si_perfcounters *pc = screen->perfcounters;
- struct si_perfcounter_block *block;
- struct si_pc_group *group;
- struct si_query_pc *query;
- unsigned base_gid, sub_gid, sub_index;
- unsigned i, j;
-
- if (!pc)
- return NULL;
-
- query = CALLOC_STRUCT(si_query_pc);
- if (!query)
- return NULL;
-
- query->b.b.ops = &batch_query_ops;
- query->b.ops = &batch_query_hw_ops;
-
- query->num_counters = num_queries;
-
- /* Collect selectors per group */
- for (i = 0; i < num_queries; ++i) {
- unsigned sub_gid;
-
- if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER)
- goto error;
-
- block = lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER,
- &base_gid, &sub_index);
- if (!block)
- goto error;
-
- sub_gid = sub_index / block->num_selectors;
- sub_index = sub_index % block->num_selectors;
-
- group = get_group_state(screen, query, block, sub_gid);
- if (!group)
- goto error;
-
- if (group->num_counters >= block->num_counters) {
- fprintf(stderr,
- "perfcounter group %s: too many selected\n",
- block->basename);
- goto error;
- }
- group->selectors[group->num_counters] = sub_index;
- ++group->num_counters;
- }
-
- /* Compute result bases and CS size per group */
- query->b.num_cs_dw_end = pc->num_stop_cs_dwords;
- query->b.num_cs_dw_end += pc->num_instance_cs_dwords;
-
- i = 0;
- for (group = query->groups; group; group = group->next) {
- struct si_perfcounter_block *block = group->block;
- unsigned read_dw;
- unsigned instances = 1;
-
- if ((block->flags & SI_PC_BLOCK_SE) && group->se < 0)
- instances = screen->info.max_se;
- if (group->instance < 0)
- instances *= block->num_instances;
-
- group->result_base = i;
- query->b.result_size += sizeof(uint64_t) * instances * group->num_counters;
- i += instances * group->num_counters;
-
- read_dw = 6 * group->num_counters;
- query->b.num_cs_dw_end += instances * read_dw;
- query->b.num_cs_dw_end += instances * pc->num_instance_cs_dwords;
- }
-
- if (query->shaders) {
- if (query->shaders == SI_PC_SHADERS_WINDOWING)
- query->shaders = 0xffffffff;
- }
-
- /* Map user-supplied query array to result indices */
- query->counters = CALLOC(num_queries, sizeof(*query->counters));
- for (i = 0; i < num_queries; ++i) {
- struct si_pc_counter *counter = &query->counters[i];
- struct si_perfcounter_block *block;
-
- block = lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER,
- &base_gid, &sub_index);
-
- sub_gid = sub_index / block->num_selectors;
- sub_index = sub_index % block->num_selectors;
-
- group = get_group_state(screen, query, block, sub_gid);
- assert(group != NULL);
-
- for (j = 0; j < group->num_counters; ++j) {
- if (group->selectors[j] == sub_index)
- break;
- }
-
- counter->base = group->result_base + j;
- counter->stride = group->num_counters;
-
- counter->qwords = 1;
- if ((block->flags & SI_PC_BLOCK_SE) && group->se < 0)
- counter->qwords = screen->info.max_se;
- if (group->instance < 0)
- counter->qwords *= block->num_instances;
- }
-
- if (!si_query_hw_init(screen, &query->b))
- goto error;
-
- return (struct pipe_query *)query;
-
-error:
- si_pc_query_destroy(screen, &query->b.b);
- return NULL;
-}
-
-static bool si_init_block_names(struct si_screen *screen,
- struct si_perfcounter_block *block)
-{
- unsigned i, j, k;
- unsigned groups_shader = 1, groups_se = 1, groups_instance = 1;
- unsigned namelen;
- char *groupname;
- char *p;
-
- if (block->flags & SI_PC_BLOCK_INSTANCE_GROUPS)
- groups_instance = block->num_instances;
- if (block->flags & SI_PC_BLOCK_SE_GROUPS)
- groups_se = screen->info.max_se;
- if (block->flags & SI_PC_BLOCK_SHADER)
- groups_shader = screen->perfcounters->num_shader_types;
-
- namelen = strlen(block->basename);
- block->group_name_stride = namelen + 1;
- if (block->flags & SI_PC_BLOCK_SHADER)
- block->group_name_stride += 3;
- if (block->flags & SI_PC_BLOCK_SE_GROUPS) {
- assert(groups_se <= 10);
- block->group_name_stride += 1;
-
- if (block->flags & SI_PC_BLOCK_INSTANCE_GROUPS)
- block->group_name_stride += 1;
- }
- if (block->flags & SI_PC_BLOCK_INSTANCE_GROUPS) {
- assert(groups_instance <= 100);
- block->group_name_stride += 2;
- }
-
- block->group_names = MALLOC(block->num_groups * block->group_name_stride);
- if (!block->group_names)
- return false;
-
- groupname = block->group_names;
- for (i = 0; i < groups_shader; ++i) {
- const char *shader_suffix = screen->perfcounters->shader_type_suffixes[i];
- unsigned shaderlen = strlen(shader_suffix);
- for (j = 0; j < groups_se; ++j) {
- for (k = 0; k < groups_instance; ++k) {
- strcpy(groupname, block->basename);
- p = groupname + namelen;
-
- if (block->flags & SI_PC_BLOCK_SHADER) {
- strcpy(p, shader_suffix);
- p += shaderlen;
- }
-
- if (block->flags & SI_PC_BLOCK_SE_GROUPS) {
- p += sprintf(p, "%d", j);
- if (block->flags & SI_PC_BLOCK_INSTANCE_GROUPS)
- *p++ = '_';
- }
-
- if (block->flags & SI_PC_BLOCK_INSTANCE_GROUPS)
- p += sprintf(p, "%d", k);
-
- groupname += block->group_name_stride;
- }
- }
- }
-
- assert(block->num_selectors <= 1000);
- block->selector_name_stride = block->group_name_stride + 4;
- block->selector_names = MALLOC(block->num_groups * block->num_selectors *
- block->selector_name_stride);
- if (!block->selector_names)
- return false;
-
- groupname = block->group_names;
- p = block->selector_names;
- for (i = 0; i < block->num_groups; ++i) {
- for (j = 0; j < block->num_selectors; ++j) {
- sprintf(p, "%s_%03d", groupname, j);
- p += block->selector_name_stride;
- }
- groupname += block->group_name_stride;
- }
-
- return true;
-}
-
-int si_get_perfcounter_info(struct si_screen *screen,
- unsigned index,
- struct pipe_driver_query_info *info)
-{
- struct si_perfcounters *pc = screen->perfcounters;
- struct si_perfcounter_block *block;
- unsigned base_gid, sub;
-
- if (!pc)
- return 0;
-
- if (!info) {
- unsigned bid, num_queries = 0;
-
- for (bid = 0; bid < pc->num_blocks; ++bid) {
- num_queries += pc->blocks[bid].num_selectors *
- pc->blocks[bid].num_groups;
- }
-
- return num_queries;
- }
-
- block = lookup_counter(pc, index, &base_gid, &sub);
- if (!block)
- return 0;
-
- if (!block->selector_names) {
- if (!si_init_block_names(screen, block))
- return 0;
- }
- info->name = block->selector_names + sub * block->selector_name_stride;
- info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index;
- info->max_value.u64 = 0;
- info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
- info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
- info->group_id = base_gid + sub / block->num_selectors;
- info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
- if (sub > 0 && sub + 1 < block->num_selectors * block->num_groups)
- info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
- return 1;
-}
-
-int si_get_perfcounter_group_info(struct si_screen *screen,
- unsigned index,
- struct pipe_driver_query_group_info *info)
-{
- struct si_perfcounters *pc = screen->perfcounters;
- struct si_perfcounter_block *block;
-
- if (!pc)
- return 0;
-
- if (!info)
- return pc->num_groups;
-
- block = lookup_group(pc, &index);
- if (!block)
- return 0;
-
- if (!block->group_names) {
- if (!si_init_block_names(screen, block))
- return 0;
- }
- info->name = block->group_names + index * block->group_name_stride;
- info->num_queries = block->num_selectors;
- info->max_active_queries = block->num_counters;
- return 1;
-}
-
-void si_perfcounters_destroy(struct si_screen *sscreen)
-{
- if (sscreen->perfcounters)
- sscreen->perfcounters->cleanup(sscreen);
-}
-
-bool si_perfcounters_init(struct si_perfcounters *pc,
- unsigned num_blocks)
-{
- pc->blocks = CALLOC(num_blocks, sizeof(struct si_perfcounter_block));
- if (!pc->blocks)
- return false;
-
- pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
- pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);
-
- return true;
-}
-
-void si_perfcounters_add_block(struct si_screen *sscreen,
- struct si_perfcounters *pc,
- const char *name, unsigned flags,
- unsigned counters, unsigned selectors,
- unsigned instances, void *data)
-{
- struct si_perfcounter_block *block = &pc->blocks[pc->num_blocks];
-
- assert(counters <= SI_QUERY_MAX_COUNTERS);
-
- block->basename = name;
- block->flags = flags;
- block->num_counters = counters;
- block->num_selectors = selectors;
- block->num_instances = MAX2(instances, 1);
- block->data = data;
-
- if (pc->separate_se && (block->flags & SI_PC_BLOCK_SE))
- block->flags |= SI_PC_BLOCK_SE_GROUPS;
- if (pc->separate_instance && block->num_instances > 1)
- block->flags |= SI_PC_BLOCK_INSTANCE_GROUPS;
-
- if (block->flags & SI_PC_BLOCK_INSTANCE_GROUPS) {
- block->num_groups = block->num_instances;
- } else {
- block->num_groups = 1;
- }
-
- if (block->flags & SI_PC_BLOCK_SE_GROUPS)
- block->num_groups *= sscreen->info.max_se;
- if (block->flags & SI_PC_BLOCK_SHADER)
- block->num_groups *= pc->num_shader_types;
-
- ++pc->num_blocks;
- pc->num_groups += block->num_groups;
-}
-
-void si_perfcounters_do_destroy(struct si_perfcounters *pc)
-{
- unsigned i;
-
- for (i = 0; i < pc->num_blocks; ++i) {
- FREE(pc->blocks[i].group_names);
- FREE(pc->blocks[i].selector_names);
- }
- FREE(pc->blocks);
- FREE(pc);
-}
diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources
index dcaf446effe..713629c6e87 100644
--- a/src/gallium/drivers/radeonsi/Makefile.sources
+++ b/src/gallium/drivers/radeonsi/Makefile.sources
@@ -42,21 +42,20 @@ C_SOURCES := \
si_state_draw.c \
si_state_msaa.c \
si_state_shaders.c \
si_state_streamout.c \
si_state_viewport.c \
si_state.h \
si_test_dma.c \
si_test_dma_perf.c \
si_texture.c \
si_uvd.c \
- ../radeon/r600_perfcounter.c \
../radeon/radeon_uvd.c \
../radeon/radeon_uvd.h \
../radeon/radeon_vcn_dec_jpeg.c \
../radeon/radeon_vcn_dec.c \
../radeon/radeon_vcn_dec.h \
../radeon/radeon_vcn_enc_1_2.c \
../radeon/radeon_vcn_enc.c \
../radeon/radeon_vcn_enc.h \
../radeon/radeon_uvd_enc_1_1.c \
../radeon/radeon_uvd_enc.c \
diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build
index ac8ed949ea0..cf3b24cd358 100644
--- a/src/gallium/drivers/radeonsi/meson.build
+++ b/src/gallium/drivers/radeonsi/meson.build
@@ -58,21 +58,20 @@ files_libradeonsi = files(
'si_state_binning.c',
'si_state_draw.c',
'si_state_msaa.c',
'si_state_shaders.c',
'si_state_streamout.c',
'si_state_viewport.c',
'si_test_dma.c',
'si_test_dma_perf.c',
'si_texture.c',
'si_uvd.c',
- '../radeon/r600_perfcounter.c',
'../radeon/radeon_uvd.c',
'../radeon/radeon_uvd.h',
'../radeon/radeon_vcn_enc_1_2.c',
'../radeon/radeon_vcn_enc.c',
'../radeon/radeon_vcn_enc.h',
'../radeon/radeon_vcn_dec_jpeg.c',
'../radeon/radeon_vcn_dec.c',
'../radeon/radeon_vcn_dec.h',
'../radeon/radeon_uvd_enc_1_1.c',
'../radeon/radeon_uvd_enc.c',
diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c
index cea7d57e518..69e149c76b6 100644
--- a/src/gallium/drivers/radeonsi/si_perfcounter.c
+++ b/src/gallium/drivers/radeonsi/si_perfcounter.c
@@ -20,20 +20,38 @@
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

#include "si_build_pm4.h"
#include "si_query.h"
#include "util/u_memory.h"


+enum si_pc_block_flags {
+ /* This block is part of the shader engine */
+ SI_PC_BLOCK_SE = (1 << 0),
+
+ /* Expose per-instance groups instead of summing all instances (within
+ * an SE). */
+ SI_PC_BLOCK_INSTANCE_GROUPS = (1 << 1),
+
+ /* Expose per-SE groups instead of summing instances across SEs. */
+ SI_PC_BLOCK_SE_GROUPS = (1 << 2),
+
+ /* Shader block */
+ SI_PC_BLOCK_SHADER = (1 << 3),
+
+ /* Non-shader block with perfcounters windowed by shaders. */
+ SI_PC_BLOCK_SHADER_WINDOWED = (1 << 4),
+};
+
enum si_pc_reg_layout {
/* All secondary selector dwords follow as one block after the primary
* selector dwords for the counters that have secondary selectors.
*/
SI_PC_MULTI_BLOCK = 0,

	/* Each secondary selector dword follows immediately after the
* corresponding primary.
*/
SI_PC_MULTI_ALTERNATE = 1,
@@ -62,44 +80,88 @@ struct si_pc_block_base {
unsigned select_or;
unsigned select0;
unsigned counter0_lo;
unsigned *select;
unsigned *counters;
unsigned num_multi;
unsigned num_prelude;
unsigned layout;
};

-struct si_pc_block {
+struct si_pc_block_gfxdescr {
struct si_pc_block_base *b;
unsigned selectors;
unsigned instances;
};

+struct si_pc_block {
+ const struct si_pc_block_gfxdescr *b;
+ unsigned num_instances;
+
+ unsigned num_groups;
+ char *group_names;
+ unsigned group_name_stride;
+
+ char *selector_names;
+ unsigned selector_name_stride;
+};
+
/* The order is chosen to be compatible with GPUPerfStudio's hardcoding of
* performance counter group IDs.
*/
static const char * const si_pc_shader_type_suffixes[] = {
"", "_ES", "_GS", "_VS", "_PS", "_LS", "_HS", "_CS"
};

static const unsigned si_pc_shader_type_bits[] = {
0x7f,
S_036780_ES_EN(1),
S_036780_GS_EN(1),
S_036780_VS_EN(1),
S_036780_PS_EN(1),
S_036780_LS_EN(1),
S_036780_HS_EN(1),
S_036780_CS_EN(1),
};

+/* Max counters per HW block */
+#define SI_QUERY_MAX_COUNTERS 16
+
+#define SI_PC_SHADERS_WINDOWING (1 << 31)
+
+struct si_query_group {
+ struct si_query_group *next;
+ struct si_pc_block *block;
+ unsigned sub_gid; /* only used during init */
+ unsigned result_base; /* only used during init */
+ int se;
+ int instance;
+ unsigned num_counters;
+ unsigned selectors[SI_QUERY_MAX_COUNTERS];
+};
+
+struct si_query_counter {
+ unsigned base;
+ unsigned qwords;
+ unsigned stride; /* in uint64s */
+};
+
+struct si_query_pc {
+ struct si_query_hw b;
+
+ unsigned shaders;
+ unsigned num_counters;
+ struct si_query_counter *counters;
+ struct si_query_group *groups;
+};
+
+
static struct si_pc_block_base cik_CB = {
.name = "CB",
.num_counters = 4,
.flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,

.select0 = R_037000_CB_PERFCOUNTER_FILTER,
.counter0_lo = R_035018_CB_PERFCOUNTER0_LO,
.num_multi = 1,
.num_prelude = 1,
.layout = SI_PC_MULTI_ALTERNATE,
@@ -337,21 +399,21 @@ static struct si_pc_block_base cik_SRBM = {
};

/* Both the number of instances and the number of selectors vary between
 * chips of the same class. We only differentiate by class here and simply
 * expose the maximum number over all chips in a class.
*
* Unfortunately, GPUPerfStudio uses the order of performance counter groups
* blindly once it believes it has identified the hardware, so the order of
* blocks here matters.
*/
-static struct si_pc_block groups_CIK[] = {
+static struct si_pc_block_gfxdescr groups_CIK[] = {
{ &cik_CB, 226},
{ &cik_CPF, 17 },
{ &cik_DB, 257},
{ &cik_GRBM, 34 },
{ &cik_GRBMSE, 15 },
{ &cik_PA_SU, 153 },
{ &cik_PA_SC, 395 },
{ &cik_SPI, 186 },
{ &cik_SQ, 252 },
{ &cik_SX, 32 },
@@ -364,21 +426,21 @@ static struct si_pc_block groups_CIK[] = {
{ &cik_VGT, 140 },
{ &cik_IA, 22 },
{ &cik_MC, 22 },
{ &cik_SRBM, 19 },
{ &cik_WD, 22 },
{ &cik_CPG, 46 },
{ &cik_CPC, 22 },

};

-static struct si_pc_block groups_VI[] = {
+static struct si_pc_block_gfxdescr groups_VI[] = {
{ &cik_CB, 405},
{ &cik_CPF, 19 },
{ &cik_DB, 257},
{ &cik_GRBM, 34 },
{ &cik_GRBMSE, 15 },
{ &cik_PA_SU, 154 },
{ &cik_PA_SC, 397 },
{ &cik_SPI, 197 },
{ &cik_SQ, 273 },
{ &cik_SX, 34 },
@@ -391,21 +453,21 @@ static struct si_pc_block groups_VI[] = {
{ &cik_VGT, 147 },
{ &cik_IA, 24 },
{ &cik_MC, 22 },
{ &cik_SRBM, 27 },
{ &cik_WD, 37 },
{ &cik_CPG, 48 },
{ &cik_CPC, 24 },

};

-static struct si_pc_block groups_gfx9[] = {
+static struct si_pc_block_gfxdescr groups_gfx9[] = {
{ &cik_CB, 438},
{ &cik_CPF, 32 },
{ &cik_DB, 328},
{ &cik_GRBM, 38 },
{ &cik_GRBMSE, 16 },
{ &cik_PA_SU, 292 },
{ &cik_PA_SC, 491 },
{ &cik_SPI, 196 },
{ &cik_SQ, 374 },
{ &cik_SX, 208 },
@@ -415,20 +477,72 @@ static struct si_pc_block groups_gfx9[] = {
{ &cik_TD, 57, 16 },
{ &cik_TCP, 85, 16 },
{ &cik_GDS, 121 },
{ &cik_VGT, 148 },
{ &cik_IA, 32 },
{ &cik_WD, 58 },
{ &cik_CPG, 59 },
{ &cik_CPC, 35 },
};

+static bool si_pc_block_has_per_se_groups(const struct si_perfcounters *pc,
+ const struct si_pc_block *block)
+{
+ return block->b->b->flags & SI_PC_BLOCK_SE_GROUPS ||
+ (block->b->b->flags & SI_PC_BLOCK_SE && pc->separate_se);
+}
+
+static bool si_pc_block_has_per_instance_groups(const struct si_perfcounters *pc,
+ const struct si_pc_block *block)
+{
+ return block->b->b->flags & SI_PC_BLOCK_INSTANCE_GROUPS ||
+ (block->num_instances > 1 && pc->separate_instance);
+}
+
+static struct si_pc_block *
+lookup_counter(struct si_perfcounters *pc, unsigned index,
+ unsigned *base_gid, unsigned *sub_index)
+{
+ struct si_pc_block *block = pc->blocks;
+ unsigned bid;
+
+ *base_gid = 0;
+ for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
+ unsigned total = block->num_groups * block->b->selectors;
+
+ if (index < total) {
+ *sub_index = index;
+ return block;
+ }
+
+ index -= total;
+ *base_gid += block->num_groups;
+ }
+
+ return NULL;
+}
+
+static struct si_pc_block *
+lookup_group(struct si_perfcounters *pc, unsigned *index)
+{
+ unsigned bid;
+ struct si_pc_block *block = pc->blocks;
+
+ for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
+ if (*index < block->num_groups)
+ return block;
+ *index -= block->num_groups;
+ }
+
+ return NULL;
+}
+
static void si_pc_emit_instance(struct si_context *sctx,
int se, int instance)
{
struct radeon_cmdbuf *cs = sctx->gfx_cs;
unsigned value = S_030800_SH_BROADCAST_WRITES(1);

if (se >= 0) {
value |= S_030800_SE_INDEX(se);
} else {
value |= S_030800_SE_BROADCAST_WRITES(1);
@@ -447,25 +561,24 @@ static void si_pc_emit_shaders(struct si_context *sctx,
unsigned shaders)
{
struct radeon_cmdbuf *cs = sctx->gfx_cs;

radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
radeon_emit(cs, shaders & 0x7f);
radeon_emit(cs, 0xffffffff);
}

static void si_pc_emit_select(struct si_context *sctx,
- struct si_perfcounter_block *group,
+ struct si_pc_block *block,
unsigned count, unsigned *selectors)
{
- struct si_pc_block *sigroup = (struct si_pc_block *)group->data;
- struct si_pc_block_base *regs = sigroup->b;
+ struct si_pc_block_base *regs = block->b->b;
struct radeon_cmdbuf *cs = sctx->gfx_cs;
unsigned idx;
unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;
unsigned dw;

assert(count <= regs->num_counters);

if (regs->layout & SI_PC_FAKE)
return;

@@ -589,26 +702,24 @@ static void si_pc_emit_stop(struct si_context *sctx,
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
S_036020_PERFMON_STATE(V_036020_STOP_COUNTING) |
S_036020_PERFMON_SAMPLE_ENABLE(1));
}

static void si_pc_emit_read(struct si_context *sctx,
- struct si_perfcounter_block *group,
- unsigned count, unsigned *selectors,
- struct r600_resource *buffer, uint64_t va)
+ struct si_pc_block *block,
+ unsigned count, uint64_t va)
{
- struct si_pc_block *sigroup = (struct si_pc_block *)group->data;
- struct si_pc_block_base *regs = sigroup->b;
+ struct si_pc_block_base *regs = block->b->b;
struct radeon_cmdbuf *cs = sctx->gfx_cs;
unsigned idx;
unsigned reg = regs->counter0_lo;
unsigned reg_delta = 8;

if (!(regs->layout & SI_PC_FAKE)) {
if (regs->layout & SI_PC_REG_REVERSE)
reg_delta = -reg_delta;

for (idx = 0; idx < count; ++idx) {
@@ -634,30 +745,513 @@ static void si_pc_emit_read(struct si_context *sctx,
COPY_DATA_COUNT_SEL);
radeon_emit(cs, 0); /* immediate */
radeon_emit(cs, 0);
radeon_emit(cs, va);
radeon_emit(cs, va >> 32);
va += sizeof(uint64_t);
}
}
}

-static void si_pc_cleanup(struct si_screen *sscreen)
+static void si_pc_query_destroy(struct si_screen *sscreen,
+ struct si_query *rquery)
+{
+ struct si_query_pc *query = (struct si_query_pc *)rquery;
+
+ while (query->groups) {
+ struct si_query_group *group = query->groups;
+ query->groups = group->next;
+ FREE(group);
+ }
+
+ FREE(query->counters);
+
+ si_query_hw_destroy(sscreen, rquery);
+}
+
+static bool si_pc_query_prepare_buffer(struct si_screen *screen,
+ struct si_query_hw *hwquery,
+ struct r600_resource *buffer)
+{
+ /* no-op */
+ return true;
+}
+
+static void si_pc_query_emit_start(struct si_context *sctx,
+ struct si_query_hw *hwquery,
+ struct r600_resource *buffer, uint64_t va)
+{
+ struct si_query_pc *query = (struct si_query_pc *)hwquery;
+ struct si_query_group *group;
+ int current_se = -1;
+ int current_instance = -1;
+
+ if (query->shaders)
+ si_pc_emit_shaders(sctx, query->shaders);
+
+ for (group = query->groups; group; group = group->next) {
+ struct si_pc_block *block = group->block;
+
+ if (group->se != current_se || group->instance != current_instance) {
+ current_se = group->se;
+ current_instance = group->instance;
+ si_pc_emit_instance(sctx, group->se, group->instance);
+ }
+
+ si_pc_emit_select(sctx, block, group->num_counters, group->selectors);
+ }
+
+ if (current_se != -1 || current_instance != -1)
+ si_pc_emit_instance(sctx, -1, -1);
+
+ si_pc_emit_start(sctx, buffer, va);
+}
+
+static void si_pc_query_emit_stop(struct si_context *sctx,
+ struct si_query_hw *hwquery,
+ struct r600_resource *buffer, uint64_t va)
+{
+ struct si_query_pc *query = (struct si_query_pc *)hwquery;
+ struct si_query_group *group;
+
+ si_pc_emit_stop(sctx, buffer, va);
+
+ for (group = query->groups; group; group = group->next) {
+ struct si_pc_block *block = group->block;
+ unsigned se = group->se >= 0 ? group->se : 0;
+ unsigned se_end = se + 1;
+
+ if ((block->b->b->flags & SI_PC_BLOCK_SE) && (group->se < 0))
+ se_end = sctx->screen->info.max_se;
+
+ do {
+ unsigned instance = group->instance >= 0 ? group->instance : 0;
+
+ do {
+ si_pc_emit_instance(sctx, se, instance);
+ si_pc_emit_read(sctx, block, group->num_counters, va);
+ va += sizeof(uint64_t) * group->num_counters;
+ } while (group->instance < 0 && ++instance < block->num_instances);
+ } while (++se < se_end);
+ }
+
+ si_pc_emit_instance(sctx, -1, -1);
+}
+
+static void si_pc_query_clear_result(struct si_query_hw *hwquery,
+ union pipe_query_result *result)
+{
+ struct si_query_pc *query = (struct si_query_pc *)hwquery;
+
+ memset(result, 0, sizeof(result->batch[0]) * query->num_counters);
+}
+
+static void si_pc_query_add_result(struct si_screen *screen,
+ struct si_query_hw *hwquery,
+ void *buffer,
+ union pipe_query_result *result)
+{
+ struct si_query_pc *query = (struct si_query_pc *)hwquery;
+ uint64_t *results = buffer;
+ unsigned i, j;
+
+ for (i = 0; i < query->num_counters; ++i) {
+ struct si_query_counter *counter = &query->counters[i];
+
+ for (j = 0; j < counter->qwords; ++j) {
+ uint32_t value = results[counter->base + j * counter->stride];
+ result->batch[i].u64 += value;
+ }
+ }
+}
+
+static struct si_query_ops batch_query_ops = {
+ .destroy = si_pc_query_destroy,
+ .begin = si_query_hw_begin,
+ .end = si_query_hw_end,
+ .get_result = si_query_hw_get_result
+};
+
+static struct si_query_hw_ops batch_query_hw_ops = {
+ .prepare_buffer = si_pc_query_prepare_buffer,
+ .emit_start = si_pc_query_emit_start,
+ .emit_stop = si_pc_query_emit_stop,
+ .clear_result = si_pc_query_clear_result,
+ .add_result = si_pc_query_add_result,
+};
+
+static struct si_query_group *get_group_state(struct si_screen *screen,
+ struct si_query_pc *query,
+ struct si_pc_block *block,
+ unsigned sub_gid)
+{
+ struct si_query_group *group = query->groups;
+
+ while (group) {
+ if (group->block == block && group->sub_gid == sub_gid)
+ return group;
+ group = group->next;
+ }
+
+ group = CALLOC_STRUCT(si_query_group);
+ if (!group)
+ return NULL;
+
+ group->block = block;
+ group->sub_gid = sub_gid;
+
+ if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
+ unsigned sub_gids = block->num_instances;
+ unsigned shader_id;
+ unsigned shaders;
+ unsigned query_shaders;
+
+ if (si_pc_block_has_per_se_groups(screen->perfcounters, block))
+ sub_gids = sub_gids * screen->info.max_se;
+ shader_id = sub_gid / sub_gids;
+ sub_gid = sub_gid % sub_gids;
+
+ shaders = si_pc_shader_type_bits[shader_id];
+
+ query_shaders = query->shaders & ~SI_PC_SHADERS_WINDOWING;
+ if (query_shaders && query_shaders != shaders) {
+ fprintf(stderr, "si_perfcounter: incompatible shader groups\n");
+ FREE(group);
+ return NULL;
+ }
+ query->shaders = shaders;
+ }
+
+ if (block->b->b->flags & SI_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
+		/* A non-zero value in query->shaders ensures that the shader
+		 * masking is reset unless the user explicitly requests one. */
+ query->shaders = SI_PC_SHADERS_WINDOWING;
+ }
+
+ if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) {
+ group->se = sub_gid / block->num_instances;
+ sub_gid = sub_gid % block->num_instances;
+ } else {
+ group->se = -1;
+ }
+
+ if (si_pc_block_has_per_instance_groups(screen->perfcounters, block)) {
+ group->instance = sub_gid;
+ } else {
+ group->instance = -1;
+ }
+
+ group->next = query->groups;
+ query->groups = group;
+
+ return group;
+}
+
+struct pipe_query *si_create_batch_query(struct pipe_context *ctx,
+ unsigned num_queries,
+ unsigned *query_types)
+{
+ struct si_screen *screen =
+ (struct si_screen *)ctx->screen;
+ struct si_perfcounters *pc = screen->perfcounters;
+ struct si_pc_block *block;
+ struct si_query_group *group;
+ struct si_query_pc *query;
+ unsigned base_gid, sub_gid, sub_index;
+ unsigned i, j;
+
+ if (!pc)
+ return NULL;
+
+ query = CALLOC_STRUCT(si_query_pc);
+ if (!query)
+ return NULL;
+
+ query->b.b.ops = &batch_query_ops;
+ query->b.ops = &batch_query_hw_ops;
+
+ query->num_counters = num_queries;
+
+ /* Collect selectors per group */
+ for (i = 0; i < num_queries; ++i) {
+ unsigned sub_gid;
+
+ if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER)
+ goto error;
+
+ block = lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER,
+ &base_gid, &sub_index);
+ if (!block)
+ goto error;
+
+ sub_gid = sub_index / block->b->selectors;
+ sub_index = sub_index % block->b->selectors;
+
+ group = get_group_state(screen, query, block, sub_gid);
+ if (!group)
+ goto error;
+
+ if (group->num_counters >= block->b->b->num_counters) {
+ fprintf(stderr,
+ "perfcounter group %s: too many selected\n",
+ block->b->b->name);
+ goto error;
+ }
+ group->selectors[group->num_counters] = sub_index;
+ ++group->num_counters;
+ }
+
+ /* Compute result bases and CS size per group */
+ query->b.num_cs_dw_end = pc->num_stop_cs_dwords;
+ query->b.num_cs_dw_end += pc->num_instance_cs_dwords;
+
+ i = 0;
+ for (group = query->groups; group; group = group->next) {
+ struct si_pc_block *block = group->block;
+ unsigned read_dw;
+ unsigned instances = 1;
+
+ if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
+ instances = screen->info.max_se;
+ if (group->instance < 0)
+ instances *= block->num_instances;
+
+ group->result_base = i;
+ query->b.result_size += sizeof(uint64_t) * instances * group->num_counters;
+ i += instances * group->num_counters;
+
+ read_dw = 6 * group->num_counters;
+ query->b.num_cs_dw_end += instances * read_dw;
+ query->b.num_cs_dw_end += instances * pc->num_instance_cs_dwords;
+ }
+
+ if (query->shaders) {
+ if (query->shaders == SI_PC_SHADERS_WINDOWING)
+ query->shaders = 0xffffffff;
+ }
+
+ /* Map user-supplied query array to result indices */
+ query->counters = CALLOC(num_queries, sizeof(*query->counters));
+ for (i = 0; i < num_queries; ++i) {
+ struct si_query_counter *counter = &query->counters[i];
+ struct si_pc_block *block;
+
+ block = lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER,
+ &base_gid, &sub_index);
+
+ sub_gid = sub_index / block->b->selectors;
+ sub_index = sub_index % block->b->selectors;
+
+ group = get_group_state(screen, query, block, sub_gid);
+ assert(group != NULL);
+
+ for (j = 0; j < group->num_counters; ++j) {
+ if (group->selectors[j] == sub_index)
+ break;
+ }
+
+ counter->base = group->result_base + j;
+ counter->stride = group->num_counters;
+
+ counter->qwords = 1;
+ if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
+ counter->qwords = screen->info.max_se;
+ if (group->instance < 0)
+ counter->qwords *= block->num_instances;
+ }
+
+ if (!si_query_hw_init(screen, &query->b))
+ goto error;
+
+ return (struct pipe_query *)query;
+
+error:
+ si_pc_query_destroy(screen, &query->b.b);
+ return NULL;
+}
+
+static bool si_init_block_names(struct si_screen *screen,
+ struct si_pc_block *block)
+{
+ bool per_instance_groups = si_pc_block_has_per_instance_groups(screen->perfcounters, block);
+ bool per_se_groups = si_pc_block_has_per_se_groups(screen->perfcounters, block);
+ unsigned i, j, k;
+ unsigned groups_shader = 1, groups_se = 1, groups_instance = 1;
+ unsigned namelen;
+ char *groupname;
+ char *p;
+
+ if (per_instance_groups)
+ groups_instance = block->num_instances;
+ if (per_se_groups)
+ groups_se = screen->info.max_se;
+ if (block->b->b->flags & SI_PC_BLOCK_SHADER)
+ groups_shader = ARRAY_SIZE(si_pc_shader_type_bits);
+
+ namelen = strlen(block->b->b->name);
+ block->group_name_stride = namelen + 1;
+ if (block->b->b->flags & SI_PC_BLOCK_SHADER)
+ block->group_name_stride += 3;
+ if (per_se_groups) {
+ assert(groups_se <= 10);
+ block->group_name_stride += 1;
+
+ if (per_instance_groups)
+ block->group_name_stride += 1;
+ }
+ if (per_instance_groups) {
+ assert(groups_instance <= 100);
+ block->group_name_stride += 2;
+ }
+
+ block->group_names = MALLOC(block->num_groups * block->group_name_stride);
+ if (!block->group_names)
+ return false;
+
+ groupname = block->group_names;
+ for (i = 0; i < groups_shader; ++i) {
+ const char *shader_suffix = si_pc_shader_type_suffixes[i];
+ unsigned shaderlen = strlen(shader_suffix);
+ for (j = 0; j < groups_se; ++j) {
+ for (k = 0; k < groups_instance; ++k) {
+ strcpy(groupname, block->b->b->name);
+ p = groupname + namelen;
+
+ if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
+ strcpy(p, shader_suffix);
+ p += shaderlen;
+ }
+
+ if (per_se_groups) {
+ p += sprintf(p, "%d", j);
+ if (per_instance_groups)
+ *p++ = '_';
+ }
+
+ if (per_instance_groups)
+ p += sprintf(p, "%d", k);
+
+ groupname += block->group_name_stride;
+ }
+ }
+ }
+
+ assert(block->b->selectors <= 1000);
+ block->selector_name_stride = block->group_name_stride + 4;
+ block->selector_names = MALLOC(block->num_groups * block->b->selectors *
+ block->selector_name_stride);
+ if (!block->selector_names)
+ return false;
+
+ groupname = block->group_names;
+ p = block->selector_names;
+ for (i = 0; i < block->num_groups; ++i) {
+ for (j = 0; j < block->b->selectors; ++j) {
+ sprintf(p, "%s_%03d", groupname, j);
+ p += block->selector_name_stride;
+ }
+ groupname += block->group_name_stride;
+ }
+
+ return true;
+}
+
+int si_get_perfcounter_info(struct si_screen *screen,
+ unsigned index,
+ struct pipe_driver_query_info *info)
+{
+ struct si_perfcounters *pc = screen->perfcounters;
+ struct si_pc_block *block;
+ unsigned base_gid, sub;
+
+ if (!pc)
+ return 0;
+
+ if (!info) {
+ unsigned bid, num_queries = 0;
+
+ for (bid = 0; bid < pc->num_blocks; ++bid) {
+ num_queries += pc->blocks[bid].b->selectors *
+ pc->blocks[bid].num_groups;
+ }
+
+ return num_queries;
+ }
+
+ block = lookup_counter(pc, index, &base_gid, &sub);
+ if (!block)
+ return 0;
+
+ if (!block->selector_names) {
+ if (!si_init_block_names(screen, block))
+ return 0;
+ }
+ info->name = block->selector_names + sub * block->selector_name_stride;
+ info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index;
+ info->max_value.u64 = 0;
+ info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
+ info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
+ info->group_id = base_gid + sub / block->b->selectors;
+ info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
+ if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups)
+ info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
+ return 1;
+}
+
+int si_get_perfcounter_group_info(struct si_screen *screen,
+ unsigned index,
+ struct pipe_driver_query_group_info *info)
{
- si_perfcounters_do_destroy(sscreen->perfcounters);
- sscreen->perfcounters = NULL;
+ struct si_perfcounters *pc = screen->perfcounters;
+ struct si_pc_block *block;
+
+ if (!pc)
+ return 0;
+
+ if (!info)
+ return pc->num_groups;
+
+ block = lookup_group(pc, &index);
+ if (!block)
+ return 0;
+
+ if (!block->group_names) {
+ if (!si_init_block_names(screen, block))
+ return 0;
+ }
+ info->name = block->group_names + index * block->group_name_stride;
+ info->num_queries = block->b->selectors;
+ info->max_active_queries = block->b->b->num_counters;
+ return 1;
+}
+
+void si_destroy_perfcounters(struct si_screen *screen)
+{
+ struct si_perfcounters *pc = screen->perfcounters;
+ unsigned i;
+
+ if (!pc)
+ return;
+
+ for (i = 0; i < pc->num_blocks; ++i) {
+ FREE(pc->blocks[i].group_names);
+ FREE(pc->blocks[i].selector_names);
+ }
+ FREE(pc->blocks);
+ FREE(pc);
+ screen->perfcounters = NULL;
}

void si_init_perfcounters(struct si_screen *screen)
{
struct si_perfcounters *pc;
- struct si_pc_block *blocks;
+ const struct si_pc_block_gfxdescr *blocks;
unsigned num_blocks;
unsigned i;

switch (screen->info.chip_class) {
case CIK:
blocks = groups_CIK;
num_blocks = ARRAY_SIZE(groups_CIK);
break;
case VI:
blocks = groups_VI;
@@ -672,59 +1266,57 @@ void si_init_perfcounters(struct si_screen *screen)
return; /* not implemented */
}

if (screen->info.max_sh_per_se != 1) {
/* This should not happen on non-SI chips. */
fprintf(stderr, "si_init_perfcounters: max_sh_per_se = %d not "
"supported (inaccurate performance counters)\n",
screen->info.max_sh_per_se);
}

- pc = CALLOC_STRUCT(si_perfcounters);
+ screen->perfcounters = pc = CALLOC_STRUCT(si_perfcounters);
if (!pc)
return;

pc->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen);
pc->num_instance_cs_dwords = 3;

- pc->num_shader_types = ARRAY_SIZE(si_pc_shader_type_bits);
- pc->shader_type_suffixes = si_pc_shader_type_suffixes;
- pc->shader_type_bits = si_pc_shader_type_bits;
-
- pc->emit_instance = si_pc_emit_instance;
- pc->emit_shaders = si_pc_emit_shaders;
- pc->emit_select = si_pc_emit_select;
- pc->emit_start = si_pc_emit_start;
- pc->emit_stop = si_pc_emit_stop;
- pc->emit_read = si_pc_emit_read;
- pc->cleanup = si_pc_cleanup;
+ pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
+ pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);

- if (!si_perfcounters_init(pc, num_blocks))
+ pc->blocks = CALLOC(num_blocks, sizeof(struct si_pc_block));
+ if (!pc->blocks)
goto error;
+ pc->num_blocks = num_blocks;

for (i = 0; i < num_blocks; ++i) {
- struct si_pc_block *block = &blocks[i];
- unsigned instances = block->instances;
+ struct si_pc_block *block = &pc->blocks[i];
+ block->b = &blocks[i];
+ block->num_instances = block->b->instances;
+
+ if (!strcmp(block->b->b->name, "CB") ||
+ !strcmp(block->b->b->name, "DB"))
+ block->num_instances = screen->info.max_se;
+ else if (!strcmp(block->b->b->name, "TCC"))
+ block->num_instances = screen->info.num_tcc_blocks;
+ else if (!strcmp(block->b->b->name, "IA"))
+ block->num_instances = MAX2(1, screen->info.max_se / 2);
+
+ if (si_pc_block_has_per_instance_groups(pc, block)) {
+ block->num_groups = block->num_instances;
+ } else {
+ block->num_groups = 1;
+ }

- if (!strcmp(block->b->name, "CB") ||
- !strcmp(block->b->name, "DB"))
- instances = screen->info.max_se;
- else if (!strcmp(block->b->name, "TCC"))
- instances = screen->info.num_tcc_blocks;
- else if (!strcmp(block->b->name, "IA"))
- instances = MAX2(1, screen->info.max_se / 2);
-
- si_perfcounters_add_block(screen, pc,
- block->b->name,
- block->b->flags,
- block->b->num_counters,
- block->selectors,
- instances,
- block);
+ if (si_pc_block_has_per_se_groups(pc, block))
+ block->num_groups *= screen->info.max_se;
+ if (block->b->b->flags & SI_PC_BLOCK_SHADER)
+ block->num_groups *= ARRAY_SIZE(si_pc_shader_type_bits);
+
+ pc->num_groups += block->num_groups;
}

- screen->perfcounters = pc;
return;

error:
- si_perfcounters_do_destroy(pc);
+ si_destroy_perfcounters(screen);
}
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index fd8ff5fa202..f7bab2c59f5 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -698,21 +698,21 @@ static void si_destroy_screen(struct pipe_screen* pscreen)
struct si_shader_part *part = parts[i];

parts[i] = part->next;
ac_shader_binary_clean(&part->binary);
FREE(part);
}
}
mtx_destroy(&sscreen->shader_parts_mutex);
si_destroy_shader_cache(sscreen);

- si_perfcounters_destroy(sscreen);
+ si_destroy_perfcounters(sscreen);
si_gpu_load_kill_thread(sscreen);

mtx_destroy(&sscreen->gpu_load_mutex);
mtx_destroy(&sscreen->aux_context_lock);
sscreen->aux_context->destroy(sscreen->aux_context);

slab_destroy_parent(&sscreen->pool_transfers);

disk_cache_destroy(sscreen->disk_shader_cache);
sscreen->ws->destroy(sscreen->ws);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 179671e8871..bfcfc915124 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1229,25 +1229,23 @@ void si_need_gfx_cs_space(struct si_context *ctx);

/* r600_gpu_load.c */
void si_gpu_load_kill_thread(struct si_screen *sscreen);
uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type);
unsigned si_end_counter(struct si_screen *sscreen, unsigned type,
uint64_t begin);

/* si_compute.c */
void si_init_compute_functions(struct si_context *sctx);

-/* r600_perfcounters.c */
-void si_perfcounters_destroy(struct si_screen *sscreen);
-
/* si_perfcounters.c */
void si_init_perfcounters(struct si_screen *screen);
+void si_destroy_perfcounters(struct si_screen *screen);

/* si_pipe.c */
bool si_check_device_reset(struct si_context *sctx);

/* si_query.c */
void si_init_screen_query_functions(struct si_screen *sscreen);
void si_init_query_functions(struct si_context *sctx);
void si_suspend_queries(struct si_context *sctx);
void si_resume_queries(struct si_context *sctx);

diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h
index cf2eccd862b..032946edf4d 100644
--- a/src/gallium/drivers/radeonsi/si_query.h
+++ b/src/gallium/drivers/radeonsi/si_query.h
@@ -206,115 +206,43 @@ void si_query_hw_destroy(struct si_screen *sscreen,
bool si_query_hw_begin(struct si_context *sctx,
struct si_query *rquery);
bool si_query_hw_end(struct si_context *sctx,
struct si_query *rquery);
bool si_query_hw_get_result(struct si_context *sctx,
struct si_query *rquery,
bool wait,
union pipe_query_result *result);

/* Performance counters */
-enum {
- /* This block is part of the shader engine */
- SI_PC_BLOCK_SE = (1 << 0),
-
- /* Expose per-instance groups instead of summing all instances (within
- * an SE). */
- SI_PC_BLOCK_INSTANCE_GROUPS = (1 << 1),
-
- /* Expose per-SE groups instead of summing instances across SEs. */
- SI_PC_BLOCK_SE_GROUPS = (1 << 2),
-
- /* Shader block */
- SI_PC_BLOCK_SHADER = (1 << 3),
-
- /* Non-shader block with perfcounters windowed by shaders. */
- SI_PC_BLOCK_SHADER_WINDOWED = (1 << 4),
-};
-
-/* Describes a hardware block with performance counters. Multiple instances of
- * each block, possibly per-SE, may exist on the chip. Depending on the block
- * and on the user's configuration, we either
- * (a) expose every instance as a performance counter group,
- * (b) expose a single performance counter group that reports the sum over all
- * instances, or
- * (c) expose one performance counter group per instance, but summed over all
- * shader engines.
- */
-struct si_perfcounter_block {
- const char *basename;
- unsigned flags;
- unsigned num_counters;
- unsigned num_selectors;
- unsigned num_instances;
-
- unsigned num_groups;
- char *group_names;
- unsigned group_name_stride;
-
- char *selector_names;
- unsigned selector_name_stride;
-
- void *data;
-};
-
struct si_perfcounters {
unsigned num_groups;
unsigned num_blocks;
- struct si_perfcounter_block *blocks;
+ struct si_pc_block *blocks;

unsigned num_stop_cs_dwords;
unsigned num_instance_cs_dwords;

- unsigned num_shader_types;
- const char * const *shader_type_suffixes;
- const unsigned *shader_type_bits;
-
- void (*emit_instance)(struct si_context *,
- int se, int instance);
- void (*emit_shaders)(struct si_context *, unsigned shaders);
- void (*emit_select)(struct si_context *,
- struct si_perfcounter_block *,
- unsigned count, unsigned *selectors);
- void (*emit_start)(struct si_context *,
- struct r600_resource *buffer, uint64_t va);
- void (*emit_stop)(struct si_context *,
- struct r600_resource *buffer, uint64_t va);
- void (*emit_read)(struct si_context *,
- struct si_perfcounter_block *,
- unsigned count, unsigned *selectors,
- struct r600_resource *buffer, uint64_t va);
-
- void (*cleanup)(struct si_screen *);
-
bool separate_se;
bool separate_instance;
};

struct pipe_query *si_create_batch_query(struct pipe_context *ctx,
unsigned num_queries,
unsigned *query_types);

int si_get_perfcounter_info(struct si_screen *,
unsigned index,
struct pipe_driver_query_info *info);
int si_get_perfcounter_group_info(struct si_screen *,
unsigned index,
struct pipe_driver_query_group_info *info);

-bool si_perfcounters_init(struct si_perfcounters *, unsigned num_blocks);
-void si_perfcounters_add_block(struct si_screen *,
- struct si_perfcounters *,
- const char *name, unsigned flags,
- unsigned counters, unsigned selectors,
- unsigned instances, void *data);
-void si_perfcounters_do_destroy(struct si_perfcounters *);
void si_query_hw_reset_buffers(struct si_context *sctx,
struct si_query_hw *query);

struct si_qbo_state {
void *saved_compute;
struct pipe_constant_buffer saved_const0;
struct pipe_shader_buffer saved_ssbo[3];
};

#endif /* SI_QUERY_H */
--
2.19.1
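
For reference, the flat query index consumed by si_get_perfcounter_info()
and si_create_batch_query() decomposes per block into num_groups *
selectors entries: lookup_counter() peels off whole blocks, and the caller
then splits the per-block remainder into a group id and a selector index.
A standalone illustration of that arithmetic follows; the block sizes are
made up and this is not driver code:

/* Walks a flat perfcounter index across blocks, mirroring
 * lookup_counter(), then splits the per-block remainder the way
 * si_create_batch_query() does. Block sizes are invented. */
#include <stdio.h>

struct block { const char *name; unsigned num_groups, selectors; };

int main(void)
{
	const struct block blocks[] = {
		{ "CB", 4, 438 },
		{ "SQ", 8, 374 },
	};
	unsigned index = 2000; /* flat index as exposed to the state tracker */
	unsigned base_gid = 0; /* global id of the current block's first group */

	for (unsigned bid = 0; bid < 2; ++bid) {
		unsigned total = blocks[bid].num_groups * blocks[bid].selectors;

		if (index < total) {
			unsigned sub_gid = index / blocks[bid].selectors;
			unsigned sub_index = index % blocks[bid].selectors;

			/* Prints: block SQ, group 0 (global 4), selector 248 */
			printf("block %s, group %u (global %u), selector %u\n",
			       blocks[bid].name, sub_gid, base_gid + sub_gid,
			       sub_index);
			return 0;
		}
		index -= total;
		base_gid += blocks[bid].num_groups;
	}
	return 1;
}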
Nicolai Hähnle
2018-12-06 14:00:45 UTC
Permalink
From: Nicolai Hähnle <***@amd.com>

Remove a level of indirection to make the code more explicit; this should
make it easier to follow what's going on. A toy sketch of the resulting
single-level dispatch follows the diffstat.
---
src/gallium/drivers/radeonsi/si_perfcounter.c | 143 ++++++++++++------
1 file changed, 93 insertions(+), 50 deletions(-)
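
For illustration, here is the shape this converges on: a self-contained
toy model of the single-level dispatch, with invented names and a toy
main() (the real callbacks also take si_context/si_screen and emit into
the CS rather than calling printf):

/* Toy model: the perfcounter query implements the generic ops table
 * directly instead of routing through a second hw-ops vtable, so each
 * callback can cast straight to the concrete type. */
#include <stdio.h>
#include <stdlib.h>

struct query;

struct query_ops {
	void (*destroy)(struct query *q);
	void (*suspend)(struct query *q);
	void (*resume)(struct query *q);
};

struct query {
	const struct query_ops *ops; /* single level of indirection */
};

struct pc_query {
	struct query b;              /* base must come first for the cast */
	unsigned result_size;
};

static void pc_destroy(struct query *q)
{
	free(q);
}

static void pc_suspend(struct query *q)
{
	struct pc_query *pc = (struct pc_query *)q;
	printf("suspend: result_size=%u\n", pc->result_size);
}

static void pc_resume(struct query *q)
{
	struct pc_query *pc = (struct pc_query *)q;
	printf("resume: result_size=%u\n", pc->result_size);
}

static const struct query_ops pc_query_ops = {
	.destroy = pc_destroy,
	.suspend = pc_suspend,
	.resume = pc_resume,
};

int main(void)
{
	struct pc_query *pc = calloc(1, sizeof(*pc));

	if (!pc)
		return 1;
	pc->b.ops = &pc_query_ops;
	pc->result_size = 64;

	pc->b.ops->resume(&pc->b);   /* one indirection, no hw-layer hop */
	pc->b.ops->suspend(&pc->b);
	pc->b.ops->destroy(&pc->b);
	return 0;
}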

diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c
index f0d10c054c4..65197c0daa4 100644
--- a/src/gallium/drivers/radeonsi/si_perfcounter.c
+++ b/src/gallium/drivers/radeonsi/si_perfcounter.c
@@ -139,21 +139,25 @@ struct si_query_group {
unsigned selectors[SI_QUERY_MAX_COUNTERS];
};

struct si_query_counter {
unsigned base;
unsigned qwords;
unsigned stride; /* in uint64s */
};

struct si_query_pc {
- struct si_query_hw b;
+ struct si_query b;
+ struct si_query_buffer buffer;
+
+ /* Size of the results in memory, in bytes. */
+ unsigned result_size;

unsigned shaders;
unsigned num_counters;
struct si_query_counter *counters;
struct si_query_group *groups;
};


static struct si_pc_block_base cik_CB = {
.name = "CB",
@@ -758,70 +762,69 @@ static void si_pc_query_destroy(struct si_screen *sscreen,
struct si_query_pc *query = (struct si_query_pc *)rquery;

while (query->groups) {
struct si_query_group *group = query->groups;
query->groups = group->next;
FREE(group);
}

FREE(query->counters);

- si_query_hw_destroy(sscreen, rquery);
-}
-
-static bool si_pc_query_prepare_buffer(struct si_context *ctx,
- struct si_query_buffer *qbuf)
-{
- /* no-op */
- return true;
+ si_query_buffer_destroy(sscreen, &query->buffer);
+ FREE(query);
}

-static void si_pc_query_emit_start(struct si_context *sctx,
-				   struct si_query_hw *hwquery,
-				   struct r600_resource *buffer, uint64_t va)
+static void si_pc_query_resume(struct si_context *sctx, struct si_query *rquery)
{
- struct si_query_pc *query = (struct si_query_pc *)hwquery;
- struct si_query_group *group;
+ struct si_query_pc *query = (struct si_query_pc *)rquery;
int current_se = -1;
int current_instance = -1;

+ if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size))
+ return;
+ si_need_gfx_cs_space(sctx);
+
if (query->shaders)
si_pc_emit_shaders(sctx, query->shaders);

- for (group = query->groups; group; group = group->next) {
+ for (struct si_query_group *group = query->groups; group; group = group->next) {
struct si_pc_block *block = group->block;

if (group->se != current_se || group->instance != current_instance) {
current_se = group->se;
current_instance = group->instance;
si_pc_emit_instance(sctx, group->se, group->instance);
}

si_pc_emit_select(sctx, block, group->num_counters, group->selectors);
}

if (current_se != -1 || current_instance != -1)
si_pc_emit_instance(sctx, -1, -1);

- si_pc_emit_start(sctx, buffer, va);
+ uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
+ si_pc_emit_start(sctx, query->buffer.buf, va);
}

-static void si_pc_query_emit_stop(struct si_context *sctx,
- struct si_query_hw *hwquery,
- struct r600_resource *buffer, uint64_t va)
+static void si_pc_query_suspend(struct si_context *sctx, struct si_query *rquery)
{
- struct si_query_pc *query = (struct si_query_pc *)hwquery;
- struct si_query_group *group;
+ struct si_query_pc *query = (struct si_query_pc *)rquery;

- si_pc_emit_stop(sctx, buffer, va);
+ if (!query->buffer.buf)
+ return;

- for (group = query->groups; group; group = group->next) {
+ uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
+ query->buffer.results_end += query->result_size;
+
+ si_pc_emit_stop(sctx, query->buffer.buf, va);
+
+ for (struct si_query_group *group = query->groups; group; group = group->next) {
struct si_pc_block *block = group->block;
unsigned se = group->se >= 0 ? group->se : 0;
unsigned se_end = se + 1;

if ((block->b->b->flags & SI_PC_BLOCK_SE) && (group->se < 0))
se_end = sctx->screen->info.max_se;

do {
unsigned instance = group->instance >= 0 ? group->instance : 0;

@@ -829,63 +835,101 @@ static void si_pc_query_emit_stop(struct si_context *sctx,
si_pc_emit_instance(sctx, se, instance);
si_pc_emit_read(sctx, block, group->num_counters, va);
va += sizeof(uint64_t) * group->num_counters;
} while (group->instance < 0 && ++instance < block->num_instances);
} while (++se < se_end);
}

si_pc_emit_instance(sctx, -1, -1);
}

-static void si_pc_query_clear_result(struct si_query_hw *hwquery,
- union pipe_query_result *result)
+static bool si_pc_query_begin(struct si_context *ctx, struct si_query *rquery)
{
- struct si_query_pc *query = (struct si_query_pc *)hwquery;
+ struct si_query_pc *query = (struct si_query_pc *)rquery;

- memset(result, 0, sizeof(result->batch[0]) * query->num_counters);
+ si_query_buffer_reset(ctx, &query->buffer);
+
+ LIST_ADDTAIL(&query->b.active_list, &ctx->active_queries);
+ ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
+
+ si_pc_query_resume(ctx, rquery);
+
+ return true;
}

-static void si_pc_query_add_result(struct si_screen *screen,
- struct si_query_hw *hwquery,
+static bool si_pc_query_end(struct si_context *ctx, struct si_query *rquery)
+{
+ struct si_query_pc *query = (struct si_query_pc *)rquery;
+
+ si_pc_query_suspend(ctx, rquery);
+
+ LIST_DEL(&rquery->active_list);
+ ctx->num_cs_dw_queries_suspend -= rquery->num_cs_dw_suspend;
+
+ return query->buffer.buf != NULL;
+}
+
+static void si_pc_query_add_result(struct si_query_pc *query,
void *buffer,
union pipe_query_result *result)
{
- struct si_query_pc *query = (struct si_query_pc *)hwquery;
uint64_t *results = buffer;
unsigned i, j;

for (i = 0; i < query->num_counters; ++i) {
struct si_query_counter *counter = &query->counters[i];

for (j = 0; j < counter->qwords; ++j) {
uint32_t value = results[counter->base + j * counter->stride];
result->batch[i].u64 += value;
}
}
}

+static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *rquery,
+ bool wait, union pipe_query_result *result)
+{
+ struct si_query_pc *query = (struct si_query_pc *)rquery;
+
+ memset(result, 0, sizeof(result->batch[0]) * query->num_counters);
+
+ for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+ unsigned usage = PIPE_TRANSFER_READ |
+ (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
+ unsigned results_base = 0;
+ void *map;
+
+ if (rquery->b.flushed)
+ map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
+ else
+ map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
+
+ if (!map)
+ return false;
+
+ while (results_base != qbuf->results_end) {
+ si_pc_query_add_result(query, map + results_base, result);
+ results_base += query->result_size;
+ }
+ }
+
+ return true;
+}
+
static struct si_query_ops batch_query_ops = {
.destroy = si_pc_query_destroy,
- .begin = si_query_hw_begin,
- .end = si_query_hw_end,
- .get_result = si_query_hw_get_result,
-
- .suspend = si_query_hw_suspend,
- .resume = si_query_hw_resume,
-};
+ .begin = si_pc_query_begin,
+ .end = si_pc_query_end,
+ .get_result = si_pc_query_get_result,

-static struct si_query_hw_ops batch_query_hw_ops = {
- .prepare_buffer = si_pc_query_prepare_buffer,
- .emit_start = si_pc_query_emit_start,
- .emit_stop = si_pc_query_emit_stop,
- .clear_result = si_pc_query_clear_result,
- .add_result = si_pc_query_add_result,
+ .suspend = si_pc_query_suspend,
+ .resume = si_pc_query_resume,
};

static struct si_query_group *get_group_state(struct si_screen *screen,
struct si_query_pc *query,
struct si_pc_block *block,
unsigned sub_gid)
{
struct si_query_group *group = query->groups;

while (group) {
@@ -961,22 +1005,21 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx,
unsigned base_gid, sub_gid, sub_index;
unsigned i, j;

if (!pc)
return NULL;

query = CALLOC_STRUCT(si_query_pc);
if (!query)
return NULL;

- query->b.b.ops = &batch_query_ops;
- query->b.ops = &batch_query_hw_ops;
+ query->b.ops = &batch_query_ops;

query->num_counters = num_queries;

/* Collect selectors per group */
for (i = 0; i < num_queries; ++i) {
unsigned sub_gid;

if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER)
goto error;

@@ -996,41 +1039,41 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx,
fprintf(stderr,
"perfcounter group %s: too many selected\n",
block->b->b->name);
goto error;
}
group->selectors[group->num_counters] = sub_index;
++group->num_counters;
}

/* Compute result bases and CS size per group */
- query->b.b.num_cs_dw_suspend = pc->num_stop_cs_dwords;
- query->b.b.num_cs_dw_suspend += pc->num_instance_cs_dwords;
+ query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords;
+ query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords;

i = 0;
for (group = query->groups; group; group = group->next) {
struct si_pc_block *block = group->block;
unsigned read_dw;
unsigned instances = 1;

if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
instances = screen->info.max_se;
if (group->instance < 0)
instances *= block->num_instances;

group->result_base = i;
- query->b.result_size += sizeof(uint64_t) * instances * group->num_counters;
+ query->result_size += sizeof(uint64_t) * instances * group->num_counters;
i += instances * group->num_counters;

read_dw = 6 * group->num_counters;
- query->b.b.num_cs_dw_suspend += instances * read_dw;
- query->b.b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords;
+ query->b.num_cs_dw_suspend += instances * read_dw;
+ query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords;
}

if (query->shaders) {
if (query->shaders == SI_PC_SHADERS_WINDOWING)
query->shaders = 0xffffffff;
}

/* Map user-supplied query array to result indices */
query->counters = CALLOC(num_queries, sizeof(*query->counters));
for (i = 0; i < num_queries; ++i) {
@@ -1057,21 +1100,21 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx,
counter->qwords = 1;
if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
counter->qwords = screen->info.max_se;
if (group->instance < 0)
counter->qwords *= block->num_instances;
}

return (struct pipe_query *)query;

error:
- si_pc_query_destroy(screen, &query->b.b);
+ si_pc_query_destroy(screen, &query->b);
return NULL;
}

static bool si_init_block_names(struct si_screen *screen,
struct si_pc_block *block)
{
bool per_instance_groups = si_pc_block_has_per_instance_groups(screen->perfcounters, block);
bool per_se_groups = si_pc_block_has_per_se_groups(screen->perfcounters, block);
unsigned i, j, k;
unsigned groups_shader = 1, groups_se = 1, groups_instance = 1;
--
2.19.1