Discussion:
[PATCH v3 1/6] radeonsi: declare new user SGPR indices for bindless samplers/images
Add Reply
Samuel Pitoiset
2017-08-08 16:57:27 UTC
Reply
Permalink
Raw Message
A new pair of user SGPRs is needed for loading the bindless
descriptors from shaders. Because the descriptors are global for
all stages, there is no need to add separate indices for GFX9.

v3: - fix merged shaders on GFX9
v2: - fix declaring new bindless parameter

Signed-off-by: Samuel Pitoiset <***@gmail.com>
---
src/gallium/drivers/radeonsi/si_shader.c | 21 +++++++++++++++++----
src/gallium/drivers/radeonsi/si_shader.h | 4 +++-
src/gallium/drivers/radeonsi/si_shader_internal.h | 1 +
3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 09053c355e..035e36fbab 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2900,6 +2900,9 @@ static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
+ ret = si_insert_input_ptr_as_2xi32(ctx, ret,
+ ctx->param_bindless_samplers_and_images,
+ 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);

ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
8 + SI_SGPR_VS_STATE_BITS);
@@ -2938,6 +2941,9 @@ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);

ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
+ ret = si_insert_input_ptr_as_2xi32(ctx, ret,
+ ctx->param_bindless_samplers_and_images,
+ 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);

unsigned desc_param = ctx->param_vs_state_bits + 1;
ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
@@ -4249,6 +4255,8 @@ static void declare_default_desc_pointers(struct si_shader_context *ctx,
{
ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR,
si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
+ ctx->param_bindless_samplers_and_images = add_arg(fninfo, ARG_SGPR,
+ si_const_array(ctx->v8i32, 0));
declare_per_stage_desc_pointers(ctx, fninfo, true);
}

@@ -4388,8 +4396,9 @@ static void create_function(struct si_shader_context *ctx)
add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */

- add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
- add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
+ ctx->param_bindless_samplers_and_images =
+ add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v8i32, 0));
+
declare_per_stage_desc_pointers(ctx, &fninfo,
ctx->type == PIPE_SHADER_VERTEX);
declare_vs_specific_input_sgprs(ctx, &fninfo);
@@ -4442,8 +4451,9 @@ static void create_function(struct si_shader_context *ctx)
add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */

- add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
- add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
+ ctx->param_bindless_samplers_and_images =
+ add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v8i32, 0));
+
declare_per_stage_desc_pointers(ctx, &fninfo,
(ctx->type == PIPE_SHADER_VERTEX ||
ctx->type == PIPE_SHADER_TESS_EVAL));
@@ -6888,6 +6898,7 @@ static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
add_arg(&fninfo, ARG_SGPR, ctx->i64);
add_arg(&fninfo, ARG_SGPR, ctx->i64);
add_arg(&fninfo, ARG_SGPR, ctx->i64);
+ add_arg(&fninfo, ARG_SGPR, ctx->i64);
add_arg(&fninfo, ARG_SGPR, ctx->i32);
add_arg(&fninfo, ARG_SGPR, ctx->i32);
add_arg(&fninfo, ARG_SGPR, ctx->i32);
@@ -6898,6 +6909,7 @@ static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
} else {
+ add_arg(&fninfo, ARG_SGPR, ctx->i64);
add_arg(&fninfo, ARG_SGPR, ctx->i64);
add_arg(&fninfo, ARG_SGPR, ctx->i64);
add_arg(&fninfo, ARG_SGPR, ctx->i64);
@@ -7249,6 +7261,7 @@ static void si_build_ps_epilog_function(struct si_shader_context *ctx,

/* Declare input SGPRs. */
ctx->param_rw_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
+ ctx->param_bindless_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->i64);
ctx->param_const_and_shader_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
ctx->param_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->i64);
add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index e44d71c261..88becdab8a 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -159,6 +159,8 @@ enum {
*/
SI_SGPR_RW_BUFFERS, /* rings (& stream-out, VS only) */
SI_SGPR_RW_BUFFERS_HI,
+ SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
+ SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES_HI,
SI_SGPR_CONST_AND_SHADER_BUFFERS,
SI_SGPR_CONST_AND_SHADER_BUFFERS_HI,
SI_SGPR_SAMPLERS_AND_IMAGES,
@@ -219,7 +221,7 @@ enum {

/* LLVM function parameter indices */
enum {
- SI_NUM_RESOURCE_PARAMS = 3,
+ SI_NUM_RESOURCE_PARAMS = 4,

/* PS only parameters */
SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS,
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index 808996adf5..f304295cb6 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -123,6 +123,7 @@ struct si_shader_context {
int param_rw_buffers;
int param_const_and_shader_buffers;
int param_samplers_and_images;
+ int param_bindless_samplers_and_images;
/* Common inputs for merged shaders. */
int param_merged_wave_info;
int param_merged_scratch_offset;
--
2.14.0
Samuel Pitoiset
2017-08-08 16:57:28 UTC
Reply
Permalink
Raw Message
The number of bindless descriptors is dynamic and we definitely
have to support more than 256 slots.

Signed-off-by: Samuel Pitoiset <***@gmail.com>
Reviewed-by: Marek Olšák <***@amd.com>
---
src/gallium/drivers/radeonsi/si_state.h | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index bce4066308..2b3c37fa16 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -234,7 +234,7 @@ struct si_descriptors {
/* The size of one descriptor. */
ubyte element_dw_size;
/* The maximum number of descriptors. */
- ubyte num_elements;
+ uint32_t num_elements;

/* Offset in CE RAM */
uint16_t ce_offset;
@@ -243,16 +243,16 @@ struct si_descriptors {
* range, direct uploads to memory will be used instead. This basically
* governs switching between onchip (CE) and offchip (upload) modes.
*/
- ubyte first_ce_slot;
- ubyte num_ce_slots;
+ uint32_t first_ce_slot;
+ uint32_t num_ce_slots;

/* Slots that are used by currently-bound shaders.
* With CE: It determines which slots are dumped to L2.
* It doesn't skip uploads to CE RAM.
* Without CE: It determines which slots are uploaded.
*/
- ubyte first_active_slot;
- ubyte num_active_slots;
+ uint32_t first_active_slot;
+ uint32_t num_active_slots;

/* Whether CE is used to upload this descriptor array. */
bool uses_ce;
--
2.14.0
Samuel Pitoiset
2017-08-08 16:57:30 UTC
Reply
Permalink
Raw Message
To share common code between rw buffers and bindless descriptors.

v3: - rename to si_emit_global_shader_pointers()

Signed-off-by: Samuel Pitoiset <***@gmail.com>
---
src/gallium/drivers/radeonsi/si_descriptors.c | 57 +++++++++++++++------------
1 file changed, 31 insertions(+), 26 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 586310c168..799a53eefb 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -2185,6 +2185,35 @@ static void si_emit_shader_pointer(struct si_context *sctx,
radeon_emit(cs, va >> 32);
}

+static void si_emit_global_shader_pointers(struct si_context *sctx,
+ struct si_descriptors *descs)
+{
+ si_emit_shader_pointer(sctx, descs,
+ R_00B030_SPI_SHADER_USER_DATA_PS_0);
+ si_emit_shader_pointer(sctx, descs,
+ R_00B130_SPI_SHADER_USER_DATA_VS_0);
+
+ if (sctx->b.chip_class >= GFX9) {
+ /* GFX9 merged LS-HS and ES-GS.
+ * Set RW_BUFFERS in the special registers, so that
+ * it's preloaded into s[0:1] instead of s[8:9].
+ */
+ si_emit_shader_pointer(sctx, descs,
+ R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS);
+ si_emit_shader_pointer(sctx, descs,
+ R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS);
+ } else {
+ si_emit_shader_pointer(sctx, descs,
+ R_00B230_SPI_SHADER_USER_DATA_GS_0);
+ si_emit_shader_pointer(sctx, descs,
+ R_00B330_SPI_SHADER_USER_DATA_ES_0);
+ si_emit_shader_pointer(sctx, descs,
+ R_00B430_SPI_SHADER_USER_DATA_HS_0);
+ si_emit_shader_pointer(sctx, descs,
+ R_00B530_SPI_SHADER_USER_DATA_LS_0);
+ }
+}
+
void si_emit_graphics_shader_pointers(struct si_context *sctx,
struct r600_atom *atom)
{
@@ -2194,32 +2223,8 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx,

descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS];

- if (sctx->shader_pointers_dirty & (1 << SI_DESCS_RW_BUFFERS)) {
- si_emit_shader_pointer(sctx, descs,
- R_00B030_SPI_SHADER_USER_DATA_PS_0);
- si_emit_shader_pointer(sctx, descs,
- R_00B130_SPI_SHADER_USER_DATA_VS_0);
-
- if (sctx->b.chip_class >= GFX9) {
- /* GFX9 merged LS-HS and ES-GS.
- * Set RW_BUFFERS in the special registers, so that
- * it's preloaded into s[0:1] instead of s[8:9].
- */
- si_emit_shader_pointer(sctx, descs,
- R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS);
- si_emit_shader_pointer(sctx, descs,
- R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS);
- } else {
- si_emit_shader_pointer(sctx, descs,
- R_00B230_SPI_SHADER_USER_DATA_GS_0);
- si_emit_shader_pointer(sctx, descs,
- R_00B330_SPI_SHADER_USER_DATA_ES_0);
- si_emit_shader_pointer(sctx, descs,
- R_00B430_SPI_SHADER_USER_DATA_HS_0);
- si_emit_shader_pointer(sctx, descs,
- R_00B530_SPI_SHADER_USER_DATA_LS_0);
- }
- }
+ if (sctx->shader_pointers_dirty & (1 << SI_DESCS_RW_BUFFERS))
+ si_emit_global_shader_pointers(sctx, descs);

mask = sctx->shader_pointers_dirty &
u_bit_consecutive(SI_DESCS_FIRST_SHADER,
--
2.14.0
Marek Olšák
2017-08-11 15:13:28 UTC
Reply
Permalink
Raw Message
Reviewed-by: Marek Olšák <***@amd.com>

Marek

On Tue, Aug 8, 2017 at 6:57 PM, Samuel Pitoiset
Post by Samuel Pitoiset
To share common code between rw buffers and bindless descriptors.
v3: - rename to si_emit_global_shader_pointers()
---
src/gallium/drivers/radeonsi/si_descriptors.c | 57 +++++++++++++++------------
1 file changed, 31 insertions(+), 26 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 586310c168..799a53eefb 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -2185,6 +2185,35 @@ static void si_emit_shader_pointer(struct si_context *sctx,
radeon_emit(cs, va >> 32);
}
+static void si_emit_global_shader_pointers(struct si_context *sctx,
+ struct si_descriptors *descs)
+{
+ si_emit_shader_pointer(sctx, descs,
+ R_00B030_SPI_SHADER_USER_DATA_PS_0);
+ si_emit_shader_pointer(sctx, descs,
+ R_00B130_SPI_SHADER_USER_DATA_VS_0);
+
+ if (sctx->b.chip_class >= GFX9) {
+ /* GFX9 merged LS-HS and ES-GS.
+ * Set RW_BUFFERS in the special registers, so that
+ * it's preloaded into s[0:1] instead of s[8:9].
+ */
+ si_emit_shader_pointer(sctx, descs,
+ R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS);
+ si_emit_shader_pointer(sctx, descs,
+ R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS);
+ } else {
+ si_emit_shader_pointer(sctx, descs,
+ R_00B230_SPI_SHADER_USER_DATA_GS_0);
+ si_emit_shader_pointer(sctx, descs,
+ R_00B330_SPI_SHADER_USER_DATA_ES_0);
+ si_emit_shader_pointer(sctx, descs,
+ R_00B430_SPI_SHADER_USER_DATA_HS_0);
+ si_emit_shader_pointer(sctx, descs,
+ R_00B530_SPI_SHADER_USER_DATA_LS_0);
+ }
+}
+
void si_emit_graphics_shader_pointers(struct si_context *sctx,
struct r600_atom *atom)
{
@@ -2194,32 +2223,8 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx,
descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS];
- if (sctx->shader_pointers_dirty & (1 << SI_DESCS_RW_BUFFERS)) {
- si_emit_shader_pointer(sctx, descs,
- R_00B030_SPI_SHADER_USER_DATA_PS_0);
- si_emit_shader_pointer(sctx, descs,
- R_00B130_SPI_SHADER_USER_DATA_VS_0);
-
- if (sctx->b.chip_class >= GFX9) {
- /* GFX9 merged LS-HS and ES-GS.
- * Set RW_BUFFERS in the special registers, so that
- * it's preloaded into s[0:1] instead of s[8:9].
- */
- si_emit_shader_pointer(sctx, descs,
- R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS);
- si_emit_shader_pointer(sctx, descs,
- R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS);
- } else {
- si_emit_shader_pointer(sctx, descs,
- R_00B230_SPI_SHADER_USER_DATA_GS_0);
- si_emit_shader_pointer(sctx, descs,
- R_00B330_SPI_SHADER_USER_DATA_ES_0);
- si_emit_shader_pointer(sctx, descs,
- R_00B430_SPI_SHADER_USER_DATA_HS_0);
- si_emit_shader_pointer(sctx, descs,
- R_00B530_SPI_SHADER_USER_DATA_LS_0);
- }
- }
+ if (sctx->shader_pointers_dirty & (1 << SI_DESCS_RW_BUFFERS))
+ si_emit_global_shader_pointers(sctx, descs);
mask = sctx->shader_pointers_dirty &
u_bit_consecutive(SI_DESCS_FIRST_SHADER,
--
2.14.0
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Samuel Pitoiset
2017-08-08 16:57:29 UTC
Reply
Permalink
Raw Message
It looks like it's useless to initialize the dirty_mask field when CE
is unused. This will also allow declaring more than 64 elements
for the array of bindless descriptors.

Signed-off-by: Samuel Pitoiset <***@gmail.com>
Reviewed-by: Marek Olšák <***@amd.com>
---
src/gallium/drivers/radeonsi/si_descriptors.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 1e0c422fb4..586310c168 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -125,19 +125,20 @@ static void si_init_descriptors(struct si_context *sctx,
unsigned num_ce_slots,
unsigned *ce_offset)
{
- assert(num_elements <= sizeof(desc->dirty_mask)*8);
-
desc->list = CALLOC(num_elements, element_dw_size * 4);
desc->element_dw_size = element_dw_size;
desc->num_elements = num_elements;
desc->first_ce_slot = sctx->ce_ib ? first_ce_slot : 0;
desc->num_ce_slots = sctx->ce_ib ? num_ce_slots : 0;
- desc->dirty_mask = u_bit_consecutive64(0, num_elements);
+ desc->dirty_mask = 0;
desc->shader_userdata_offset = shader_userdata_index * 4;

if (desc->num_ce_slots) {
+ assert(num_elements <= sizeof(desc->dirty_mask)*8);
+
desc->uses_ce = true;
desc->ce_offset = *ce_offset;
+ desc->dirty_mask = u_bit_consecutive64(0, num_elements);

*ce_offset += element_dw_size * desc->num_ce_slots * 4;
}
--
2.14.0
Samuel Pitoiset
2017-08-08 16:57:31 UTC
Reply
Permalink
Raw Message
Using VRAM addresses as bindless handles is not a good idea because
we have to use LLVMIntToPtr, and the LLVM CSE pass can't optimize
because it has no information about the pointer.

Instead, use slot indices like the existing descriptors. Note
that we use fixed 16-dword slots for both samplers and images.
Image descriptors only need 8 dwords, but the wasted space doesn't
really matter because no real apps use image handles.

This improves performance with DOW3 by +7%.

v3: - fix si_emit_global_shader_pointers() for merged GFX9 shaders
- always re-upload the array of descriptors at creation time
v2: - inline si_release_bindless_descriptors()
- fix overwriting sampler and image slots
- use fixed 16-dword slots for images

Signed-off-by: Samuel Pitoiset <***@gmail.com>
Reviewed-by: Marek Olšák <***@amd.com> (v2)
---
src/gallium/drivers/radeonsi/si_descriptors.c | 350 ++++++++++------------
src/gallium/drivers/radeonsi/si_pipe.c | 12 -
src/gallium/drivers/radeonsi/si_pipe.h | 23 +-
src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c | 35 ++-
4 files changed, 193 insertions(+), 227 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 799a53eefb..2e8f1320a1 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -1875,16 +1875,20 @@ static void si_rebind_buffer(struct pipe_context *ctx, struct pipe_resource *buf

/* Bindless texture handles */
if (rbuffer->texture_handle_allocated) {
+ struct si_descriptors *descs = &sctx->bindless_descriptors;
+
util_dynarray_foreach(&sctx->resident_tex_handles,
struct si_texture_handle *, tex_handle) {
struct pipe_sampler_view *view = (*tex_handle)->view;
- struct si_bindless_descriptor *desc = (*tex_handle)->desc;
+ unsigned desc_slot = (*tex_handle)->desc_slot;

if (view->texture == buf) {
si_set_buf_desc_address(rbuffer,
view->u.buf.offset,
- &desc->desc_list[4]);
- desc->dirty = true;
+ descs->list +
+ desc_slot * 16 + 4);
+
+ (*tex_handle)->desc_dirty = true;
sctx->bindless_descriptors_dirty = true;

radeon_add_to_buffer_list_check_mem(
@@ -1897,10 +1901,12 @@ static void si_rebind_buffer(struct pipe_context *ctx, struct pipe_resource *buf

/* Bindless image handles */
if (rbuffer->image_handle_allocated) {
+ struct si_descriptors *descs = &sctx->bindless_descriptors;
+
util_dynarray_foreach(&sctx->resident_img_handles,
struct si_image_handle *, img_handle) {
struct pipe_image_view *view = &(*img_handle)->view;
- struct si_bindless_descriptor *desc = (*img_handle)->desc;
+ unsigned desc_slot = (*img_handle)->desc_slot;

if (view->resource == buf) {
if (view->access & PIPE_IMAGE_ACCESS_WRITE)
@@ -1908,8 +1914,10 @@ static void si_rebind_buffer(struct pipe_context *ctx, struct pipe_resource *buf

si_set_buf_desc_address(rbuffer,
view->u.buf.offset,
- &desc->desc_list[4]);
- desc->dirty = true;
+ descs->list +
+ desc_slot * 16 + 4);
+
+ (*img_handle)->desc_dirty = true;
sctx->bindless_descriptors_dirty = true;

radeon_add_to_buffer_list_check_mem(
@@ -1941,11 +1949,19 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
}

static void si_upload_bindless_descriptor(struct si_context *sctx,
- struct si_bindless_descriptor *desc)
+ unsigned desc_slot,
+ unsigned num_dwords)
{
+ struct si_descriptors *desc = &sctx->bindless_descriptors;
struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
- uint64_t va = desc->buffer->gpu_address + desc->offset;
- unsigned num_dwords = sizeof(desc->desc_list) / 4;
+ unsigned desc_slot_offset = desc_slot * 16;
+ uint32_t *data;
+ uint64_t va;
+
+ data = desc->list + desc_slot_offset;
+
+ va = desc->buffer->gpu_address + desc->buffer_offset +
+ desc_slot_offset * 4;

radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + num_dwords, 0));
radeon_emit(cs, S_370_DST_SEL(V_370_TC_L2) |
@@ -1953,7 +1969,7 @@ static void si_upload_bindless_descriptor(struct si_context *sctx,
S_370_ENGINE_SEL(V_370_ME));
radeon_emit(cs, va);
radeon_emit(cs, va >> 32);
- radeon_emit_array(cs, desc->desc_list, num_dwords);
+ radeon_emit_array(cs, data, num_dwords);
}

static void si_upload_bindless_descriptors(struct si_context *sctx)
@@ -1970,24 +1986,24 @@ static void si_upload_bindless_descriptors(struct si_context *sctx)

util_dynarray_foreach(&sctx->resident_tex_handles,
struct si_texture_handle *, tex_handle) {
- struct si_bindless_descriptor *desc = (*tex_handle)->desc;
+ unsigned desc_slot = (*tex_handle)->desc_slot;

- if (!desc->dirty)
+ if (!(*tex_handle)->desc_dirty)
continue;

- si_upload_bindless_descriptor(sctx, desc);
- desc->dirty = false;
+ si_upload_bindless_descriptor(sctx, desc_slot, 16);
+ (*tex_handle)->desc_dirty = false;
}

util_dynarray_foreach(&sctx->resident_img_handles,
struct si_image_handle *, img_handle) {
- struct si_bindless_descriptor *desc = (*img_handle)->desc;
+ unsigned desc_slot = (*img_handle)->desc_slot;

- if (!desc->dirty)
+ if (!(*img_handle)->desc_dirty)
continue;

- si_upload_bindless_descriptor(sctx, desc);
- desc->dirty = false;
+ si_upload_bindless_descriptor(sctx, desc_slot, 8);
+ (*img_handle)->desc_dirty = false;
}

/* Invalidate L1 because it doesn't know that L2 changed. */
@@ -2000,9 +2016,11 @@ static void si_upload_bindless_descriptors(struct si_context *sctx)
/* Update mutable image descriptor fields of all resident textures. */
static void si_update_all_resident_texture_descriptors(struct si_context *sctx)
{
+ struct si_descriptors *desc = &sctx->bindless_descriptors;
+
util_dynarray_foreach(&sctx->resident_tex_handles,
struct si_texture_handle *, tex_handle) {
- struct si_bindless_descriptor *desc = (*tex_handle)->desc;
+ unsigned desc_slot_offset = (*tex_handle)->desc_slot * 16;
struct si_sampler_view *sview =
(struct si_sampler_view *)(*tex_handle)->view;
uint32_t desc_list[16];
@@ -2010,31 +2028,43 @@ static void si_update_all_resident_texture_descriptors(struct si_context *sctx)
if (sview->base.texture->target == PIPE_BUFFER)
continue;

- memcpy(desc_list, desc->desc_list, sizeof(desc_list));
+ /* Store the previous descriptor to only mark it dirty if it
+ * has been changed.
+ */
+ memcpy(desc_list, desc->list + desc_slot_offset,
+ sizeof(desc_list));
+
si_set_sampler_view_desc(sctx, sview, &(*tex_handle)->sstate,
- &desc->desc_list[0]);
+ desc->list + desc_slot_offset);

- if (memcmp(desc_list, desc->desc_list, sizeof(desc_list))) {
- desc->dirty = true;
+ if (memcmp(desc_list, desc->list + desc_slot_offset,
+ sizeof(desc_list))) {
+ (*tex_handle)->desc_dirty = true;
sctx->bindless_descriptors_dirty = true;
}
}

util_dynarray_foreach(&sctx->resident_img_handles,
struct si_image_handle *, img_handle) {
- struct si_bindless_descriptor *desc = (*img_handle)->desc;
+ unsigned desc_slot_offset = (*img_handle)->desc_slot * 16;
struct pipe_image_view *view = &(*img_handle)->view;
- uint32_t desc_list[16];
+ uint32_t desc_list[8];

if (view->resource->target == PIPE_BUFFER)
continue;

- memcpy(desc_list, desc->desc_list, sizeof(desc_list));
+ /* Store the previous descriptor to only mark it dirty if it
+ * has been changed.
+ */
+ memcpy(desc_list, desc->list + desc_slot_offset,
+ sizeof(desc_list));
+
si_set_shader_image_desc(sctx, view, true,
- &desc->desc_list[0]);
+ desc->list + desc_slot_offset);

- if (memcmp(desc_list, desc->desc_list, sizeof(desc_list))) {
- desc->dirty = true;
+ if (memcmp(desc_list, desc->list + desc_slot_offset,
+ sizeof(desc_list))) {
+ (*img_handle)->desc_dirty = true;
sctx->bindless_descriptors_dirty = true;
}
}
@@ -2106,6 +2136,8 @@ static void si_shader_pointers_begin_new_cs(struct si_context *sctx)
sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
sctx->vertex_buffer_pointer_dirty = sctx->vertex_buffers.buffer != NULL;
si_mark_atom_dirty(sctx, &sctx->shader_pointers.atom);
+ sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
+ sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
}

/* Set a base register address for user data constants in the given shader.
@@ -2194,14 +2226,25 @@ static void si_emit_global_shader_pointers(struct si_context *sctx,
R_00B130_SPI_SHADER_USER_DATA_VS_0);

if (sctx->b.chip_class >= GFX9) {
- /* GFX9 merged LS-HS and ES-GS.
- * Set RW_BUFFERS in the special registers, so that
- * it's preloaded into s[0:1] instead of s[8:9].
- */
- si_emit_shader_pointer(sctx, descs,
- R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS);
- si_emit_shader_pointer(sctx, descs,
- R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS);
+ /* GFX9 merged LS-HS and ES-GS. */
+ if (descs == &sctx->descriptors[SI_DESCS_RW_BUFFERS]) {
+ /* Set RW_BUFFERS in the special registers, so that
+ * it's preloaded into s[0:1] instead of s[8:9].
+ */
+ si_emit_shader_pointer(sctx, descs,
+ R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS);
+ si_emit_shader_pointer(sctx, descs,
+ R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS);
+ } else {
+ /* Set BINDLESS_SAMPLERS_AND_IMAGES into s[10:11],
+ * s[8:9] remains unused for now.
+ */
+ assert(descs == &sctx->bindless_descriptors);
+ si_emit_shader_pointer(sctx, descs,
+ R_00B330_SPI_SHADER_USER_DATA_ES_0);
+ si_emit_shader_pointer(sctx, descs,
+ R_00B430_SPI_SHADER_USER_DATA_LS_0);
+ }
} else {
si_emit_shader_pointer(sctx, descs,
R_00B230_SPI_SHADER_USER_DATA_GS_0);
@@ -2246,6 +2289,12 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx,
sh_base[PIPE_SHADER_VERTEX]);
sctx->vertex_buffer_pointer_dirty = false;
}
+
+ if (sctx->graphics_bindless_pointer_dirty) {
+ si_emit_global_shader_pointers(sctx,
+ &sctx->bindless_descriptors);
+ sctx->graphics_bindless_pointer_dirty = false;
+ }
}

void si_emit_compute_shader_pointers(struct si_context *sctx)
@@ -2262,135 +2311,87 @@ void si_emit_compute_shader_pointers(struct si_context *sctx)
si_emit_shader_pointer(sctx, descs + i, base);
}
sctx->shader_pointers_dirty &= ~compute_mask;
+
+ if (sctx->compute_bindless_pointer_dirty) {
+ si_emit_shader_pointer(sctx, &sctx->bindless_descriptors, base);
+ sctx->compute_bindless_pointer_dirty = false;
+ }
}

/* BINDLESS */

-struct si_bindless_descriptor_slab
+static void si_init_bindless_descriptors(struct si_context *sctx,
+ struct si_descriptors *desc,
+ unsigned shader_userdata_index,
+ unsigned num_elements)
{
- struct pb_slab base;
- struct r600_resource *buffer;
- struct si_bindless_descriptor *entries;
-};
+ si_init_descriptors(sctx, desc, shader_userdata_index, 16, num_elements,
+ 0, 0, NULL);
+ sctx->bindless_descriptors.num_active_slots = num_elements;

-bool si_bindless_descriptor_can_reclaim_slab(void *priv,
- struct pb_slab_entry *entry)
-{
- /* Do not allow to reclaim any bindless descriptors for now because the
- * GPU might be using them. This should be improved later on.
+ /* The first bindless descriptor is stored at slot 1, because 0 is not
+ * considered to be a valid handle.
*/
- return false;
+ sctx->num_bindless_descriptors = 1;
}

-struct pb_slab *si_bindless_descriptor_slab_alloc(void *priv, unsigned heap,
- unsigned entry_size,
- unsigned group_index)
+static inline void si_release_bindless_descriptors(struct si_context *sctx)
{
- struct si_context *sctx = priv;
- struct si_screen *sscreen = sctx->screen;
- struct si_bindless_descriptor_slab *slab;
-
- slab = CALLOC_STRUCT(si_bindless_descriptor_slab);
- if (!slab)
- return NULL;
-
- /* Create a buffer in VRAM for 1024 bindless descriptors. */
- slab->buffer = (struct r600_resource *)
- pipe_buffer_create(&sscreen->b.b, 0,
- PIPE_USAGE_DEFAULT, 64 * 1024);
- if (!slab->buffer)
- goto fail;
-
- slab->base.num_entries = slab->buffer->bo_size / entry_size;
- slab->base.num_free = slab->base.num_entries;
- slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries));
- if (!slab->entries)
- goto fail_buffer;
-
- LIST_INITHEAD(&slab->base.free);
-
- for (unsigned i = 0; i < slab->base.num_entries; ++i) {
- struct si_bindless_descriptor *desc = &slab->entries[i];
-
- desc->entry.slab = &slab->base;
- desc->entry.group_index = group_index;
- desc->buffer = slab->buffer;
- desc->offset = i * entry_size;
-
- LIST_ADDTAIL(&desc->entry.head, &slab->base.free);
- }
-
- /* Add the descriptor to the per-context list. */
- util_dynarray_append(&sctx->bindless_descriptors,
- struct r600_resource *, slab->buffer);
-
- return &slab->base;
-
-fail_buffer:
- r600_resource_reference(&slab->buffer, NULL);
-fail:
- FREE(slab);
- return NULL;
+ si_release_descriptors(&sctx->bindless_descriptors);
}

-void si_bindless_descriptor_slab_free(void *priv, struct pb_slab *pslab)
+static unsigned
+si_create_bindless_descriptor(struct si_context *sctx, uint32_t *desc_list,
+ unsigned size)
{
- struct si_context *sctx = priv;
- struct si_bindless_descriptor_slab *slab =
- (struct si_bindless_descriptor_slab *)pslab;
+ struct si_descriptors *desc = &sctx->bindless_descriptors;
+ unsigned desc_slot, desc_slot_offset;

- /* Remove the descriptor from the per-context list. */
- util_dynarray_delete_unordered(&sctx->bindless_descriptors,
- struct r600_resource *, slab->buffer);
+ /* Reserve a new slot for this bindless descriptor. */
+ desc_slot = sctx->num_bindless_descriptors++;

- r600_resource_reference(&slab->buffer, NULL);
- FREE(slab->entries);
- FREE(slab);
-}
+ if (desc_slot >= desc->num_elements) {
+ /* The array of bindless descriptors is full, resize it. */
+ unsigned slot_size = desc->element_dw_size * 4;
+ unsigned new_num_elements = desc->num_elements * 2;

-static struct si_bindless_descriptor *
-si_create_bindless_descriptor(struct si_context *sctx, uint32_t *desc_list,
- unsigned size)
-{
- struct si_screen *sscreen = sctx->screen;
- struct si_bindless_descriptor *desc;
- struct pb_slab_entry *entry;
- void *ptr;
+ desc->list = REALLOC(desc->list, desc->num_elements * slot_size,
+ new_num_elements * slot_size);
+ desc->num_elements = new_num_elements;
+ desc->num_active_slots = new_num_elements;
+ }

- /* Sub-allocate the bindless descriptor from a slab to avoid dealing
- * with a ton of buffers and for reducing the winsys overhead.
+ /* For simplicity, sampler and image bindless descriptors use fixed
+ * 16-dword slots for now. Image descriptors only need 8-dword but this
+ * doesn't really matter because no real apps use image handles.
*/
- entry = pb_slab_alloc(&sctx->bindless_descriptor_slabs, 64, 0);
- if (!entry)
- return NULL;
+ desc_slot_offset = desc_slot * 16;

- desc = NULL;
- desc = container_of(entry, desc, entry);
+ /* Copy the descriptor into the array. */
+ memcpy(desc->list + desc_slot_offset, desc_list, size);

- /* Upload the descriptor directly in VRAM. Because the slabs are
- * currently never reclaimed, we don't need to synchronize the
- * operation.
+ /* Re-upload the whole array of bindless descriptors into a new buffer.
*/
- ptr = sscreen->b.ws->buffer_map(desc->buffer->buf, NULL,
- PIPE_TRANSFER_WRITE |
- PIPE_TRANSFER_UNSYNCHRONIZED);
- util_memcpy_cpu_to_le32(ptr + desc->offset, desc_list, size);
+ if (!si_upload_descriptors(sctx, desc, &sctx->shader_pointers.atom))
+ return 0;

- /* Keep track of the initial descriptor especially for buffers
- * invalidation because we might need to know the previous address.
- */
- memcpy(desc->desc_list, desc_list, sizeof(desc->desc_list));
+ /* Make sure to re-emit the shader pointers for all stages. */
+ sctx->graphics_bindless_pointer_dirty = true;
+ sctx->compute_bindless_pointer_dirty = true;

- return desc;
+ return desc_slot;
}

static void si_invalidate_bindless_buf_desc(struct si_context *sctx,
- struct si_bindless_descriptor *desc,
+ unsigned desc_slot,
struct pipe_resource *resource,
- uint64_t offset)
+ uint64_t offset,
+ bool *desc_dirty)
{
+ struct si_descriptors *desc = &sctx->bindless_descriptors;
struct r600_resource *buf = r600_resource(resource);
- uint32_t *desc_list = desc->desc_list + 4;
+ unsigned desc_slot_offset = desc_slot * 16;
+ uint32_t *desc_list = desc->list + desc_slot_offset + 4;
uint64_t old_desc_va;

assert(resource->target == PIPE_BUFFER);
@@ -2405,7 +2406,7 @@ static void si_invalidate_bindless_buf_desc(struct si_context *sctx,
*/
si_set_buf_desc_address(buf, offset, &desc_list[0]);

- desc->dirty = true;
+ *desc_dirty = true;
sctx->bindless_descriptors_dirty = true;
}
}
@@ -2438,20 +2439,17 @@ static uint64_t si_create_texture_handle(struct pipe_context *ctx,
memcpy(&tex_handle->sstate, sstate, sizeof(*sstate));
ctx->delete_sampler_state(ctx, sstate);

- tex_handle->desc = si_create_bindless_descriptor(sctx, desc_list,
- sizeof(desc_list));
- if (!tex_handle->desc) {
+ tex_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list,
+ sizeof(desc_list));
+ if (!tex_handle->desc_slot) {
FREE(tex_handle);
return 0;
}

- handle = tex_handle->desc->buffer->gpu_address +
- tex_handle->desc->offset;
+ handle = tex_handle->desc_slot;

if (!_mesa_hash_table_insert(sctx->tex_handles, (void *)handle,
tex_handle)) {
- pb_slab_free(&sctx->bindless_descriptor_slabs,
- &tex_handle->desc->entry);
FREE(tex_handle);
return 0;
}
@@ -2477,8 +2475,6 @@ static void si_delete_texture_handle(struct pipe_context *ctx, uint64_t handle)

pipe_sampler_view_reference(&tex_handle->view, NULL);
_mesa_hash_table_remove(sctx->tex_handles, entry);
- pb_slab_free(&sctx->bindless_descriptor_slabs,
- &tex_handle->desc->entry);
FREE(tex_handle);
}

@@ -2520,9 +2516,11 @@ static void si_make_texture_handle_resident(struct pipe_context *ctx,
p_atomic_read(&rtex->framebuffers_bound))
sctx->need_check_render_feedback = true;
} else {
- si_invalidate_bindless_buf_desc(sctx, tex_handle->desc,
+ si_invalidate_bindless_buf_desc(sctx,
+ tex_handle->desc_slot,
sview->base.texture,
- sview->base.u.buf.offset);
+ sview->base.u.buf.offset,
+ &tex_handle->desc_dirty);
}

/* Add the texture handle to the per-context list. */
@@ -2532,11 +2530,6 @@ static void si_make_texture_handle_resident(struct pipe_context *ctx,
/* Add the buffers to the current CS in case si_begin_new_cs()
* is not going to be called.
*/
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
- tex_handle->desc->buffer,
- RADEON_USAGE_READWRITE,
- RADEON_PRIO_DESCRIPTORS);
-
si_sampler_view_add_buffer(sctx, sview->base.texture,
RADEON_USAGE_READ,
sview->is_stencil_sampler, false);
@@ -2563,7 +2556,7 @@ static uint64_t si_create_image_handle(struct pipe_context *ctx,
{
struct si_context *sctx = (struct si_context *)ctx;
struct si_image_handle *img_handle;
- uint32_t desc_list[16];
+ uint32_t desc_list[8];
uint64_t handle;

if (!view || !view->resource)
@@ -2578,20 +2571,17 @@ static uint64_t si_create_image_handle(struct pipe_context *ctx,

si_set_shader_image_desc(sctx, view, false, &desc_list[0]);

- img_handle->desc = si_create_bindless_descriptor(sctx, desc_list,
- sizeof(desc_list));
- if (!img_handle->desc) {
+ img_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list,
+ sizeof(desc_list));
+ if (!img_handle->desc_slot) {
FREE(img_handle);
return 0;
}

- handle = img_handle->desc->buffer->gpu_address +
- img_handle->desc->offset;
+ handle = img_handle->desc_slot;

if (!_mesa_hash_table_insert(sctx->img_handles, (void *)handle,
img_handle)) {
- pb_slab_free(&sctx->bindless_descriptor_slabs,
- &img_handle->desc->entry);
FREE(img_handle);
return 0;
}
@@ -2617,8 +2607,6 @@ static void si_delete_image_handle(struct pipe_context *ctx, uint64_t handle)

util_copy_image_view(&img_handle->view, NULL);
_mesa_hash_table_remove(sctx->img_handles, entry);
- pb_slab_free(&sctx->bindless_descriptor_slabs,
- &img_handle->desc->entry);
FREE(img_handle);
}

@@ -2656,9 +2644,11 @@ static void si_make_image_handle_resident(struct pipe_context *ctx,
p_atomic_read(&rtex->framebuffers_bound))
sctx->need_check_render_feedback = true;
} else {
- si_invalidate_bindless_buf_desc(sctx, img_handle->desc,
+ si_invalidate_bindless_buf_desc(sctx,
+ img_handle->desc_slot,
view->resource,
- view->u.buf.offset);
+ view->u.buf.offset,
+ &img_handle->desc_dirty);
}

/* Add the image handle to the per-context list. */
@@ -2668,11 +2658,6 @@ static void si_make_image_handle_resident(struct pipe_context *ctx,
/* Add the buffers to the current CS in case si_begin_new_cs()
* is not going to be called.
*/
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
- img_handle->desc->buffer,
- RADEON_USAGE_READWRITE,
- RADEON_PRIO_DESCRIPTORS);
-
si_sampler_view_add_buffer(sctx, view->resource,
(access & PIPE_IMAGE_ACCESS_WRITE) ?
RADEON_USAGE_READWRITE :
@@ -2702,20 +2687,6 @@ void si_all_resident_buffers_begin_new_cs(struct si_context *sctx)
num_resident_img_handles = sctx->resident_img_handles.size /
sizeof(struct si_image_handle *);

- /* Skip adding the bindless descriptors when no handles are resident.
- */
- if (!num_resident_tex_handles && !num_resident_img_handles)
- return;
-
- /* Add all bindless descriptors. */
- util_dynarray_foreach(&sctx->bindless_descriptors,
- struct r600_resource *, desc) {
-
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, *desc,
- RADEON_USAGE_READWRITE,
- RADEON_PRIO_DESCRIPTORS);
- }
-
/* Add all resident texture handles. */
util_dynarray_foreach(&sctx->resident_tex_handles,
struct si_texture_handle *, tex_handle) {
@@ -2866,6 +2837,13 @@ void si_init_all_descriptors(struct si_context *sctx)
FREE(sctx->vertex_buffers.list); /* not used */
sctx->vertex_buffers.list = NULL;

+ /* Initialize an array of 1024 bindless descriptors, when the limit is
+ * reached, just make it larger and re-upload the whole array.
+ */
+ si_init_bindless_descriptors(sctx, &sctx->bindless_descriptors,
+ SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
+ 1024);
+
sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
sctx->total_ce_ram_allocated = ce_offset;

@@ -2978,6 +2956,7 @@ void si_release_all_descriptors(struct si_context *sctx)

sctx->vertex_buffers.list = NULL; /* points into a mapped buffer */
si_release_descriptors(&sctx->vertex_buffers);
+ si_release_bindless_descriptors(sctx);
}

void si_all_descriptors_begin_new_cs(struct si_context *sctx)
@@ -2994,6 +2973,7 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx)

for (i = 0; i < SI_NUM_DESCS; ++i)
si_descriptors_begin_new_cs(sctx, &sctx->descriptors[i]);
+ si_descriptors_begin_new_cs(sctx, &sctx->bindless_descriptors);

si_shader_pointers_begin_new_cs(sctx);
}
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 2c65cc886f..5fbcd8e7d3 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -99,9 +99,6 @@ static void si_destroy_context(struct pipe_context *context)
r600_resource_reference(&sctx->last_trace_buf, NULL);
radeon_clear_saved_cs(&sctx->last_gfx);

- pb_slabs_deinit(&sctx->bindless_descriptor_slabs);
- util_dynarray_fini(&sctx->bindless_descriptors);
-
_mesa_hash_table_destroy(sctx->tex_handles, NULL);
_mesa_hash_table_destroy(sctx->img_handles, NULL);

@@ -346,15 +343,6 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,

sctx->tm = si_create_llvm_target_machine(sscreen);

- /* Create a slab allocator for all bindless descriptors. */
- if (!pb_slabs_init(&sctx->bindless_descriptor_slabs, 6, 6, 1, sctx,
- si_bindless_descriptor_can_reclaim_slab,
- si_bindless_descriptor_slab_alloc,
- si_bindless_descriptor_slab_free))
- goto fail;
-
- util_dynarray_init(&sctx->bindless_descriptors, NULL);
-
/* Bindless handles. */
sctx->tex_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
_mesa_key_pointer_equal);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index ee0ab1b37b..c44d6d5dff 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -245,25 +245,18 @@ union si_vgt_param_key {
uint32_t index;
};

-struct si_bindless_descriptor
-{
- struct pb_slab_entry entry;
- struct r600_resource *buffer;
- unsigned offset;
- uint32_t desc_list[16];
- bool dirty;
-};
-
struct si_texture_handle
{
- struct si_bindless_descriptor *desc;
+ unsigned desc_slot;
+ bool desc_dirty;
struct pipe_sampler_view *view;
struct si_sampler_state sstate;
};

struct si_image_handle
{
- struct si_bindless_descriptor *desc;
+ unsigned desc_slot;
+ bool desc_dirty;
struct pipe_image_view view;
};

@@ -434,12 +427,12 @@ struct si_context {
union si_vgt_param_key ia_multi_vgt_param_key;
unsigned ia_multi_vgt_param[SI_NUM_VGT_PARAM_STATES];

- /* Slab allocator for bindless descriptors. */
- struct pb_slabs bindless_descriptor_slabs;
-
/* Bindless descriptors. */
- struct util_dynarray bindless_descriptors;
+ struct si_descriptors bindless_descriptors;
+ unsigned num_bindless_descriptors;
bool bindless_descriptors_dirty;
+ bool graphics_bindless_pointer_dirty;
+ bool compute_bindless_pointer_dirty;

/* Allocated bindless handles */
struct hash_table *tex_handles;
diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
index 42f977d7ce..b776b2c2e8 100644
--- a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
+++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
@@ -205,15 +205,22 @@ image_fetch_rsrc(
}

if (image->Register.File != TGSI_FILE_IMAGE) {
+ /* Bindless descriptors are accessible from a different pair of
+ * user SGPR indices.
+ */
struct gallivm_state *gallivm = &ctx->gallivm;
LLVMBuilderRef builder = gallivm->builder;

- LLVMValueRef ptr =
- lp_build_emit_fetch_src(bld_base, image,
- TGSI_TYPE_UNSIGNED64, 0);
- rsrc_ptr = LLVMBuildIntToPtr(builder, ptr,
- si_const_array(ctx->v8i32, 0), "");
- index = LLVMConstInt(ctx->i32, 0, 0);
+ rsrc_ptr = LLVMGetParam(ctx->main_fn,
+ ctx->param_bindless_samplers_and_images);
+ index = lp_build_emit_fetch_src(bld_base, image,
+ TGSI_TYPE_UNSIGNED, 0);
+
+ /* For simplicity, bindless image descriptors use fixed
+ * 16-dword slots for now.
+ */
+ index = LLVMBuildMul(builder, index,
+ LLVMConstInt(ctx->i32, 2, 0), "");
}

*rsrc = si_load_image_desc(ctx, rsrc_ptr, index,
@@ -1213,15 +1220,13 @@ static void tex_fetch_ptrs(
}

if (reg->Register.File != TGSI_FILE_SAMPLER) {
- struct gallivm_state *gallivm = &ctx->gallivm;
- LLVMBuilderRef builder = gallivm->builder;
-
- LLVMValueRef ptr =
- lp_build_emit_fetch_src(bld_base, reg,
- TGSI_TYPE_UNSIGNED64, 0);
- list = LLVMBuildIntToPtr(builder, ptr,
- si_const_array(ctx->v8i32, 0), "");
- index = LLVMConstInt(ctx->i32, 0, 0);
+ /* Bindless descriptors are accessible from a different pair of
+ * user SGPR indices.
+ */
+ list = LLVMGetParam(ctx->main_fn,
+ ctx->param_bindless_samplers_and_images);
+ index = lp_build_emit_fetch_src(bld_base, reg,
+ TGSI_TYPE_UNSIGNED, 0);
}

if (target == TGSI_TEXTURE_BUFFER)
--
2.14.0
Marek Olšák
2017-08-11 16:53:45 UTC
Reply
Permalink
Raw Message
On Tue, Aug 8, 2017 at 6:57 PM, Samuel Pitoiset
Post by Samuel Pitoiset
Using VRAM addresses as bindless handles is not a good idea because
we have to use LLVMBuildIntToPtr and the LLVM CSE pass can't optimize
because it has no information about the pointer.
Instead, use slot indexes like the existing descriptors. Note
that we use fixed 16-dword slots for both samplers and images.
This doesn't really matter because no real apps use image handles.
This improves performance with DOW3 by +7%.
v3: - fix si_emit_global_shader_pointers() for merged GFX9 shaders
- always re-upload the array of descriptors at creation time
v2: - inline si_release_bindless_descriptors()
I meant that you inline the function manually. Anyway:

Reviewed-by: Marek Olšák <***@amd.com>

Marek
Marek Olšák
2017-08-11 16:56:01 UTC
Reply
Permalink
Raw Message
Post by Marek Olšák
On Tue, Aug 8, 2017 at 6:57 PM, Samuel Pitoiset
Post by Samuel Pitoiset
Using VRAM addresses as bindless handles is not a good idea because
we have to use LLVMBuildIntToPtr and the LLVM CSE pass can't optimize
because it has no information about the pointer.
Instead, use slot indexes like the existing descriptors. Note
that we use fixed 16-dword slots for both samplers and images.
This doesn't really matter because no real apps use image handles.
This improves performance with DOW3 by +7%.
v3: - fix si_emit_global_shader_pointers() for merged GFX9 shaders
- always re-upload the array of descriptors at creation time
v2: - inline si_release_bindless_descriptors()
I see patch 6 where the function is no longer a one-liner. There is no
need to inline.
Post by Marek Olšák
Marek
Samuel Pitoiset
2017-08-08 16:57:32 UTC
Reply
Permalink
Raw Message
Currently, when the array is full it is resized, but it can keep growing
indefinitely because we never try to re-use descriptor slots.

v3: - use new idalloc gallium module

Signed-off-by: Samuel Pitoiset <***@gmail.com>
---
src/gallium/drivers/radeonsi/si_descriptors.c | 57 +++++++++++++++++++++------
src/gallium/drivers/radeonsi/si_pipe.h | 2 +
2 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 2e8f1320a1..29eb5bf3d1 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -61,6 +61,7 @@
#include "gfx9d.h"

#include "util/hash_table.h"
+#include "util/u_idalloc.h"
#include "util/u_format.h"
#include "util/u_memory.h"
#include "util/u_upload_mgr.h"
@@ -2333,33 +2334,62 @@ static void si_init_bindless_descriptors(struct si_context *sctx,
* considered to be a valid handle.
*/
sctx->num_bindless_descriptors = 1;
+
+ /* Keep track of which bindless slots are used (or not). */
+ util_idalloc_init(&sctx->bindless_used_slots);
+ util_idalloc_resize(&sctx->bindless_used_slots, num_elements);
+
+ /* Lock slot 0 because it's an invalid handle for bindless. */
+ util_idalloc_lock(&sctx->bindless_used_slots, 0);
}

static inline void si_release_bindless_descriptors(struct si_context *sctx)
{
si_release_descriptors(&sctx->bindless_descriptors);
+ util_idalloc_fini(&sctx->bindless_used_slots);
}

-static unsigned
-si_create_bindless_descriptor(struct si_context *sctx, uint32_t *desc_list,
- unsigned size)
+static unsigned si_get_next_free_bindless_slot(struct si_context *sctx)
{
- struct si_descriptors *desc = &sctx->bindless_descriptors;
- unsigned desc_slot, desc_slot_offset;
-
- /* Reserve a new slot for this bindless descriptor. */
- desc_slot = sctx->num_bindless_descriptors++;
+ int desc_slot;

- if (desc_slot >= desc->num_elements) {
- /* The array of bindless descriptors is full, resize it. */
+ desc_slot = util_idalloc_get_next_free(&sctx->bindless_used_slots);
+ if (desc_slot < 0) {
+ /* No slots are available. */
+ struct si_descriptors *desc = &sctx->bindless_descriptors;
unsigned slot_size = desc->element_dw_size * 4;
- unsigned new_num_elements = desc->num_elements * 2;
+ unsigned old_num_elements = desc->num_elements;
+ unsigned new_num_elements = old_num_elements * 2;

- desc->list = REALLOC(desc->list, desc->num_elements * slot_size,
+ /* Resize the array of descriptors. */
+ desc->list = REALLOC(desc->list, old_num_elements * slot_size,
new_num_elements * slot_size);
desc->num_elements = new_num_elements;
desc->num_active_slots = new_num_elements;
+
+ /* Resize the array of slots. */
+ util_idalloc_resize(&sctx->bindless_used_slots, new_num_elements);
+
+ /* Get a new slot. */
+ desc_slot = util_idalloc_get_next_free(&sctx->bindless_used_slots);
}
+ assert(desc_slot);
+
+ /* Prevent this bindless descriptor slot to be re-used. */
+ util_idalloc_lock(&sctx->bindless_used_slots, desc_slot);
+
+ return desc_slot;
+}
+
+static unsigned
+si_create_bindless_descriptor(struct si_context *sctx, uint32_t *desc_list,
+ unsigned size)
+{
+ struct si_descriptors *desc = &sctx->bindless_descriptors;
+ unsigned desc_slot, desc_slot_offset;
+
+ /* Find a free slot. */
+ desc_slot = si_get_next_free_bindless_slot(sctx);

/* For simplicity, sampler and image bindless descriptors use fixed
* 16-dword slots for now. Image descriptors only need 8-dword but this
@@ -2473,6 +2503,9 @@ static void si_delete_texture_handle(struct pipe_context *ctx, uint64_t handle)

tex_handle = (struct si_texture_handle *)entry->data;

+ /* Unlock this descriptor slot. */
+ util_idalloc_unlock(&sctx->bindless_used_slots, tex_handle->desc_slot);
+
pipe_sampler_view_reference(&tex_handle->view, NULL);
_mesa_hash_table_remove(sctx->tex_handles, entry);
FREE(tex_handle);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index c44d6d5dff..447795d97b 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -29,6 +29,7 @@
#include "si_shader.h"

#include "util/u_dynarray.h"
+#include "util/u_idalloc.h"

#ifdef PIPE_ARCH_BIG_ENDIAN
#define SI_BIG_ENDIAN 1
@@ -429,6 +430,7 @@ struct si_context {

/* Bindless descriptors. */
struct si_descriptors bindless_descriptors;
+ struct util_idalloc bindless_used_slots;
unsigned num_bindless_descriptors;
bool bindless_descriptors_dirty;
bool graphics_bindless_pointer_dirty;
--
2.14.0
Marek Olšák
2017-08-11 17:37:42 UTC
Reply
Permalink
Raw Message
This patch needs to be updated for the latest version of util_idalloc.

Marek

On Tue, Aug 8, 2017 at 6:57 PM, Samuel Pitoiset
Post by Samuel Pitoiset
Currently, when the array is full it is resized, but it can keep growing
indefinitely because we never try to re-use descriptor slots.
v3: - use new idalloc gallium module
---
src/gallium/drivers/radeonsi/si_descriptors.c | 57 +++++++++++++++++++++------
src/gallium/drivers/radeonsi/si_pipe.h | 2 +
2 files changed, 47 insertions(+), 12 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 2e8f1320a1..29eb5bf3d1 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -61,6 +61,7 @@
#include "gfx9d.h"
#include "util/hash_table.h"
+#include "util/u_idalloc.h"
#include "util/u_format.h"
#include "util/u_memory.h"
#include "util/u_upload_mgr.h"
@@ -2333,33 +2334,62 @@ static void si_init_bindless_descriptors(struct si_context *sctx,
* considered to be a valid handle.
*/
sctx->num_bindless_descriptors = 1;
+
+ /* Keep track of which bindless slots are used (or not). */
+ util_idalloc_init(&sctx->bindless_used_slots);
+ util_idalloc_resize(&sctx->bindless_used_slots, num_elements);
+
+ /* Lock slot 0 because it's an invalid handle for bindless. */
+ util_idalloc_lock(&sctx->bindless_used_slots, 0);
}
static inline void si_release_bindless_descriptors(struct si_context *sctx)
{
si_release_descriptors(&sctx->bindless_descriptors);
+ util_idalloc_fini(&sctx->bindless_used_slots);
}
-static unsigned
-si_create_bindless_descriptor(struct si_context *sctx, uint32_t *desc_list,
- unsigned size)
+static unsigned si_get_next_free_bindless_slot(struct si_context *sctx)
{
- struct si_descriptors *desc = &sctx->bindless_descriptors;
- unsigned desc_slot, desc_slot_offset;
-
- /* Reserve a new slot for this bindless descriptor. */
- desc_slot = sctx->num_bindless_descriptors++;
+ int desc_slot;
- if (desc_slot >= desc->num_elements) {
- /* The array of bindless descriptors is full, resize it. */
+ desc_slot = util_idalloc_get_next_free(&sctx->bindless_used_slots);
+ if (desc_slot < 0) {
+ /* No slots are available. */
+ struct si_descriptors *desc = &sctx->bindless_descriptors;
unsigned slot_size = desc->element_dw_size * 4;
- unsigned new_num_elements = desc->num_elements * 2;
+ unsigned old_num_elements = desc->num_elements;
+ unsigned new_num_elements = old_num_elements * 2;
- desc->list = REALLOC(desc->list, desc->num_elements * slot_size,
+ /* Resize the array of descriptors. */
+ desc->list = REALLOC(desc->list, old_num_elements * slot_size,
new_num_elements * slot_size);
desc->num_elements = new_num_elements;
desc->num_active_slots = new_num_elements;
+
+ /* Resize the array of slots. */
+ util_idalloc_resize(&sctx->bindless_used_slots, new_num_elements);
+
+ /* Get a new slot. */
+ desc_slot = util_idalloc_get_next_free(&sctx->bindless_used_slots);
}
+ assert(desc_slot);
+
+ /* Prevent this bindless descriptor slot to be re-used. */
+ util_idalloc_lock(&sctx->bindless_used_slots, desc_slot);
+
+ return desc_slot;
+}
+
+static unsigned
+si_create_bindless_descriptor(struct si_context *sctx, uint32_t *desc_list,
+ unsigned size)
+{
+ struct si_descriptors *desc = &sctx->bindless_descriptors;
+ unsigned desc_slot, desc_slot_offset;
+
+ /* Find a free slot. */
+ desc_slot = si_get_next_free_bindless_slot(sctx);
/* For simplicity, sampler and image bindless descriptors use fixed
* 16-dword slots for now. Image descriptors only need 8-dword but this
@@ -2473,6 +2503,9 @@ static void si_delete_texture_handle(struct pipe_context *ctx, uint64_t handle)
tex_handle = (struct si_texture_handle *)entry->data;
+ /* Unlock this descriptor slot. */
+ util_idalloc_unlock(&sctx->bindless_used_slots, tex_handle->desc_slot);
+
pipe_sampler_view_reference(&tex_handle->view, NULL);
_mesa_hash_table_remove(sctx->tex_handles, entry);
FREE(tex_handle);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index c44d6d5dff..447795d97b 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -29,6 +29,7 @@
#include "si_shader.h"
#include "util/u_dynarray.h"
+#include "util/u_idalloc.h"
#ifdef PIPE_ARCH_BIG_ENDIAN
#define SI_BIG_ENDIAN 1
@@ -429,6 +430,7 @@ struct si_context {
/* Bindless descriptors. */
struct si_descriptors bindless_descriptors;
+ struct util_idalloc bindless_used_slots;
unsigned num_bindless_descriptors;
bool bindless_descriptors_dirty;
bool graphics_bindless_pointer_dirty;
--
2.14.0
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Marek Olšák
2017-08-10 21:17:40 UTC
Reply
Permalink
Raw Message
On Tue, Aug 8, 2017 at 6:57 PM, Samuel Pitoiset
Post by Samuel Pitoiset
A new pair of user SGPR is needed for loading the bindless
descriptors from shaders. Because the descriptors are global for
all stages, there is no need to add separate indices for GFX9.
v3: - fix merged shaders on GFX9
v2: - fix declaring new bindless parameter
---
src/gallium/drivers/radeonsi/si_shader.c | 21 +++++++++++++++++----
src/gallium/drivers/radeonsi/si_shader.h | 4 +++-
src/gallium/drivers/radeonsi/si_shader_internal.h | 1 +
3 files changed, 21 insertions(+), 5 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 09053c355e..035e36fbab 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2900,6 +2900,9 @@ static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
+ ret = si_insert_input_ptr_as_2xi32(ctx, ret,
+ ctx->param_bindless_samplers_and_images,
+ 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
8 + SI_SGPR_VS_STATE_BITS);
@@ -2938,6 +2941,9 @@ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
+ ret = si_insert_input_ptr_as_2xi32(ctx, ret,
+ ctx->param_bindless_samplers_and_images,
+ 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
unsigned desc_param = ctx->param_vs_state_bits + 1;
ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
@@ -4249,6 +4255,8 @@ static void declare_default_desc_pointers(struct si_shader_context *ctx,
{
ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR,
si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
+ ctx->param_bindless_samplers_and_images = add_arg(fninfo, ARG_SGPR,
+ si_const_array(ctx->v8i32, 0));
declare_per_stage_desc_pointers(ctx, fninfo, true);
}
@@ -4388,8 +4396,9 @@ static void create_function(struct si_shader_context *ctx)
add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
- add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
- add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
Don't remove these unused inputs.
Post by Samuel Pitoiset
+ ctx->param_bindless_samplers_and_images =
+ add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v8i32, 0));
+
declare_per_stage_desc_pointers(ctx, &fninfo,
ctx->type == PIPE_SHADER_VERTEX);
declare_vs_specific_input_sgprs(ctx, &fninfo);
@@ -4442,8 +4451,9 @@ static void create_function(struct si_shader_context *ctx)
add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
- add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
- add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
Don't remove these unused inputs.

With those fixed, the patch is:

Reviewed-by: Marek Olšák <***@amd.com>

Marek
Samuel Pitoiset
2017-08-11 07:26:42 UTC
Reply
Permalink
Raw Message
Post by Marek Olšák
On Tue, Aug 8, 2017 at 6:57 PM, Samuel Pitoiset
Post by Samuel Pitoiset
A new pair of user SGPR is needed for loading the bindless
descriptors from shaders. Because the descriptors are global for
all stages, there is no need to add separate indices for GFX9.
v3: - fix merged shaders on GFX9
v2: - fix declaring new bindless parameter
---
src/gallium/drivers/radeonsi/si_shader.c | 21 +++++++++++++++++----
src/gallium/drivers/radeonsi/si_shader.h | 4 +++-
src/gallium/drivers/radeonsi/si_shader_internal.h | 1 +
3 files changed, 21 insertions(+), 5 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 09053c355e..035e36fbab 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2900,6 +2900,9 @@ static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
+ ret = si_insert_input_ptr_as_2xi32(ctx, ret,
+ ctx->param_bindless_samplers_and_images,
+ 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
8 + SI_SGPR_VS_STATE_BITS);
@@ -2938,6 +2941,9 @@ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
+ ret = si_insert_input_ptr_as_2xi32(ctx, ret,
+ ctx->param_bindless_samplers_and_images,
+ 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
unsigned desc_param = ctx->param_vs_state_bits + 1;
ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
@@ -4249,6 +4255,8 @@ static void declare_default_desc_pointers(struct si_shader_context *ctx,
{
ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR,
si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
+ ctx->param_bindless_samplers_and_images = add_arg(fninfo, ARG_SGPR,
+ si_const_array(ctx->v8i32, 0));
declare_per_stage_desc_pointers(ctx, fninfo, true);
}
@@ -4388,8 +4396,9 @@ static void create_function(struct si_shader_context *ctx)
add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
- add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
- add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
Don't remove these unused inputs.
Post by Samuel Pitoiset
+ ctx->param_bindless_samplers_and_images =
+ add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v8i32, 0));
+
declare_per_stage_desc_pointers(ctx, &fninfo,
ctx->type == PIPE_SHADER_VERTEX);
declare_vs_specific_input_sgprs(ctx, &fninfo);
@@ -4442,8 +4451,9 @@ static void create_function(struct si_shader_context *ctx)
add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
- add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
- add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
Don't remove these unused inputs.
Fixed locally, thanks!

Samuel.
Post by Marek Olšák
Marek
Loading...