Discussion:
[PATCH v2 1/5] i965/eu: add support for 1-OWord Block Read/Write messages
Add Reply
Samuel Iglesias Gonsálvez
2017-07-19 13:51:09 UTC
Reply
Permalink
Raw Message
v2:
- Use nibctrl and the number of written/read owords to detect
each case of a 1-OWord Block Read/Write (Curro)

Signed-off-by: Samuel Iglesias Gonsálvez <***@igalia.com>
---
src/intel/compiler/brw_eu.h | 14 +++++-----
src/intel/compiler/brw_eu_emit.c | 46 +++++++++++++++++++++++++--------
src/intel/compiler/brw_fs_generator.cpp | 4 +--
3 files changed, 44 insertions(+), 20 deletions(-)

diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index a3a9c63239..de8470b4b5 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -342,15 +342,15 @@ void brw_oword_block_read(struct brw_codegen *p,
unsigned brw_scratch_surface_idx(const struct brw_codegen *p);

void brw_oword_block_read_scratch(struct brw_codegen *p,
- struct brw_reg dest,
- struct brw_reg mrf,
- int num_regs,
- unsigned offset);
+ struct brw_reg dest,
+ struct brw_reg mrf,
+ int num_owords,
+ unsigned offset);

void brw_oword_block_write_scratch(struct brw_codegen *p,
- struct brw_reg mrf,
- int num_regs,
- unsigned offset);
+ struct brw_reg mrf,
+ int num_owords,
+ unsigned offset);

void gen7_block_read_scratch(struct brw_codegen *p,
struct brw_reg dest,
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 0b0d67a5c5..956ef263a2 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -2133,9 +2133,9 @@ brw_scratch_surface_idx(const struct brw_codegen *p)
* register spilling.
*/
void brw_oword_block_write_scratch(struct brw_codegen *p,
- struct brw_reg mrf,
- int num_regs,
- unsigned offset)
+ struct brw_reg mrf,
+ int num_owords,
+ unsigned offset)
{
const struct gen_device_info *devinfo = p->devinfo;
const unsigned target_cache =
@@ -2149,7 +2149,7 @@ void brw_oword_block_write_scratch(struct brw_codegen *p,

mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

- const unsigned mlen = 1 + num_regs;
+ const unsigned mlen = 1 + MAX2(1, num_owords / 2);

/* Set up the message header. This is g0, with g0.2 filled with
* the offset. We don't want to leave our offset around in g0 or
@@ -2180,6 +2180,18 @@ void brw_oword_block_write_scratch(struct brw_codegen *p,
int send_commit_msg;
struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
BRW_REGISTER_TYPE_UW);
+ int msg_control = BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_owords * 4);
+
+ /* By default for 1-oword, msg_control = BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
+ * fix it when we are writing the high part.
+ */
+ if (num_owords == 1 && brw_inst_nib_control(devinfo, insn) != 0) {
+ msg_control = BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH;
+ /* The messages only work with group == 0, we use the group to know which
+ * message emit (1-OWORD LOW or 1-OWORD HIGH), so reset it to zero.
+ */
+ brw_inst_set_group(devinfo, insn, 0);
+ }

brw_inst_set_compression(devinfo, insn, false);

@@ -2223,7 +2235,7 @@ void brw_oword_block_write_scratch(struct brw_codegen *p,
brw_set_dp_write_message(p,
insn,
brw_scratch_surface_idx(p),
- BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
+ msg_control,
msg_type,
target_cache,
mlen,
@@ -2245,10 +2257,10 @@ void brw_oword_block_write_scratch(struct brw_codegen *p,
*/
void
brw_oword_block_read_scratch(struct brw_codegen *p,
- struct brw_reg dest,
- struct brw_reg mrf,
- int num_regs,
- unsigned offset)
+ struct brw_reg dest,
+ struct brw_reg mrf,
+ int num_owords,
+ unsigned offset)
{
const struct gen_device_info *devinfo = p->devinfo;

@@ -2269,7 +2281,7 @@ brw_oword_block_read_scratch(struct brw_codegen *p,
}
dest = retype(dest, BRW_REGISTER_TYPE_UW);

- const unsigned rlen = num_regs;
+ const unsigned rlen = MAX2(1, num_owords / 2);
const unsigned target_cache =
(devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
@@ -2291,6 +2303,18 @@ brw_oword_block_read_scratch(struct brw_codegen *p,

{
brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
+ int msg_control = BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_owords * 4);
+
+ /* By default for 1-oword, msg_control = BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
+ * fix it when we are reading the high part.
+ */
+ if (num_owords == 1 && brw_inst_nib_control(devinfo, insn) != 0) {
+ msg_control = BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH;
+ /* The messages only work with group == 0, we use the group to know which
+ * message emit (1-OWORD LOW or 1-OWORD HIGH), so reset it to zero.
+ */
+ brw_inst_set_group(devinfo, insn, 0);
+ }

assert(brw_inst_pred_control(devinfo, insn) == 0);
brw_inst_set_compression(devinfo, insn, false);
@@ -2306,7 +2330,7 @@ brw_oword_block_read_scratch(struct brw_codegen *p,
brw_set_dp_read_message(p,
insn,
brw_scratch_surface_idx(p),
- BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
+ msg_control,
BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
target_cache,
1, /* msg_length */
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index 2ade486705..2dd28048eb 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1178,7 +1178,7 @@ fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD));

brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
- block_size,
+ block_size * 2,
inst->offset + block_size * REG_SIZE * i);
}

@@ -1192,7 +1192,7 @@ fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
assert(inst->mlen != 0);

brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
- inst->exec_size / 8, inst->offset);
+ inst->exec_size / 4, inst->offset);
}

void
--
2.11.0
Samuel Iglesias Gonsálvez
2017-07-19 13:51:10 UTC
Reply
Permalink
Raw Message
v2:
- Enable partial DF on HSW+ in emit_1grf_df_ivb_scratch_read()
- Copy the data read by first 1-Oword Block read as UD instead
of DF, because on HSW+ we can break regioning rules.

v3:
- Update the calls to brw_oword_block_*_scratch().
- Remove changes in generate_scratch_read().
- Fix offset when emitting 1-Oword Block Write messages, so we
don't need to shuffle data.
- Remove DF_IVB_SCRATCH_READ() and emit_1grf_df_ivb_scratch_read()
- Remove VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_{LOW,HIGH} opcodes.
- Add support for Haswell.

Signed-off-by: Samuel Iglesias Gonsálvez <***@igalia.com>
---
src/intel/compiler/brw_vec4_generator.cpp | 59 +++++++++++++++++++++++++++++++
1 file changed, 59 insertions(+)

diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp
index 334933d15a..c0ceacd9aa 100644
--- a/src/intel/compiler/brw_vec4_generator.cpp
+++ b/src/intel/compiler/brw_vec4_generator.cpp
@@ -1192,6 +1192,65 @@ generate_scratch_write(struct brw_codegen *p,
struct brw_reg header = brw_vec8_grf(0, 0);
bool write_commit;

+ if (devinfo->gen >= 7 && type_sz(src.type) == 8) {
+ bool partial_df = inst->exec_size < 8;
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+ if (!partial_df || inst->group == 0) {
+ for (int i = 0; i < 2; i++) {
+ brw_set_default_exec_size(p, BRW_EXECUTE_4);
+ brw_set_default_mask_control(p, true);
+ struct brw_reg temp =
+ retype(suboffset(src, i * 16 / type_sz(src.type)), BRW_REGISTER_TYPE_UD);
+ temp = stride(temp, 4, 4, 1);
+
+ brw_MOV(p, brw_uvec_mrf(4, inst->base_mrf + 1, 0),
+ temp);
+ brw_set_default_mask_control(p, inst->force_writemask_all);
+ brw_set_default_exec_size(p, BRW_EXECUTE_8);
+
+ /* Offset in OWORDs */
+ brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
+ 1, 32*inst->offset + 16*i);
+ }
+ }
+
+ if (!partial_df) {
+ /* HSW can do full DF scratch writes, however we split the writes in
+ * four 1-OWord messages: two for the first GRF, two for the second.
+ *
+ * In order to emit properly the 1-OWord messages for the second GRF,
+ * we need to set the default group (which sets the nibble control)
+ * for them. We also need to fix source regiter to pick the data.
+ */
+ src = suboffset(src, 32 / type_sz(src.type));
+ brw_set_default_group(p, 4);
+ }
+
+ if (!partial_df || inst->group != 0) {
+ for (int i = 0; i < 2; i++) {
+ brw_set_default_exec_size(p, BRW_EXECUTE_4);
+ brw_set_default_mask_control(p, true);
+ struct brw_reg temp =
+ retype(suboffset(src, i * 16 / type_sz(src.type)), BRW_REGISTER_TYPE_UD);
+ temp = stride(temp, 4, 4, 1);
+
+ brw_MOV(p, brw_uvec_mrf(4, inst->base_mrf + 1, 4),
+ temp);
+
+ brw_set_default_mask_control(p, inst->force_writemask_all);
+ brw_set_default_exec_size(p, BRW_EXECUTE_8);
+
+ /* Offset in OWORDs */
+ brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
+ 1, 32*inst->offset + 16*i + 32);
+ }
+ }
+ brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ return;
+ }
+
/* If the instruction is predicated, we'll predicate the send, not
* the header setup.
*/
--
2.11.0
Samuel Iglesias Gonsálvez
2017-07-19 13:51:11 UTC
Reply
Permalink
Raw Message
Both the spill and unspill processes assume that the SIMD width lowering
and DF scalarization passes have already been run.

* Spilling process does the following:

1) Read the existing content from the scratch memory that
corresponds to the vertex (use inst->group to know whether we
are going to write data to the first or the second vertex).
As the instruction is already scalarized, we don't want to modify
the existing data of other components. We only read the content of
one GRF, as we are not going to modify the other (exec_size = 4).
2) Overwrite the component the spilled instruction writes to.
3) Do a scratch write to save the updated content of the respective
vertex to scratch memory.

* Unspilling is implemented as several scratch reads emitted when we find
the first instruction whose sources were spilled.
These scratch reads get the DF data for both vertices
because we want to have the DF data in two consecutive GRFs, even when
this first instruction only reads one (exec_size = 4). After that, no
more unspills are needed until we write new content to the
scratch memory, so we just need to update the register number in
the affected sources of the following instructions.

v2:

- Change mlen only in emit_scratch_{read, write}
- Allow partial DF writes/reads spilling on IVB+.
- Modify emit_scratch_write() to mark the partial DF read case.
- Fix size_written, it is in byte units (Curro)
- Don't do shuffling on emit_scratch_read().
- Don't do shuffling on emit_scratch_write().
- Simplify emit_scratch_read() changes.
- Simplify emit_scratch_write() changes.
- Merge reladdr changes.

Signed-off-by: Samuel Iglesias Gonsálvez <***@igalia.com>
---
src/intel/compiler/brw_vec4.cpp | 2 +
src/intel/compiler/brw_vec4.h | 9 ++-
src/intel/compiler/brw_vec4_reg_allocate.cpp | 4 +-
src/intel/compiler/brw_vec4_visitor.cpp | 112 ++++++++++++++++++++++++---
4 files changed, 109 insertions(+), 18 deletions(-)

diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp
index 410922c62b..459e37f2f5 100644
--- a/src/intel/compiler/brw_vec4.cpp
+++ b/src/intel/compiler/brw_vec4.cpp
@@ -338,6 +338,8 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
case SHADER_OPCODE_GEN4_SCRATCH_READ:
return 2;
case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
+ if (devinfo->gen >= 7 && type_sz(inst->dst.type) == 8)
+ return 2;
return 3;
case GS_OPCODE_URB_WRITE:
case GS_OPCODE_URB_WRITE_ALLOCATE:
diff --git a/src/intel/compiler/brw_vec4.h b/src/intel/compiler/brw_vec4.h
index d828da02ea..fafad1291b 100644
--- a/src/intel/compiler/brw_vec4.h
+++ b/src/intel/compiler/brw_vec4.h
@@ -291,11 +291,12 @@ public:
src_reg get_scratch_offset(bblock_t *block, vec4_instruction *inst,
src_reg *reladdr, int reg_offset);
void emit_scratch_read(bblock_t *block, vec4_instruction *inst,
- dst_reg dst,
- src_reg orig_src,
- int base_offset);
+ dst_reg dst,
+ src_reg orig_src,
+ int base_offset,
+ bool resolve_reladdr);
void emit_scratch_write(bblock_t *block, vec4_instruction *inst,
- int base_offset);
+ int base_offset, bool resolve_reladdr);
void emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
dst_reg dst,
src_reg orig_src,
diff --git a/src/intel/compiler/brw_vec4_reg_allocate.cpp b/src/intel/compiler/brw_vec4_reg_allocate.cpp
index a0ba77b867..bed3471159 100644
--- a/src/intel/compiler/brw_vec4_reg_allocate.cpp
+++ b/src/intel/compiler/brw_vec4_reg_allocate.cpp
@@ -526,7 +526,7 @@ vec4_visitor::spill_reg(int spill_reg_nr)
temp.offset = 0;
temp.swizzle = BRW_SWIZZLE_XYZW;
emit_scratch_read(block, inst,
- dst_reg(temp), inst->src[i], spill_offset);
+ dst_reg(temp), inst->src[i], spill_offset, false);
temp.offset = inst->src[i].offset;
}
assert(scratch_reg != -1);
@@ -535,7 +535,7 @@ vec4_visitor::spill_reg(int spill_reg_nr)
}

if (inst->dst.file == VGRF && inst->dst.nr == spill_reg_nr) {
- emit_scratch_write(block, inst, spill_offset);
+ emit_scratch_write(block, inst, spill_offset, false);
scratch_reg = inst->dst.nr;
}
}
diff --git a/src/intel/compiler/brw_vec4_visitor.cpp b/src/intel/compiler/brw_vec4_visitor.cpp
index 22ee4dd1c4..d798db8f17 100644
--- a/src/intel/compiler/brw_vec4_visitor.cpp
+++ b/src/intel/compiler/brw_vec4_visitor.cpp
@@ -1478,8 +1478,8 @@ vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
*/
void
vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
- dst_reg temp, src_reg orig_src,
- int base_offset)
+ dst_reg temp, src_reg orig_src,
+ int base_offset, bool resolve_reladdr)
{
assert(orig_src.offset % REG_SIZE == 0);
int reg_offset = base_offset + orig_src.offset / REG_SIZE;
@@ -1488,7 +1488,15 @@ vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,

if (type_sz(orig_src.type) < 8) {
emit_before(block, inst, SCRATCH_READ(temp, index));
- } else {
+ return;
+ }
+
+ /* The emission of DF scratch reads to resolve reladdrs is done before
+ * executing DF scalarization and lower simd width passes. As the
+ * new DF scratch read code assumes that both passes were executed before,
+ * we keep old code with data shuffling.
+ */
+ if (resolve_reladdr) {
dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
@@ -1497,7 +1505,30 @@ vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
emit_before(block, inst, last_read);
shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
+ return;
}
+
+ /* We can call this function when unspilling a partial DF read,
+ * however we unspill both GRFs in order to have the data together.
+ * Because of that, we substract the offset part of the second GRF.
+ */
+ if (inst->exec_size == 4 && inst->group == 4)
+ reg_offset = base_offset;
+
+ vec4_instruction *read = SCRATCH_READ(temp, index);
+ read->force_writemask_all = true;
+ read->exec_size = inst->exec_size;
+ read->size_written = REG_SIZE;
+
+ emit_before(block, inst, read);
+ index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
+ vec4_instruction *last_read =
+ SCRATCH_READ(byte_offset(temp, REG_SIZE), index);
+ last_read->force_writemask_all = true;
+ last_read->exec_size = inst->exec_size;
+ last_read->size_written = REG_SIZE;
+
+ emit_before(block, inst, last_read);
}

/**
@@ -1508,7 +1539,7 @@ vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
*/
void
vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
- int base_offset)
+ int base_offset, bool resolve_reladdr)
{
assert(inst->dst.offset % REG_SIZE == 0);
int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
@@ -1525,20 +1556,77 @@ vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
bool is_64bit = type_sz(inst->dst.type) == 8;
const glsl_type *alloc_type =
is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
- const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
- inst->dst.type),
- brw_swizzle_for_mask(inst->dst.writemask));
+ src_reg temp;
+
+ temp = swizzle(retype(src_reg(this, alloc_type),
+ inst->dst.type),
+ brw_swizzle_for_mask(inst->dst.writemask));
+
+ dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
+ inst->dst.writemask));

if (!is_64bit) {
- dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
- inst->dst.writemask));
vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
if (inst->opcode != BRW_OPCODE_SEL)
write->predicate = inst->predicate;
write->ir = inst->ir;
write->annotation = inst->annotation;
inst->insert_after(block, write);
- } else {
+ }
+
+ if (is_64bit && !resolve_reladdr) {
+ /* As the scratch write for this case is implemented with align1
+ * instructions, we are going to: unspill existing content, overwrite it
+ * taking into account the writemask of the original instruction and
+ * spill it again.
+ */
+ src_reg saved_value = src_reg(this, glsl_type::dvec4_type);
+ dst_reg write_value = dst_reg(saved_value);
+ write_value.writemask = inst->dst.writemask;
+ /* We use write_value to overwrite the unspilled data corresponding to
+ * one GRF in case of a partial DF write. So, if we do a partial
+ * DF write for the second vertex, we need to apply an offset.
+ */
+ if (inst->group == 4)
+ write_value = byte_offset(write_value, REG_SIZE);
+ /* Overwrite the data in the corresponding place. */
+ vec4_instruction *mov = MOV(write_value, temp);
+ mov->group = inst->group;
+ mov->exec_size = inst->exec_size;
+ mov->size_written = type_sz(temp.type) * inst->exec_size;
+
+ inst->insert_after(block, mov);
+ /* Read previously spilled data and have it in 'saved_value'.
+ * emit_scratch_read() emits the instructions before 'mov', so
+ * emit_scratch_read() is called after it.
+ */
+ emit_scratch_read(block, mov, dst_reg(saved_value),
+ temp, base_offset, false);
+
+ /* In case of a partial DF write, we are going to spill only on exec_size
+ * units of data, so we use 'write_value' as source for it. In case of
+ * full DF write, 'write_value' doesn't have applied any offset and works
+ * too.
+ */
+ vec4_instruction *write = SCRATCH_WRITE(dst, src_reg(write_value), index);
+ write->mlen = 2;
+ if (inst->opcode != BRW_OPCODE_SEL)
+ write->predicate = inst->predicate;
+ write->exec_size = inst->exec_size;
+ write->group = inst->group;
+ write->offset = base_offset;
+
+ write->ir = inst->ir;
+ write->annotation = inst->annotation;
+ mov->insert_after(block, write);
+ }
+
+ /* The emission of DF scratch writes to resolve reladdrs is done before
+ * executing DF scalarization and lower simd width passes. As the
+ * new DF scratch write code assumes that both passes were executed before,
+ * we keep old code with data shuffling.
+ */
+ if (is_64bit && resolve_reladdr) {
dst_reg shuffled = dst_reg(this, alloc_type);
vec4_instruction *last =
shuffle_64bit_data(shuffled, temp, true, block, inst);
@@ -1611,7 +1699,7 @@ vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
if (src.file == VGRF && scratch_loc[src.nr] != -1) {
dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
glsl_type::dvec4_type : glsl_type::vec4_type);
- emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
+ emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr], true);
src.nr = temp.nr;
src.offset %= REG_SIZE;
src.reladdr = NULL;
@@ -1686,7 +1774,7 @@ vec4_visitor::move_grf_array_access_to_scratch()
* accesses for dst we can safely do the scratch write for dst itself
*/
if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
- emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
+ emit_scratch_write(block, inst, scratch_loc[inst->dst.nr], true);

/* Now handle scratch access on any src. In this case, since inst->src[i]
* already is a src_reg, we can just call emit_resolve_reladdr with
--
2.11.0
Samuel Iglesias Gonsálvez
2017-07-19 13:51:12 UTC
Reply
Permalink
Raw Message
Signed-off-by: Samuel Iglesias Gonsálvez <***@igalia.com>
---
src/intel/compiler/brw_vec4_reg_allocate.cpp | 16 ++++++++++------
1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/intel/compiler/brw_vec4_reg_allocate.cpp b/src/intel/compiler/brw_vec4_reg_allocate.cpp
index bed3471159..a6f1070ebd 100644
--- a/src/intel/compiler/brw_vec4_reg_allocate.cpp
+++ b/src/intel/compiler/brw_vec4_reg_allocate.cpp
@@ -301,7 +301,7 @@ vec4_visitor::reg_allocate()
*/
static bool
can_use_scratch_for_source(const vec4_instruction *inst, unsigned i,
- unsigned scratch_reg)
+ unsigned scratch_reg, bool partial_df_read)
{
assert(inst->src[i].file == VGRF);
bool prev_inst_read_scratch_reg = false;
@@ -319,12 +319,14 @@ can_use_scratch_for_source(const vec4_instruction *inst, unsigned i,

/* If the previous instruction writes to scratch_reg then we can reuse
* it if the write is not conditional and the channels we write are
- * compatible with our read mask
+ * compatible with our read mask.
+ *
+ * Ignore partial DF read case as we will read the data for both vertices.
*/
if (prev_inst->dst.file == VGRF && prev_inst->dst.nr == scratch_reg) {
return (!prev_inst->predicate || prev_inst->opcode == BRW_OPCODE_SEL) &&
- (brw_mask_for_swizzle(inst->src[i].swizzle) &
- ~prev_inst->dst.writemask) == 0;
+ ((brw_mask_for_swizzle(inst->src[i].swizzle) &
+ ~prev_inst->dst.writemask) == 0) && !partial_df_read;
}

/* Skip scratch read/writes so that instructions generated by spilling
@@ -403,7 +405,9 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
* previous instruction, in which case we'll just reuse the scratch
* reg for this instruction.
*/
- if (!can_use_scratch_for_source(inst, i, inst->src[i].nr)) {
+ bool partial_df_read = inst->exec_size == 4 &&
+ type_sz(inst->src[i].type) == 8;
+ if (!can_use_scratch_for_source(inst, i, inst->src[i].nr, partial_df_read)) {
spill_costs[inst->src[i].nr] +=
loop_scale * spill_cost_for_type(inst->src[i].type);
if (inst->src[i].reladdr ||
@@ -514,7 +518,7 @@ vec4_visitor::spill_reg(int spill_reg_nr)
for (unsigned int i = 0; i < 3; i++) {
if (inst->src[i].file == VGRF && inst->src[i].nr == spill_reg_nr) {
if (scratch_reg == -1 ||
- !can_use_scratch_for_source(inst, i, scratch_reg)) {
+ !can_use_scratch_for_source(inst, i, scratch_reg, false)) {
/* We need to unspill anyway so make sure we read the full vec4
* in any case. This way, the cached register can be reused
* for consecutive instructions that read different channels of
--
2.11.0
Samuel Iglesias Gonsálvez
2017-07-19 13:51:13 UTC
Reply
Permalink
Raw Message
v2:
- Enable spilling for partial DF reads/writes on HSW+

Signed-off-by: Samuel Iglesias Gonsálvez <***@igalia.com>
---
src/intel/compiler/brw_vec4_reg_allocate.cpp | 54 ++++++++++++++++++++--------
1 file changed, 40 insertions(+), 14 deletions(-)

diff --git a/src/intel/compiler/brw_vec4_reg_allocate.cpp b/src/intel/compiler/brw_vec4_reg_allocate.cpp
index a6f1070ebd..3ad18b12bb 100644
--- a/src/intel/compiler/brw_vec4_reg_allocate.cpp
+++ b/src/intel/compiler/brw_vec4_reg_allocate.cpp
@@ -411,17 +411,21 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
spill_costs[inst->src[i].nr] +=
loop_scale * spill_cost_for_type(inst->src[i].type);
if (inst->src[i].reladdr ||
- inst->src[i].offset >= REG_SIZE)
+ (inst->src[i].offset >= REG_SIZE &&
+ (type_sz(inst->src[i].type) != 8 ||
+ !(inst->src[i].offset == 32 && inst->group == 4))))
no_spill[inst->src[i].nr] = true;

- /* We don't support unspills of partial DF reads.
+ /* For execsize == 8, our 64-bit unspills are implemented with
+ * two 32-bit scratch messages, each one reading that for both
+ * SIMD4x2 threads that we need to shuffle into correct 64-bit
+ * data. Ensure that we are reading data for both threads.
*
- * Our 64-bit unspills are implemented with two 32-bit scratch
- * messages, each one reading that for both SIMD4x2 threads that
- * we need to shuffle into correct 64-bit data. Ensure that we
- * are reading data for both threads.
+ * For execsize == 4, it is similar but using 1-Oword block
+ * read messages and we don't need to shuffle data.
*/
- if (type_sz(inst->src[i].type) == 8 && inst->exec_size != 8)
+ if (type_sz(inst->src[i].type) == 8 &&
+ inst->exec_size != 8 && inst->exec_size != 4)
no_spill[inst->src[i].nr] = true;
}

@@ -439,16 +443,21 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
if (inst->dst.file == VGRF && !no_spill[inst->dst.nr]) {
spill_costs[inst->dst.nr] +=
loop_scale * spill_cost_for_type(inst->dst.type);
- if (inst->dst.reladdr || inst->dst.offset >= REG_SIZE)
+ if (inst->dst.reladdr ||
+ (inst->dst.offset >= REG_SIZE &&
+ (type_sz(inst->dst.type) != 8 ||
+ !(inst->dst.offset == 32 && inst->group == 4))))
no_spill[inst->dst.nr] = true;

- /* We don't support spills of partial DF writes.
+ /* For execsize == 8, our 64-bit spills are implemented with two
+ * 32-bit scratch messages, each one writing that for both SIMD4x2
+ * threads. Ensure that we are writing data for both threads.
*
- * Our 64-bit spills are implemented with two 32-bit scratch messages,
- * each one writing that for both SIMD4x2 threads. Ensure that we
- * are writing data for both threads.
+ * For execsize == 4, it is similar but using 1-Oword block
+ * write messages.
*/
- if (type_sz(inst->dst.type) == 8 && inst->exec_size != 8)
+ if (type_sz(inst->dst.type) == 8 &&
+ inst->exec_size != 8 && inst->exec_size != 4)
no_spill[inst->dst.nr] = true;

/* We can't spill registers that mix 32-bit and 64-bit access (that
@@ -514,11 +523,25 @@ vec4_visitor::spill_reg(int spill_reg_nr)

/* Generate spill/unspill instructions for the objects being spilled. */
int scratch_reg = -1;
+ bool do_partial_df_scratch_read = false;
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
for (unsigned int i = 0; i < 3; i++) {
if (inst->src[i].file == VGRF && inst->src[i].nr == spill_reg_nr) {
+ /* DF scratch reads are not actual partial reads because we are
+ * going to read both GRFs in the first read instruction.
+ * Because of that, we will skip scratch read of the other splitted
+ * instruction (if any), as it can reuse the read value. We check
+ * the value of done_scratch_read to know if we need to do scratch
+ * read or not.
+ */
+ bool do_df_scratch_read = devinfo->gen >= 7 &&
+ type_sz(inst->src[i].type) == 8 &&
+ (inst->exec_size != 4 || do_partial_df_scratch_read);
+
if (scratch_reg == -1 ||
- !can_use_scratch_for_source(inst, i, scratch_reg, false)) {
+ (!can_use_scratch_for_source(inst, i, scratch_reg,
+ do_partial_df_scratch_read) &&
+ (do_df_scratch_read || type_sz(inst->src[i].type) != 8))) {
/* We need to unspill anyway so make sure we read the full vec4
* in any case. This way, the cached register can be reused
* for consecutive instructions that read different channels of
@@ -532,6 +555,7 @@ vec4_visitor::spill_reg(int spill_reg_nr)
emit_scratch_read(block, inst,
dst_reg(temp), inst->src[i], spill_offset, false);
temp.offset = inst->src[i].offset;
+ do_partial_df_scratch_read = false;
}
assert(scratch_reg != -1);
inst->src[i].nr = scratch_reg;
@@ -541,6 +565,8 @@ vec4_visitor::spill_reg(int spill_reg_nr)
if (inst->dst.file == VGRF && inst->dst.nr == spill_reg_nr) {
emit_scratch_write(block, inst, spill_offset, false);
scratch_reg = inst->dst.nr;
+ if (type_sz(inst->dst.type) == 8 && inst->exec_size == 4)
+ do_partial_df_scratch_read = true;
}
}
--
2.11.0
Mark Janes
2017-08-12 00:50:51 UTC
Reply
Permalink
Raw Message
This series resolves
https://bugs.freedesktop.org/show_bug.cgi?id=101985, currently blocking
17.2 release.
Post by Samuel Iglesias Gonsálvez
- Use nibctrl and the number of written/read owords to detect
each case of a 1-OWord Block Read/Write (Curro)
---
src/intel/compiler/brw_eu.h | 14 +++++-----
src/intel/compiler/brw_eu_emit.c | 46 +++++++++++++++++++++++++--------
src/intel/compiler/brw_fs_generator.cpp | 4 +--
3 files changed, 44 insertions(+), 20 deletions(-)
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index a3a9c63239..de8470b4b5 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -342,15 +342,15 @@ void brw_oword_block_read(struct brw_codegen *p,
unsigned brw_scratch_surface_idx(const struct brw_codegen *p);
void brw_oword_block_read_scratch(struct brw_codegen *p,
- struct brw_reg dest,
- struct brw_reg mrf,
- int num_regs,
- unsigned offset);
+ struct brw_reg dest,
+ struct brw_reg mrf,
+ int num_owords,
+ unsigned offset);
void brw_oword_block_write_scratch(struct brw_codegen *p,
- struct brw_reg mrf,
- int num_regs,
- unsigned offset);
+ struct brw_reg mrf,
+ int num_owords,
+ unsigned offset);
void gen7_block_read_scratch(struct brw_codegen *p,
struct brw_reg dest,
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 0b0d67a5c5..956ef263a2 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -2133,9 +2133,9 @@ brw_scratch_surface_idx(const struct brw_codegen *p)
* register spilling.
*/
void brw_oword_block_write_scratch(struct brw_codegen *p,
- struct brw_reg mrf,
- int num_regs,
- unsigned offset)
+ struct brw_reg mrf,
+ int num_owords,
+ unsigned offset)
{
const struct gen_device_info *devinfo = p->devinfo;
const unsigned target_cache =
@@ -2149,7 +2149,7 @@ void brw_oword_block_write_scratch(struct brw_codegen *p,
mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
- const unsigned mlen = 1 + num_regs;
+ const unsigned mlen = 1 + MAX2(1, num_owords / 2);
/* Set up the message header. This is g0, with g0.2 filled with
* the offset. We don't want to leave our offset around in g0 or
@@ -2180,6 +2180,18 @@ void brw_oword_block_write_scratch(struct brw_codegen *p,
int send_commit_msg;
struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
BRW_REGISTER_TYPE_UW);
+ int msg_control = BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_owords * 4);
+
+ /* By default for 1-oword, msg_control = BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
+ * fix it when we are writing the high part.
+ */
+ if (num_owords == 1 && brw_inst_nib_control(devinfo, insn) != 0) {
+ msg_control = BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH;
+ /* The messages only work with group == 0, we use the group to know which
+ * message emit (1-OWORD LOW or 1-OWORD HIGH), so reset it to zero.
+ */
+ brw_inst_set_group(devinfo, insn, 0);
+ }
brw_inst_set_compression(devinfo, insn, false);
@@ -2223,7 +2235,7 @@ void brw_oword_block_write_scratch(struct brw_codegen *p,
brw_set_dp_write_message(p,
insn,
brw_scratch_surface_idx(p),
- BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
+ msg_control,
msg_type,
target_cache,
mlen,
@@ -2245,10 +2257,10 @@ void brw_oword_block_write_scratch(struct brw_codegen *p,
*/
void
brw_oword_block_read_scratch(struct brw_codegen *p,
- struct brw_reg dest,
- struct brw_reg mrf,
- int num_regs,
- unsigned offset)
+ struct brw_reg dest,
+ struct brw_reg mrf,
+ int num_owords,
+ unsigned offset)
{
const struct gen_device_info *devinfo = p->devinfo;
@@ -2269,7 +2281,7 @@ brw_oword_block_read_scratch(struct brw_codegen *p,
}
dest = retype(dest, BRW_REGISTER_TYPE_UW);
- const unsigned rlen = num_regs;
+ const unsigned rlen = MAX2(1, num_owords / 2);
const unsigned target_cache =
@@ -2291,6 +2303,18 @@ brw_oword_block_read_scratch(struct brw_codegen *p,
{
brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
+ int msg_control = BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_owords * 4);
+
+ /* By default for 1-oword, msg_control = BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
+ * fix it when we are reading the high part.
+ */
+ if (num_owords == 1 && brw_inst_nib_control(devinfo, insn) != 0) {
+ msg_control = BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH;
+ /* The messages only work with group == 0, we use the group to know which
+ * message emit (1-OWORD LOW or 1-OWORD HIGH), so reset it to zero.
+ */
+ brw_inst_set_group(devinfo, insn, 0);
+ }
assert(brw_inst_pred_control(devinfo, insn) == 0);
brw_inst_set_compression(devinfo, insn, false);
@@ -2306,7 +2330,7 @@ brw_oword_block_read_scratch(struct brw_codegen *p,
brw_set_dp_read_message(p,
insn,
brw_scratch_surface_idx(p),
- BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
+ msg_control,
BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
target_cache,
1, /* msg_length */
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index 2ade486705..2dd28048eb 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1178,7 +1178,7 @@ fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD));
brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
- block_size,
+ block_size * 2,
inst->offset + block_size * REG_SIZE * i);
}
@@ -1192,7 +1192,7 @@ fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
assert(inst->mlen != 0);
brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
- inst->exec_size / 8, inst->offset);
+ inst->exec_size / 4, inst->offset);
}
void
--
2.11.0
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Francisco Jerez
2017-08-15 19:54:20 UTC
Reply
Permalink
Raw Message
Post by Mark Janes
This series resolves
https://bugs.freedesktop.org/show_bug.cgi?id=101985, currently blocking
17.2 release.
I have doubts that this series is ready for production. That said, I don't
think it makes much sense for Gen7 fp64 vec4 spilling to be considered a
blocking issue for the 17.2 release, does it?
Post by Mark Janes
Post by Samuel Iglesias Gonsálvez
- Use nibctrl and the number of written/read owords to detect
each case of a 1-OWord Block Read/Write (Curro)
---
src/intel/compiler/brw_eu.h | 14 +++++-----
src/intel/compiler/brw_eu_emit.c | 46 +++++++++++++++++++++++++--------
src/intel/compiler/brw_fs_generator.cpp | 4 +--
3 files changed, 44 insertions(+), 20 deletions(-)
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index a3a9c63239..de8470b4b5 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -342,15 +342,15 @@ void brw_oword_block_read(struct brw_codegen *p,
unsigned brw_scratch_surface_idx(const struct brw_codegen *p);
void brw_oword_block_read_scratch(struct brw_codegen *p,
- struct brw_reg dest,
- struct brw_reg mrf,
- int num_regs,
- unsigned offset);
+ struct brw_reg dest,
+ struct brw_reg mrf,
+ int num_owords,
+ unsigned offset);
void brw_oword_block_write_scratch(struct brw_codegen *p,
- struct brw_reg mrf,
- int num_regs,
- unsigned offset);
+ struct brw_reg mrf,
+ int num_owords,
+ unsigned offset);
void gen7_block_read_scratch(struct brw_codegen *p,
struct brw_reg dest,
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 0b0d67a5c5..956ef263a2 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -2133,9 +2133,9 @@ brw_scratch_surface_idx(const struct brw_codegen *p)
* register spilling.
*/
void brw_oword_block_write_scratch(struct brw_codegen *p,
- struct brw_reg mrf,
- int num_regs,
- unsigned offset)
+ struct brw_reg mrf,
+ int num_owords,
+ unsigned offset)
{
const struct gen_device_info *devinfo = p->devinfo;
const unsigned target_cache =
@@ -2149,7 +2149,7 @@ void brw_oword_block_write_scratch(struct brw_codegen *p,
mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
- const unsigned mlen = 1 + num_regs;
+ const unsigned mlen = 1 + MAX2(1, num_owords / 2);
/* Set up the message header. This is g0, with g0.2 filled with
* the offset. We don't want to leave our offset around in g0 or
@@ -2180,6 +2180,18 @@ void brw_oword_block_write_scratch(struct brw_codegen *p,
int send_commit_msg;
struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
BRW_REGISTER_TYPE_UW);
+ int msg_control = BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_owords * 4);
+
+ /* By default for 1-oword, msg_control = BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
+ * fix it when we are writing the high part.
+ */
+ if (num_owords == 1 && brw_inst_nib_control(devinfo, insn) != 0) {
+ msg_control = BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH;
+ /* The messages only work with group == 0. We use the group to know which
+ * message to emit (1-OWORD LOW or 1-OWORD HIGH), so reset it to zero.
+ */
+ brw_inst_set_group(devinfo, insn, 0);
+ }
brw_inst_set_compression(devinfo, insn, false);
@@ -2223,7 +2235,7 @@ void brw_oword_block_write_scratch(struct brw_codegen *p,
brw_set_dp_write_message(p,
insn,
brw_scratch_surface_idx(p),
- BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
+ msg_control,
msg_type,
target_cache,
mlen,
@@ -2245,10 +2257,10 @@ void brw_oword_block_write_scratch(struct brw_codegen *p,
*/
void
brw_oword_block_read_scratch(struct brw_codegen *p,
- struct brw_reg dest,
- struct brw_reg mrf,
- int num_regs,
- unsigned offset)
+ struct brw_reg dest,
+ struct brw_reg mrf,
+ int num_owords,
+ unsigned offset)
{
const struct gen_device_info *devinfo = p->devinfo;
@@ -2269,7 +2281,7 @@ brw_oword_block_read_scratch(struct brw_codegen *p,
}
dest = retype(dest, BRW_REGISTER_TYPE_UW);
- const unsigned rlen = num_regs;
+ const unsigned rlen = MAX2(1, num_owords / 2);
const unsigned target_cache =
@@ -2291,6 +2303,18 @@ brw_oword_block_read_scratch(struct brw_codegen *p,
{
brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
+ int msg_control = BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_owords * 4);
+
+ /* By default for 1-oword, msg_control = BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
+ * fix it when we are reading the high part.
+ */
+ if (num_owords == 1 && brw_inst_nib_control(devinfo, insn) != 0) {
+ msg_control = BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH;
+ /* The messages only work with group == 0. We use the group to know which
+ * message to emit (1-OWORD LOW or 1-OWORD HIGH), so reset it to zero.
+ */
+ brw_inst_set_group(devinfo, insn, 0);
+ }
assert(brw_inst_pred_control(devinfo, insn) == 0);
brw_inst_set_compression(devinfo, insn, false);
@@ -2306,7 +2330,7 @@ brw_oword_block_read_scratch(struct brw_codegen *p,
brw_set_dp_read_message(p,
insn,
brw_scratch_surface_idx(p),
- BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
+ msg_control,
BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
target_cache,
1, /* msg_length */
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index 2ade486705..2dd28048eb 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1178,7 +1178,7 @@ fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD));
brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
- block_size,
+ block_size * 2,
inst->offset + block_size * REG_SIZE * i);
}
@@ -1192,7 +1192,7 @@ fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
assert(inst->mlen != 0);
brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
- inst->exec_size / 8, inst->offset);
+ inst->exec_size / 4, inst->offset);
}
void
--
2.11.0
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Loading...