Discussion:
[Mesa-dev] [PATCH 03/59] compiler/spirv: implement 16-bit acos
Iago Toral Quiroga
2018-12-04 07:16:27 UTC
Permalink
---
src/compiler/spirv/vtn_glsl450.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/compiler/spirv/vtn_glsl450.c b/src/compiler/spirv/vtn_glsl450.c
index e85123725da..4345c9c61a3 100644
--- a/src/compiler/spirv/vtn_glsl450.c
+++ b/src/compiler/spirv/vtn_glsl450.c
@@ -705,8 +705,9 @@ handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint,
return;

case GLSLstd450Acos:
- val->ssa->def = nir_fsub(nb, nir_imm_float(nb, M_PI_2f),
- build_asin(nb, src[0], 0.08132463, -0.02363318));
+ val->ssa->def =
+ nir_fsub(nb, nir_imm_floatN_t(nb, M_PI_2f, src[0]->bit_size),
+ build_asin(nb, src[0], 0.08132463, -0.02363318));
return;

case GLSLstd450Atan:
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:16:26 UTC
Permalink
---
src/compiler/spirv/vtn_glsl450.c | 20 +++++++++++++-------
1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/compiler/spirv/vtn_glsl450.c b/src/compiler/spirv/vtn_glsl450.c
index 06a49e48e3f..e85123725da 100644
--- a/src/compiler/spirv/vtn_glsl450.c
+++ b/src/compiler/spirv/vtn_glsl450.c
@@ -200,19 +200,25 @@ build_log(nir_builder *b, nir_ssa_def *x)
* in each case.
*/
static nir_ssa_def *
-build_asin(nir_builder *b, nir_ssa_def *x, float p0, float p1)
+build_asin(nir_builder *b, nir_ssa_def *x, float _p0, float _p1)
{
+ nir_ssa_def *p0 = nir_imm_floatN_t(b, _p0, x->bit_size);
+ nir_ssa_def *p1 = nir_imm_floatN_t(b, _p1, x->bit_size);
+ nir_ssa_def *one = nir_imm_floatN_t(b, 1.0f, x->bit_size);
+ nir_ssa_def *m_pi_2 = nir_imm_floatN_t(b, M_PI_2f, x->bit_size);
+ nir_ssa_def *m_pi_4_minus_one =
+ nir_imm_floatN_t(b, M_PI_4f - 1.0f, x->bit_size);
nir_ssa_def *abs_x = nir_fabs(b, x);
return nir_fmul(b, nir_fsign(b, x),
- nir_fsub(b, nir_imm_float(b, M_PI_2f),
- nir_fmul(b, nir_fsqrt(b, nir_fsub(b, nir_imm_float(b, 1.0f), abs_x)),
- nir_fadd(b, nir_imm_float(b, M_PI_2f),
+ nir_fsub(b, m_pi_2,
+ nir_fmul(b, nir_fsqrt(b, nir_fsub(b, one, abs_x)),
+ nir_fadd(b, m_pi_2,
nir_fmul(b, abs_x,
- nir_fadd(b, nir_imm_float(b, M_PI_4f - 1.0f),
+ nir_fadd(b, m_pi_4_minus_one,
nir_fmul(b, abs_x,
- nir_fadd(b, nir_imm_float(b, p0),
+ nir_fadd(b, p0,
nir_fmul(b, abs_x,
- nir_imm_float(b, p1))))))))));
+ p1)))))))));
}

/**
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:16:29 UTC
Permalink
v2: fix huge_val for 16-bit, it was meant to be 2^14 not 10^14.
---
src/compiler/spirv/vtn_glsl450.c | 21 ++++++++++++++-------
1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/compiler/spirv/vtn_glsl450.c b/src/compiler/spirv/vtn_glsl450.c
index 9cda80c5137..f05531dbad6 100644
--- a/src/compiler/spirv/vtn_glsl450.c
+++ b/src/compiler/spirv/vtn_glsl450.c
@@ -304,8 +304,11 @@ build_atan(nir_builder *b, nir_ssa_def *y_over_x)
static nir_ssa_def *
build_atan2(nir_builder *b, nir_ssa_def *y, nir_ssa_def *x)
{
- nir_ssa_def *zero = nir_imm_float(b, 0);
- nir_ssa_def *one = nir_imm_float(b, 1);
+ assert(y->bit_size == x->bit_size);
+ const uint32_t bit_size = x->bit_size;
+
+ nir_ssa_def *zero = nir_imm_floatN_t(b, 0, bit_size);
+ nir_ssa_def *one = nir_imm_floatN_t(b, 1, bit_size);

/* If we're on the left half-plane rotate the coordinates π/2 clock-wise
* for the y=0 discontinuity to end up aligned with the vertical
@@ -335,9 +338,10 @@ build_atan2(nir_builder *b, nir_ssa_def *y, nir_ssa_def *x)
* floating point representations with at least the dynamic range of ATI's
* 24-bit representation.
*/
- nir_ssa_def *huge = nir_imm_float(b, 1e18f);
+ const double huge_val = bit_size >= 32 ? 1e18 : 16384;
+ nir_ssa_def *huge = nir_imm_floatN_t(b, huge_val, bit_size);
nir_ssa_def *scale = nir_bcsel(b, nir_fge(b, nir_fabs(b, t), huge),
- nir_imm_float(b, 0.25), one);
+ nir_imm_floatN_t(b, 0.25, bit_size), one);
nir_ssa_def *rcp_scaled_t = nir_frcp(b, nir_fmul(b, t, scale));
nir_ssa_def *s_over_t = nir_fmul(b, nir_fmul(b, s, scale), rcp_scaled_t);

@@ -364,9 +368,12 @@ build_atan2(nir_builder *b, nir_ssa_def *y, nir_ssa_def *x)
/* Calculate the arctangent and fix up the result if we had flipped the
* coordinate system.
*/
- nir_ssa_def *arc = nir_fadd(b, nir_fmul(b, nir_b2f(b, flip),
- nir_imm_float(b, M_PI_2f)),
- build_atan(b, tan));
+ nir_ssa_def *b2f_flip = nir_b2f(b, flip);
+ b2f_flip->bit_size = bit_size;
+ nir_ssa_def *arc =
+ nir_fadd(b, nir_fmul(b, b2f_flip,
+ nir_imm_floatN_t(b, M_PI_2f, bit_size)),
+ build_atan(b, tan));

/* Rather convoluted calculation of the sign of the result. When x < 0 we
* cannot use fsign because we need to be able to distinguish between
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:16:28 UTC
Permalink
---
src/compiler/spirv/vtn_glsl450.c | 36 +++++++++++++++++++++-----------
1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/src/compiler/spirv/vtn_glsl450.c b/src/compiler/spirv/vtn_glsl450.c
index 4345c9c61a3..9cda80c5137 100644
--- a/src/compiler/spirv/vtn_glsl450.c
+++ b/src/compiler/spirv/vtn_glsl450.c
@@ -238,8 +238,10 @@ build_fsum(nir_builder *b, nir_ssa_def **xs, int terms)
static nir_ssa_def *
build_atan(nir_builder *b, nir_ssa_def *y_over_x)
{
+ const uint32_t bit_size = y_over_x->bit_size;
+
nir_ssa_def *abs_y_over_x = nir_fabs(b, y_over_x);
- nir_ssa_def *one = nir_imm_float(b, 1.0f);
+ nir_ssa_def *one = nir_imm_floatN_t(b, 1.0f, bit_size);

/*
* range-reduction, first step:
@@ -265,25 +267,35 @@ build_atan(nir_builder *b, nir_ssa_def *y_over_x)
nir_ssa_def *x_9 = nir_fmul(b, x_7, x_2);
nir_ssa_def *x_11 = nir_fmul(b, x_9, x_2);

+ const float coef[] = {
+ 0.9999793128310355f,
+ -0.3326756418091246f,
+ 0.1938924977115610f,
+ -0.1173503194786851f,
+ 0.0536813784310406f,
+ -0.0121323213173444f,
+ };
+
nir_ssa_def *polynomial_terms[] = {
- nir_fmul(b, x, nir_imm_float(b, 0.9999793128310355f)),
- nir_fmul(b, x_3, nir_imm_float(b, -0.3326756418091246f)),
- nir_fmul(b, x_5, nir_imm_float(b, 0.1938924977115610f)),
- nir_fmul(b, x_7, nir_imm_float(b, -0.1173503194786851f)),
- nir_fmul(b, x_9, nir_imm_float(b, 0.0536813784310406f)),
- nir_fmul(b, x_11, nir_imm_float(b, -0.0121323213173444f)),
+ nir_fmul(b, x, nir_imm_floatN_t(b, coef[0], bit_size)),
+ nir_fmul(b, x_3, nir_imm_floatN_t(b, coef[1], bit_size)),
+ nir_fmul(b, x_5, nir_imm_floatN_t(b, coef[2], bit_size)),
+ nir_fmul(b, x_7, nir_imm_floatN_t(b, coef[3], bit_size)),
+ nir_fmul(b, x_9, nir_imm_floatN_t(b, coef[4], bit_size)),
+ nir_fmul(b, x_11, nir_imm_floatN_t(b, coef[5], bit_size)),
};

nir_ssa_def *tmp =
build_fsum(b, polynomial_terms, ARRAY_SIZE(polynomial_terms));

/* range-reduction fixup */
+ nir_ssa_def *minus_2 = nir_imm_floatN_t(b, -2.0f, bit_size);
+ nir_ssa_def *m_pi_2 = nir_imm_floatN_t(b, M_PI_2f, bit_size);
+ nir_ssa_def *b2f = nir_b2f(b, nir_flt(b, one, abs_y_over_x));
+ b2f->bit_size = bit_size; /* do we prefer b2f<bitsize> opcodes? */
tmp = nir_fadd(b, tmp,
- nir_fmul(b,
- nir_b2f(b, nir_flt(b, one, abs_y_over_x)),
- nir_fadd(b, nir_fmul(b, tmp,
- nir_imm_float(b, -2.0f)),
- nir_imm_float(b, M_PI_2f))));
+ nir_fmul(b, b2f,
+ nir_fadd(b, nir_fmul(b, tmp, minus_2), m_pi_2)));

/* sign fixup */
return nir_fmul(b, tmp, nir_fsign(b, y_over_x));
--
2.17.1
Jason Ekstrand
2018-12-05 17:33:41 UTC
Permalink
Post by Iago Toral Quiroga
---
src/compiler/spirv/vtn_glsl450.c | 36 +++++++++++++++++++++-----------
1 file changed, 24 insertions(+), 12 deletions(-)
diff --git a/src/compiler/spirv/vtn_glsl450.c
b/src/compiler/spirv/vtn_glsl450.c
index 4345c9c61a3..9cda80c5137 100644
--- a/src/compiler/spirv/vtn_glsl450.c
+++ b/src/compiler/spirv/vtn_glsl450.c
@@ -238,8 +238,10 @@ build_fsum(nir_builder *b, nir_ssa_def **xs, int terms)
static nir_ssa_def *
build_atan(nir_builder *b, nir_ssa_def *y_over_x)
{
+ const uint32_t bit_size = y_over_x->bit_size;
+
nir_ssa_def *abs_y_over_x = nir_fabs(b, y_over_x);
- nir_ssa_def *one = nir_imm_float(b, 1.0f);
+ nir_ssa_def *one = nir_imm_floatN_t(b, 1.0f, bit_size);
/*
@@ -265,25 +267,35 @@ build_atan(nir_builder *b, nir_ssa_def *y_over_x)
nir_ssa_def *x_9 = nir_fmul(b, x_7, x_2);
nir_ssa_def *x_11 = nir_fmul(b, x_9, x_2);
+ const float coef[] = {
+ 0.9999793128310355f,
+ -0.3326756418091246f,
+ 0.1938924977115610f,
+ -0.1173503194786851f,
+ 0.0536813784310406f,
+ -0.0121323213173444f,
+ };
+
nir_ssa_def *polynomial_terms[] = {
- nir_fmul(b, x, nir_imm_float(b, 0.9999793128310355f)),
- nir_fmul(b, x_3, nir_imm_float(b, -0.3326756418091246f)),
- nir_fmul(b, x_5, nir_imm_float(b, 0.1938924977115610f)),
- nir_fmul(b, x_7, nir_imm_float(b, -0.1173503194786851f)),
- nir_fmul(b, x_9, nir_imm_float(b, 0.0536813784310406f)),
- nir_fmul(b, x_11, nir_imm_float(b, -0.0121323213173444f)),
+ nir_fmul(b, x, nir_imm_floatN_t(b, coef[0], bit_size)),
+ nir_fmul(b, x_3, nir_imm_floatN_t(b, coef[1], bit_size)),
+ nir_fmul(b, x_5, nir_imm_floatN_t(b, coef[2], bit_size)),
+ nir_fmul(b, x_7, nir_imm_floatN_t(b, coef[3], bit_size)),
+ nir_fmul(b, x_9, nir_imm_floatN_t(b, coef[4], bit_size)),
+ nir_fmul(b, x_11, nir_imm_floatN_t(b, coef[5], bit_size)),
Any reason why you split the coefficients out into their own array? Just
to avoid line wrapping?

In a recent commit, I added nir_iadd_imm and nir_imul_imm helpers for
multiplying by or adding an immediate. It might be worth doing the same
thing for multiplying and add by floats for these kinds of computations.
Post by Iago Toral Quiroga
};
nir_ssa_def *tmp =
build_fsum(b, polynomial_terms, ARRAY_SIZE(polynomial_terms));
/* range-reduction fixup */
+ nir_ssa_def *minus_2 = nir_imm_floatN_t(b, -2.0f, bit_size);
+ nir_ssa_def *m_pi_2 = nir_imm_floatN_t(b, M_PI_2f, bit_size);
+ nir_ssa_def *b2f = nir_b2f(b, nir_flt(b, one, abs_y_over_x));
+ b2f->bit_size = bit_size; /* do we prefer b2f<bitsize> opcodes? */
I have patches which will fix this at least somewhat. For right now, what
you did there is fine.
Post by Iago Toral Quiroga
tmp = nir_fadd(b, tmp,
- nir_fmul(b,
- nir_b2f(b, nir_flt(b, one, abs_y_over_x)),
- nir_fadd(b, nir_fmul(b, tmp,
- nir_imm_float(b, -2.0f)),
- nir_imm_float(b, M_PI_2f))));
+ nir_fmul(b, b2f,
+ nir_fadd(b, nir_fmul(b, tmp, minus_2), m_pi_2)));
/* sign fixup */
return nir_fmul(b, tmp, nir_fsign(b, y_over_x));
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral
2018-12-06 10:00:44 UTC
Permalink
Post by Jason Ekstrand
Post by Iago Toral Quiroga
---
src/compiler/spirv/vtn_glsl450.c | 36 +++++++++++++++++++++-------
----
1 file changed, 24 insertions(+), 12 deletions(-)
diff --git a/src/compiler/spirv/vtn_glsl450.c
b/src/compiler/spirv/vtn_glsl450.c
index 4345c9c61a3..9cda80c5137 100644
--- a/src/compiler/spirv/vtn_glsl450.c
+++ b/src/compiler/spirv/vtn_glsl450.c
@@ -238,8 +238,10 @@ build_fsum(nir_builder *b, nir_ssa_def **xs, int terms)
static nir_ssa_def *
build_atan(nir_builder *b, nir_ssa_def *y_over_x)
{
+ const uint32_t bit_size = y_over_x->bit_size;
+
nir_ssa_def *abs_y_over_x = nir_fabs(b, y_over_x);
- nir_ssa_def *one = nir_imm_float(b, 1.0f);
+ nir_ssa_def *one = nir_imm_floatN_t(b, 1.0f, bit_size);
/*
@@ -265,25 +267,35 @@ build_atan(nir_builder *b, nir_ssa_def
*y_over_x)
nir_ssa_def *x_9 = nir_fmul(b, x_7, x_2);
nir_ssa_def *x_11 = nir_fmul(b, x_9, x_2);
+ const float coef[] = {
+ 0.9999793128310355f,
+ -0.3326756418091246f,
+ 0.1938924977115610f,
+ -0.1173503194786851f,
+ 0.0536813784310406f,
+ -0.0121323213173444f,
+ };
+
nir_ssa_def *polynomial_terms[] = {
- nir_fmul(b, x, nir_imm_float(b, 0.9999793128310355f)),
- nir_fmul(b, x_3, nir_imm_float(b, -0.3326756418091246f)),
- nir_fmul(b, x_5, nir_imm_float(b, 0.1938924977115610f)),
- nir_fmul(b, x_7, nir_imm_float(b, -0.1173503194786851f)),
- nir_fmul(b, x_9, nir_imm_float(b, 0.0536813784310406f)),
- nir_fmul(b, x_11, nir_imm_float(b, -0.0121323213173444f)),
+ nir_fmul(b, x, nir_imm_floatN_t(b, coef[0], bit_size)),
+ nir_fmul(b, x_3, nir_imm_floatN_t(b, coef[1], bit_size)),
+ nir_fmul(b, x_5, nir_imm_floatN_t(b, coef[2], bit_size)),
+ nir_fmul(b, x_7, nir_imm_floatN_t(b, coef[3], bit_size)),
+ nir_fmul(b, x_9, nir_imm_floatN_t(b, coef[4], bit_size)),
+ nir_fmul(b, x_11, nir_imm_floatN_t(b, coef[5], bit_size)),
Any reason why you split the coefficients out into their own array?
Just to avoid line wrapping?
Yes, I think that was the only reason.
Post by Jason Ekstrand
In a recent commit, I added nir_iadd_imm and nir_imul_imm helpers for
multiplying by or adding an immediate. It might be worth doing the
same thing for multiplying and add by floats for these kinds of
computations.
Right, yes, that is probably a good idea, I'll do that.
Post by Jason Ekstrand
Post by Iago Toral Quiroga
};
nir_ssa_def *tmp =
build_fsum(b, polynomial_terms,
ARRAY_SIZE(polynomial_terms));
/* range-reduction fixup */
+ nir_ssa_def *minus_2 = nir_imm_floatN_t(b, -2.0f, bit_size);
+ nir_ssa_def *m_pi_2 = nir_imm_floatN_t(b, M_PI_2f, bit_size);
+ nir_ssa_def *b2f = nir_b2f(b, nir_flt(b, one, abs_y_over_x));
+ b2f->bit_size = bit_size; /* do we prefer b2f<bitsize> opcodes? */
I have patches which will fix this at least somewhat. For right now,
what you did there is fine.
Ah yes, I imagine it is likely that your patches will land ahead of
this series so I should remember to edit this when that happens.
Post by Jason Ekstrand
Post by Iago Toral Quiroga
tmp = nir_fadd(b, tmp,
- nir_fmul(b,
- nir_b2f(b, nir_flt(b, one,
abs_y_over_x)),
- nir_fadd(b, nir_fmul(b, tmp,
- nir_imm_float(b, -2.0f)),
- nir_imm_float(b,
M_PI_2f))));
+ nir_fmul(b, b2f,
+ nir_fadd(b, nir_fmul(b, tmp,
minus_2), m_pi_2)));
/* sign fixup */
return nir_fmul(b, tmp, nir_fsign(b, y_over_x));
Iago Toral Quiroga
2018-12-04 07:16:34 UTC
Permalink
Signed-off-by: Samuel Iglesias Gonsálvez <***@igalia.com>
---
src/intel/compiler/brw_fs_nir.cpp | 41 +++++++++++++++++++++++++++++++
1 file changed, 41 insertions(+)

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 6eb68794f58..7294f49ddc0 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -796,6 +796,47 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
case nir_op_f2f64:
case nir_op_f2i64:
case nir_op_f2u64:
+ /* BDW PRM, vol02, Command Reference Instructions, mov - MOVE:
+ *
+ * "There is no direct conversion from HF to DF or DF to HF.
+ * Use two instructions and F (Float) as an intermediate type.
+ *
+ * There is no direct conversion from HF to Q/UQ or Q/UQ to HF.
+ * Use two instructions and F (Float) or a word integer type
+ * or a DWord integer type as an intermediate type."
+ */
+ if (nir_src_bit_size(instr->src[0].src) == 16) {
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
+ inst = bld.MOV(tmp, op[0]);
+ inst->saturate = instr->dest.saturate;
+ op[0] = tmp;
+ }
+
+ /* CHV PRM, vol07, 3D Media GPGPU Engine, Register Region Restrictions:
+ *
+ * "When source or destination is 64b (...), regioning in Align1
+ * must follow these rules:
+ *
+ * 1. Source and destination horizontal stride must be aligned to
+ * the same qword.
+ * (...)"
+ *
+ * This means that conversions from bit-sizes smaller than 64-bit to
+ * 64-bit need to have the source data elements aligned to 64-bit.
+ * This restriction does not apply to BDW and later.
+ */
+ if (type_sz(result.type) == 8 && type_sz(op[0].type) < 8 &&
+ (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
+ fs_reg tmp = bld.vgrf(result.type, 1);
+ tmp = subscript(tmp, op[0].type, 0);
+ inst = bld.MOV(tmp, op[0]);
+ op[0] = tmp;
+ }
+
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
case nir_op_i2f64:
case nir_op_i2i64:
case nir_op_u2f64:
--
2.17.1
Pohjolainen, Topi
2018-12-04 12:33:25 UTC
Permalink
Post by Iago Toral Quiroga
---
src/intel/compiler/brw_fs_nir.cpp | 41 +++++++++++++++++++++++++++++++
1 file changed, 41 insertions(+)
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 6eb68794f58..7294f49ddc0 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -796,6 +796,47 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
+ *
+ * "There is no direct conversion from HF to DF or DF to HF.
+ * Use two instructions and F (Float) as an intermediate type.
+ *
+ * There is no direct conversion from HF to Q/UQ or Q/UQ to HF.
+ * Use two instructions and F (Float) or a word integer type
+ * or a DWord integer type as an intermediate type."
+ */
+ if (nir_src_bit_size(instr->src[0].src) == 16) {
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
+ inst = bld.MOV(tmp, op[0]);
+ inst->saturate = instr->dest.saturate;
+ op[0] = tmp;
+ }
+
+ *
+ * "When source or destination is 64b (...), regioning in Align1
+ *
+ * 1. Source and destination horizontal stride must be aligned to
+ * the same qword.
+ * (...)"
+ *
+ * This means that conversions from bit-sizes smaller than 64-bit to
+ * 64-bit need to have the source data elements aligned to 64-bit.
+ * This restriction does not apply to BDW and later.
+ */
+ if (type_sz(result.type) == 8 && type_sz(op[0].type) < 8 &&
+ (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
+ fs_reg tmp = bld.vgrf(result.type, 1);
+ tmp = subscript(tmp, op[0].type, 0);
+ inst = bld.MOV(tmp, op[0]);
+ op[0] = tmp;
+ }
For this second part we seem to have similar logic further down after
"nir_op_u2u64" (not visible here) in master? Would it be possible to fallthru
from here and re-use that?
Post by Iago Toral Quiroga
+
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Pohjolainen, Topi
2018-12-04 16:10:42 UTC
Permalink
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
---
src/intel/compiler/brw_fs_nir.cpp | 41 +++++++++++++++++++++++++++++++
1 file changed, 41 insertions(+)
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 6eb68794f58..7294f49ddc0 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -796,6 +796,47 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
+ *
+ * "There is no direct conversion from HF to DF or DF to HF.
+ * Use two instructions and F (Float) as an intermediate type.
+ *
+ * There is no direct conversion from HF to Q/UQ or Q/UQ to HF.
+ * Use two instructions and F (Float) or a word integer type
+ * or a DWord integer type as an intermediate type."
+ */
+ if (nir_src_bit_size(instr->src[0].src) == 16) {
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
+ inst = bld.MOV(tmp, op[0]);
+ inst->saturate = instr->dest.saturate;
+ op[0] = tmp;
+ }
+
+ *
+ * "When source or destination is 64b (...), regioning in Align1
+ *
+ * 1. Source and destination horizontal stride must be aligned to
+ * the same qword.
+ * (...)"
+ *
+ * This means that conversions from bit-sizes smaller than 64-bit to
+ * 64-bit need to have the source data elements aligned to 64-bit.
+ * This restriction does not apply to BDW and later.
+ */
+ if (type_sz(result.type) == 8 && type_sz(op[0].type) < 8 &&
+ (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
+ fs_reg tmp = bld.vgrf(result.type, 1);
+ tmp = subscript(tmp, op[0].type, 0);
+ inst = bld.MOV(tmp, op[0]);
+ op[0] = tmp;
+ }
For this second part we seem to have similar logic further down after
"nir_op_u2u64" (not visible here) in master? Would it be possible to fallthru
from here and re-use that?
And after reading it more carefully myself, it looks like this is actually
cleaner.

I noticed that in the nir_op_u2u64 case the destination and source sizes are
checked using:

if (nir_dest_bit_size(instr->dest.dest) == 64 &&
nir_src_bit_size(instr->src[0].src) < 64 &&
...

Should we use the same here for consistency?
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
+
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral
2018-12-05 08:20:57 UTC
Permalink
Post by Pohjolainen, Topi
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
---
src/intel/compiler/brw_fs_nir.cpp | 41
+++++++++++++++++++++++++++++++
1 file changed, 41 insertions(+)
diff --git a/src/intel/compiler/brw_fs_nir.cpp
b/src/intel/compiler/brw_fs_nir.cpp
index 6eb68794f58..7294f49ddc0 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -796,6 +796,47 @@ fs_visitor::nir_emit_alu(const fs_builder
&bld, nir_alu_instr *instr)
+ *
+ * "There is no direct conversion from HF to DF or DF to HF.
+ * Use two instructions and F (Float) as an
intermediate type.
+ *
+ * There is no direct conversion from HF to Q/UQ or Q/UQ to HF.
+ * Use two instructions and F (Float) or a word integer type
+ * or a DWord integer type as an intermediate type."
+ */
+ if (nir_src_bit_size(instr->src[0].src) == 16) {
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
+ inst = bld.MOV(tmp, op[0]);
+ inst->saturate = instr->dest.saturate;
+ op[0] = tmp;
+ }
+
+ *
+ * "When source or destination is 64b (...), regioning in Align1
+ *
+ * 1. Source and destination horizontal stride must be aligned to
+ * the same qword.
+ * (...)"
+ *
+ * This means that conversions from bit-sizes smaller than 64-bit to
+ * 64-bit need to have the source data elements aligned to 64-bit.
+ * This restriction does not apply to BDW and later.
+ */
+ if (type_sz(result.type) == 8 && type_sz(op[0].type) < 8 &&
+ (devinfo->is_cherryview ||
gen_device_info_is_9lp(devinfo))) {
+ fs_reg tmp = bld.vgrf(result.type, 1);
+ tmp = subscript(tmp, op[0].type, 0);
+ inst = bld.MOV(tmp, op[0]);
+ op[0] = tmp;
+ }
For this second part we seem to have similar logic further down after
"nir_op_u2u64" (not visible here) in master? Would it be possible to fallthru
from here and re-use that?
And after reading it more carefully myself it looks that this is actually
cleaner.
I noticed that in the nir_op_u2u64 case the destination and source sizes are
if (nir_dest_bit_size(instr->dest.dest) == 64 &&
nir_src_bit_size(instr->src[0].src) < 64 &&
...
Should we use the same here for consistency?
Right above this we can rewrite op[0] with a temporary that would be
different from instr->src[0].src so we can't check the nir sources any
more. Also, by the end of the series, when we incorporate 8-bit
conversions there will be more cases like this that we need to account
for and we end up rewriting the u2u64 case to this style as well.

Iago
Post by Pohjolainen, Topi
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
+
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Pohjolainen, Topi
2018-12-05 08:57:59 UTC
Permalink
Post by Iago Toral
Post by Pohjolainen, Topi
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
---
src/intel/compiler/brw_fs_nir.cpp | 41
+++++++++++++++++++++++++++++++
1 file changed, 41 insertions(+)
diff --git a/src/intel/compiler/brw_fs_nir.cpp
b/src/intel/compiler/brw_fs_nir.cpp
index 6eb68794f58..7294f49ddc0 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -796,6 +796,47 @@ fs_visitor::nir_emit_alu(const fs_builder
&bld, nir_alu_instr *instr)
+ *
+ * "There is no direct conversion from HF to DF or DF to HF.
+ * Use two instructions and F (Float) as an
intermediate type.
+ *
+ * There is no direct conversion from HF to Q/UQ or Q/UQ to HF.
+ * Use two instructions and F (Float) or a word integer type
+ * or a DWord integer type as an intermediate type."
+ */
+ if (nir_src_bit_size(instr->src[0].src) == 16) {
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
+ inst = bld.MOV(tmp, op[0]);
+ inst->saturate = instr->dest.saturate;
+ op[0] = tmp;
+ }
+
+ *
+ * "When source or destination is 64b (...), regioning in Align1
+ *
+ * 1. Source and destination horizontal stride must be aligned to
+ * the same qword.
+ * (...)"
+ *
+ * This means that conversions from bit-sizes smaller than 64-bit to
+ * 64-bit need to have the source data elements aligned to 64-bit.
+ * This restriction does not apply to BDW and later.
+ */
+ if (type_sz(result.type) == 8 && type_sz(op[0].type) < 8 &&
+ (devinfo->is_cherryview ||
gen_device_info_is_9lp(devinfo))) {
+ fs_reg tmp = bld.vgrf(result.type, 1);
+ tmp = subscript(tmp, op[0].type, 0);
+ inst = bld.MOV(tmp, op[0]);
+ op[0] = tmp;
+ }
For this second part we seem to have similar logic further down after
"nir_op_u2u64" (not visible here) in master? Would it be possible to fallthru
from here and re-use that?
And after reading it more carefully myself it looks that this is actually
cleaner.
I noticed that in the nir_op_u2u64 case the destination and source sizes are
if (nir_dest_bit_size(instr->dest.dest) == 64 &&
nir_src_bit_size(instr->src[0].src) < 64 &&
...
Should we use the same here for consistency?
Right above this we can rewrite op[0] with a temporary that would be
different from instr->src[0].src so we can't check the nir sources any
more. Also, by the end of the series, when we incorporate 8-bit
conversions there will be more cases like this that we need to account
for and we end up rewriting the u2u64 case to this style as well.
Ok, thanks for the explanation! This patch is:

Reviewed-by: Topi Pohjolainen <***@intel.com>
Jason Ekstrand
2018-12-07 15:32:33 UTC
Permalink
Would it be easier to split it into two instructions in NIR and just
implement the two conversions in the back-end? I suppose structuring
things this way, it's probably fairly easy to just do it in the back-end.
I guess that's ok.
Post by Iago Toral Quiroga
---
src/intel/compiler/brw_fs_nir.cpp | 41 +++++++++++++++++++++++++++++++
1 file changed, 41 insertions(+)
diff --git a/src/intel/compiler/brw_fs_nir.cpp
b/src/intel/compiler/brw_fs_nir.cpp
index 6eb68794f58..7294f49ddc0 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -796,6 +796,47 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
+ *
+ * "There is no direct conversion from HF to DF or DF to HF.
+ * Use two instructions and F (Float) as an intermediate type.
+ *
+ * There is no direct conversion from HF to Q/UQ or Q/UQ to HF.
+ * Use two instructions and F (Float) or a word integer type
+ * or a DWord integer type as an intermediate type."
+ */
+ if (nir_src_bit_size(instr->src[0].src) == 16) {
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
+ inst = bld.MOV(tmp, op[0]);
+ inst->saturate = instr->dest.saturate;
+ op[0] = tmp;
+ }
+
+ *
+ * "When source or destination is 64b (...), regioning in Align1
+ *
+ * 1. Source and destination horizontal stride must be aligned to
+ * the same qword.
+ * (...)"
+ *
+ * This means that conversions from bit-sizes smaller than 64-bit to
+ * 64-bit need to have the source data elements aligned to 64-bit.
+ * This restriction does not apply to BDW and later.
+ */
+ if (type_sz(result.type) == 8 && type_sz(op[0].type) < 8 &&
+ (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
+ fs_reg tmp = bld.vgrf(result.type, 1);
+ tmp = subscript(tmp, op[0].type, 0);
+ inst = bld.MOV(tmp, op[0]);
+ op[0] = tmp;
+ }
+
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral Quiroga
2018-12-04 07:16:30 UTC
Permalink
Reviewed-by: Jason Ekstrand <***@jlekstrand.net>
---
src/compiler/spirv/vtn_glsl450.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/compiler/spirv/vtn_glsl450.c b/src/compiler/spirv/vtn_glsl450.c
index f05531dbad6..8bdef9db822 100644
--- a/src/compiler/spirv/vtn_glsl450.c
+++ b/src/compiler/spirv/vtn_glsl450.c
@@ -177,7 +177,7 @@ matrix_inverse(struct vtn_builder *b, struct vtn_ssa_value *src)
static nir_ssa_def *
build_exp(nir_builder *b, nir_ssa_def *x)
{
- return nir_fexp2(b, nir_fmul(b, x, nir_imm_float(b, M_LOG2E)));
+ return nir_fexp2(b, nir_fmul(b, x, nir_imm_floatN_t(b, M_LOG2E, x->bit_size)));
}

/**
@@ -186,7 +186,8 @@ build_exp(nir_builder *b, nir_ssa_def *x)
static nir_ssa_def *
build_log(nir_builder *b, nir_ssa_def *x)
{
- return nir_fmul(b, nir_flog2(b, x), nir_imm_float(b, 1.0 / M_LOG2E));
+ return nir_fmul(b, nir_flog2(b, x),
+ nir_imm_floatN_t(b, 1.0 / M_LOG2E, x->bit_size));
}

/**
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:16:41 UTC
Permalink
The PRM states that half-float operands are supported since gen9.
---
src/intel/compiler/brw_eu_emit.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 4630b83b1a0..5f066d17a1f 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -1860,8 +1860,10 @@ void gen6_math(struct brw_codegen *p,
assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
(devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
} else {
- assert(src0.type == BRW_REGISTER_TYPE_F);
- assert(src1.type == BRW_REGISTER_TYPE_F);
+ assert(src0.type == BRW_REGISTER_TYPE_F ||
+ (src0.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
+ assert(src1.type == BRW_REGISTER_TYPE_F ||
+ (src1.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
}

/* Source modifiers are ignored for extended math instructions on Gen6. */
--
2.17.1
Pohjolainen, Topi
2018-12-04 17:22:21 UTC
Permalink
Post by Iago Toral Quiroga
The PRM states that half-float operands are supported since gen9.
---
src/intel/compiler/brw_eu_emit.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 4630b83b1a0..5f066d17a1f 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -1860,8 +1860,10 @@ void gen6_math(struct brw_codegen *p,
assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
(devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
} else {
- assert(src0.type == BRW_REGISTER_TYPE_F);
- assert(src1.type == BRW_REGISTER_TYPE_F);
+ assert(src0.type == BRW_REGISTER_TYPE_F ||
+ (src0.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
+ assert(src1.type == BRW_REGISTER_TYPE_F ||
+ (src1.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
}
/* Source modifiers are ignored for extended math instructions on Gen6. */
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral Quiroga
2018-12-04 07:16:33 UTC
Permalink
The 16-bit polynomial execution doesn't meet Khronos precision requirements.
Also, the half-float denorm range starts at 2^(-14) and with asin taking input
values in the range [0, 1], polynomial approximations can lead to flushing
relatively easily.

An alternative is to use the atan2 formula to compute asin, which is the
reference taken by Khronos to determine precision requirements, but that
ends up generating too many additional instructions when compared to the
polynomial approximation. Specifically, for the Intel case, doing this
adds +41 instructions to the program for each asin/acos call, which looks
like an undesirable trade off.

So for now we take the easy way out and fallback to using the 32-bit
polynomial approximation, which is better (faster) than the 16-bit atan2
implementation and gives us better precision that matches Khronos
requirements.
---
src/compiler/spirv/vtn_glsl450.c | 21 +++++++++++++++++++--
1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/compiler/spirv/vtn_glsl450.c b/src/compiler/spirv/vtn_glsl450.c
index bb340c87416..64a1431ae14 100644
--- a/src/compiler/spirv/vtn_glsl450.c
+++ b/src/compiler/spirv/vtn_glsl450.c
@@ -201,8 +201,20 @@ build_log(nir_builder *b, nir_ssa_def *x)
* in each case.
*/
static nir_ssa_def *
-build_asin(nir_builder *b, nir_ssa_def *x, float _p0, float _p1)
+build_asin(nir_builder *b, nir_ssa_def *_x, float _p0, float _p1)
{
+ /* The polynomial approximation isn't precise enough to meet half-float
+ * precision requirements. Alternatively, we could implement this using
+ * the formula:
+ *
+ * asin(x) = atan2(x, sqrt(1 - x*x))
+ *
+ * But that is very expensive, so instead we just do the polynomial
+ * approximation in 32-bit math and then we convert the result back to
+ * 16-bit.
+ */
+ nir_ssa_def *x = _x->bit_size == 16 ? nir_f2f32(b, _x) : _x;
+
nir_ssa_def *p0 = nir_imm_floatN_t(b, _p0, x->bit_size);
nir_ssa_def *p1 = nir_imm_floatN_t(b, _p1, x->bit_size);
nir_ssa_def *one = nir_imm_floatN_t(b, 1.0f, x->bit_size);
@@ -210,7 +222,8 @@ build_asin(nir_builder *b, nir_ssa_def *x, float _p0, float _p1)
nir_ssa_def *m_pi_4_minus_one =
nir_imm_floatN_t(b, M_PI_4f - 1.0f, x->bit_size);
nir_ssa_def *abs_x = nir_fabs(b, x);
- return nir_fmul(b, nir_fsign(b, x),
+ nir_ssa_def *result =
+ nir_fmul(b, nir_fsign(b, x),
nir_fsub(b, m_pi_2,
nir_fmul(b, nir_fsqrt(b, nir_fsub(b, one, abs_x)),
nir_fadd(b, m_pi_2,
@@ -220,6 +233,10 @@ build_asin(nir_builder *b, nir_ssa_def *x, float _p0, float _p1)
nir_fadd(b, p0,
nir_fmul(b, abs_x,
p1)))))))));
+ if (_x->bit_size == 16)
+ result = nir_f2f16(b, result);
+
+ return result;
}

/**
--
2.17.1
Jason Ekstrand
2018-12-07 15:26:14 UTC
Permalink
Post by Iago Toral Quiroga
The 16-bit polynomial execution doesn't meet Khronos precision
requirements.
Also, the half-float denorm range starts at 2^(-14) and with asin taking input
values in the range [0, 1], polynomial approximations can lead to flushing
relatively easy.
An alternative is to use the atan2 formula to compute asin, which is the
reference taken by Khronos to determine precision requirements, but that
ends up generating too many additional instructions when compared to the
polynomial approximation. Specifically, for the Intel case, doing this
adds +41 instructions to the program for each asin/acos call, which looks
like an undesirable trade off.
So for now we take the easy way out and fallback to using the 32-bit
polynomial approximation, which is better (faster) than the 16-bit atan2
implementation and gives us better precision that matches Khronos
requirements.
---
src/compiler/spirv/vtn_glsl450.c | 21 +++++++++++++++++++--
1 file changed, 19 insertions(+), 2 deletions(-)
diff --git a/src/compiler/spirv/vtn_glsl450.c
b/src/compiler/spirv/vtn_glsl450.c
index bb340c87416..64a1431ae14 100644
--- a/src/compiler/spirv/vtn_glsl450.c
+++ b/src/compiler/spirv/vtn_glsl450.c
@@ -201,8 +201,20 @@ build_log(nir_builder *b, nir_ssa_def *x)
* in each case.
*/
static nir_ssa_def *
-build_asin(nir_builder *b, nir_ssa_def *x, float _p0, float _p1)
+build_asin(nir_builder *b, nir_ssa_def *_x, float _p0, float _p1)
{
+ /* The polynomial approximation isn't precise enough to meet half-float
+ * precision requirements. Alternatively, we could implement this using
This isn't surprising. It's possible we could restructure the
floating-point calculation to be more stable but just doing 32-bit seems
reasonable.
Post by Iago Toral Quiroga
+ *
+ * asin(x) = atan2(x, sqrt(1 - x*x))
+ *
+ * But that is very expensive, so instead we just do the polynomial
+ * approximation in 32-bit math and then we convert the result back to
+ * 16-bit.
+ */
+ nir_ssa_def *x = _x->bit_size == 16 ? nir_f2f32(b, _x) : _x;
Mind restructuring this as follows?

if (x->bit_size == 16) {
/* Comment goes here */
return f2f16(b, build_asin(b, f2f32(b, x), p0, p1));
}

I find a bit of recursion easier to read than having two bits at the
beginning and end.
Post by Iago Toral Quiroga
+
nir_ssa_def *p0 = nir_imm_floatN_t(b, _p0, x->bit_size);
nir_ssa_def *p1 = nir_imm_floatN_t(b, _p1, x->bit_size);
nir_ssa_def *one = nir_imm_floatN_t(b, 1.0f, x->bit_size);
@@ -210,7 +222,8 @@ build_asin(nir_builder *b, nir_ssa_def *x, float _p0, float _p1)
nir_ssa_def *m_pi_4_minus_one =
nir_imm_floatN_t(b, M_PI_4f - 1.0f, x->bit_size);
nir_ssa_def *abs_x = nir_fabs(b, x);
- return nir_fmul(b, nir_fsign(b, x),
+ nir_ssa_def *result =
+ nir_fmul(b, nir_fsign(b, x),
nir_fsub(b, m_pi_2,
nir_fmul(b, nir_fsqrt(b, nir_fsub(b, one, abs_x)),
nir_fadd(b, m_pi_2,
@@ -220,6 +233,10 @@ build_asin(nir_builder *b, nir_ssa_def *x, float _p0, float _p1)
nir_fadd(b, p0,
nir_fmul(b, abs_x,
p1)))))))));
+ if (_x->bit_size == 16)
+ result = nir_f2f16(b, result);
+
+ return result;
}
/**
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral Quiroga
2018-12-04 07:16:42 UTC
Permalink
Reviewed-by: Jason Ekstrand <***@jlekstrand.net>
---
src/compiler/nir/nir.h | 1 +
src/compiler/nir/nir_opt_algebraic.py | 1 +
2 files changed, 2 insertions(+)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index db935c8496b..25ed64aa73c 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2080,6 +2080,7 @@ typedef struct nir_shader_compiler_options {
bool lower_fpow;
bool lower_fsat;
bool lower_fsqrt;
+ bool lower_fmod16;
bool lower_fmod32;
bool lower_fmod64;
/** Lowers ibitfield_extract/ubitfield_extract to ibfe/ubfe. */
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index f2a7be0c403..4977a383c3b 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -621,6 +621,7 @@ optimizations = [
(('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),

# Misc. lowering
+ (('***@16', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod16'),
(('***@32', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod32'),
(('***@64', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod64'),
(('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod32'),
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:16:43 UTC
Permalink
Reviewed-by: Jason Ekstrand <***@jlekstrand.net>
---
src/intel/compiler/brw_compiler.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c
index fe632c5badc..f885e79c3e6 100644
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@@ -33,6 +33,7 @@
.lower_sub = true, \
.lower_fdiv = true, \
.lower_scmp = true, \
+ .lower_fmod16 = true, \
.lower_fmod32 = true, \
.lower_fmod64 = false, \
.lower_bitfield_extract = true, \
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:16:44 UTC
Permalink
Reviewed-by: Jason Ekstrand <***@jlekstrand.net>
---
src/compiler/nir/nir.h | 1 +
src/compiler/nir/nir_opt_algebraic.py | 1 +
2 files changed, 2 insertions(+)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 25ed64aa73c..65a1f60c3c6 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2074,6 +2074,7 @@ typedef struct nir_shader_compiler_options {
bool lower_fdiv;
bool lower_ffma;
bool fuse_ffma;
+ bool lower_flrp16;
bool lower_flrp32;
/** Lowers flrp when it does not support doubles */
bool lower_flrp64;
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 4977a383c3b..6c3b77c9b6e 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -121,6 +121,7 @@ optimizations = [
(('~flrp', 0.0, a, b), ('fmul', a, b)),
(('~flrp', a, b, ('b2f', c)), ('bcsel', c, b, a), 'options->lower_flrp32'),
(('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),
+ (('***@16', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp16'),
(('***@32', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp32'),
(('***@64', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp64'),
(('ffloor', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:16:56 UTC
Permalink
In SIMD8 we pack 2 vector components in a single SIMD register, so
for example, component Y of a 16-bit vec2 starts at byte offset
16B. This means that when we compute the offset of the elements to
be differentiated we should not stomp whatever base offset we have,
but instead add to it.
---
src/intel/compiler/brw_fs_generator.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index bffd9bc4787..d8e4bae17e0 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1259,7 +1259,7 @@ fs_generator::generate_ddx(const fs_inst *inst,
struct brw_reg src0 = src;
struct brw_reg src1 = src;

- src0.subnr = type_sz(src.type);
+ src0.subnr += type_sz(src.type);
src0.vstride = vstride;
src0.width = width;
src0.hstride = BRW_HORIZONTAL_STRIDE_0;
@@ -1325,8 +1325,8 @@ fs_generator::generate_ddy(const fs_inst *inst,
/* replicate the derivative at the top-left pixel to other pixels */
struct brw_reg src0 = stride(src, 4, 4, 0);
struct brw_reg src1 = stride(src, 4, 4, 0);
- src0.subnr = 0 * type_size;
- src1.subnr = 2 * type_size;
+ src0.subnr += 0 * type_size;
+ src1.subnr += 2 * type_size;

brw_ADD(p, dst, negate(src0), src1);
}
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:16:38 UTC
Permalink
The hardware doesn't support half-float for these.
---
src/intel/compiler/brw_nir.c | 5 +++++
1 file changed, 5 insertions(+)

diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index aa6788b9fe5..e0027f5179c 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -620,6 +620,11 @@ lower_bit_size_callback(const nir_alu_instr *alu, UNUSED void *data)
case nir_op_irem:
case nir_op_udiv:
case nir_op_umod:
+ case nir_op_fceil:
+ case nir_op_ffloor:
+ case nir_op_ffract:
+ case nir_op_fround_even:
+ case nir_op_ftrunc:
return 32;
default:
return 0;
--
2.17.1
Pohjolainen, Topi
2018-12-05 11:27:38 UTC
Permalink
Post by Iago Toral Quiroga
The hardware doesn't support half-float for these.
---
src/intel/compiler/brw_nir.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index aa6788b9fe5..e0027f5179c 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -620,6 +620,11 @@ lower_bit_size_callback(const nir_alu_instr *alu, UNUSED void *data)
return 32;
return 0;
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Jason Ekstrand
2018-12-07 17:13:25 UTC
Permalink
Post by Iago Toral Quiroga
The hardware doesn't support half-float for these.
---
src/intel/compiler/brw_nir.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index aa6788b9fe5..e0027f5179c 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -620,6 +620,11 @@ lower_bit_size_callback(const nir_alu_instr *alu, UNUSED void *data)
return 32;
return 0;
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral Quiroga
2018-12-04 07:16:52 UTC
Permalink
Source0 and Destination extract the floating-point precision automatically
from the SrcType and DstType instruction fields respectively when they are
set to types :F or :HF. For Source1 and Source2 operands, we use the new
1-bit fields Src1Type and Src2Type, where 0 means normal precision and 1
means half-precision. Since we always use the type of the destination for
all operands when we emit 3-source instructions, we only need to set Src1Type
and Src2Type to 1 when we are emitting a half-precision instruction.
---
src/intel/compiler/brw_eu_emit.c | 5 +++++
1 file changed, 5 insertions(+)

diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 2c9fc9a5c7c..66edfb43baf 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -801,6 +801,11 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
*/
brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
+
+ if (devinfo->gen >= 8 && dest.type == BRW_REGISTER_TYPE_HF) {
+ brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);
+ brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
+ }
}
}
--
2.17.1
Pohjolainen, Topi
2018-12-05 12:58:24 UTC
Permalink
Post by Iago Toral Quiroga
Source0 and Destination extract the floating-point precision automatically
from the SrcType and DstType instruction fields respectively when they are
set to types :F or :HF. For Source1 and Source2 operands, we use the new
1-bit fields Src1Type and Src2Type, where 0 means normal precision and 1
means half-precision. Since we always use the type of the destination for
all operands when we emit 3-source instructions, we only need set Src1Type
and Src2Type to 1 when we are emitting a half-precision instruction.
---
src/intel/compiler/brw_eu_emit.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 2c9fc9a5c7c..66edfb43baf 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -801,6 +801,11 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
*/
brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
+
+ if (devinfo->gen >= 8 && dest.type == BRW_REGISTER_TYPE_HF) {
+ brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);
+ brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
+ }
I had similar patch which prepares for mixed mode (useful for linterp with
32-bit input varyings):

/* From the Bspec: Instruction types
*
* Three source instructions can use operands with mixed-mode
* precision. When SrcType field is set to :f or :hf it defines
* precision for source 0 only, and fields Src1Type and Src2Type
* define precision for other source operands:
*
* 0b = :f. Single precision Float (32-bit).
* 1b = :hf. Half precision Float (16-bit).
*/
if (src1.type == BRW_REGISTER_TYPE_HF)
brw_inst_set_3src_src1_type(devinfo, inst, 1);

if (src2.type == BRW_REGISTER_TYPE_HF)
brw_inst_set_3src_src2_type(devinfo, inst, 1);

How would you feel about that? (Direct cut-paste and the helpers have
different names).
Post by Iago Toral Quiroga
}
}
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral
2018-12-05 13:04:16 UTC
Permalink
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
Source0 and Destination extract the floating-point precision
automatically
from the SrcType and DstType instruction fields respectively when they are
set to types :F or :HF. For Source1 and Source2 operands, we use the new
1-bit fields Src1Type and Src2Type, where 0 means normal precision and 1
means half-precision. Since we always use the type of the
destination for
all operands when we emit 3-source instructions, we only need set Src1Type
and Src2Type to 1 when we are emitting a half-precision
instruction.
---
src/intel/compiler/brw_eu_emit.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/intel/compiler/brw_eu_emit.c
b/src/intel/compiler/brw_eu_emit.c
index 2c9fc9a5c7c..66edfb43baf 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -801,6 +801,11 @@ brw_alu3(struct brw_codegen *p, unsigned
opcode, struct brw_reg dest,
*/
brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
+
+ if (devinfo->gen >= 8 && dest.type ==
BRW_REGISTER_TYPE_HF) {
+ brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);
+ brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
+ }
I had similar patch which prepares for mixed mode (useful for linterp with
/* From the Bspec: Instruction types
*
* Three source instructions can use operands with mixed-
mode
* precision. When SrcType field is set to :f or :hf it defines
* precision for source 0 only, and fields Src1Type and Src2Type
*
* 0b = :f. Single precision Float (32-bit).
* 1b = :hf. Half precision Float (16-bit).
*/
if (src1.type == BRW_REGISTER_TYPE_HF)
brw_inst_set_3src_src1_type(devinfo, inst, 1);
if (src2.type == BRW_REGISTER_TYPE_HF)
brw_inst_set_3src_src2_type(devinfo, inst, 1);
How would you feel about that? (Direct cut-paste and the helpers have
different name).
Sure, if we are planning to use mixed mode in the future this makes
more sense. Thanks!
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
}
}
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Pohjolainen, Topi
2018-12-05 13:14:08 UTC
Permalink
Post by Iago Toral
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
Source0 and Destination extract the floating-point precision automatically
from the SrcType and DstType instruction fields respectively when they are
set to types :F or :HF. For Source1 and Source2 operands, we use the new
1-bit fields Src1Type and Src2Type, where 0 means normal precision and 1
means half-precision. Since we always use the type of the
destination for
all operands when we emit 3-source instructions, we only need set Src1Type
and Src2Type to 1 when we are emitting a half-precision
instruction.
---
src/intel/compiler/brw_eu_emit.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/intel/compiler/brw_eu_emit.c
b/src/intel/compiler/brw_eu_emit.c
index 2c9fc9a5c7c..66edfb43baf 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -801,6 +801,11 @@ brw_alu3(struct brw_codegen *p, unsigned
opcode, struct brw_reg dest,
*/
brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
+
+ if (devinfo->gen >= 8 && dest.type ==
BRW_REGISTER_TYPE_HF) {
+ brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);
+ brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
+ }
I had similar patch which prepares for mixed mode (useful for linterp with
/* From the Bspec: Instruction types
*
* Three source instructions can use operands with mixed-
mode
* precision. When SrcType field is set to :f or :hf it defines
* precision for source 0 only, and fields Src1Type and Src2Type
*
* 0b = :f. Single precision Float (32-bit).
* 1b = :hf. Half precision Float (16-bit).
*/
if (src1.type == BRW_REGISTER_TYPE_HF)
brw_inst_set_3src_src1_type(devinfo, inst, 1);
if (src2.type == BRW_REGISTER_TYPE_HF)
brw_inst_set_3src_src2_type(devinfo, inst, 1);
How would you feel about that? (Direct cut-paste and the helpers have
different name).
Sure, if we are planning to use mixed mode in the future this makes
more sense. Thanks!
Nice!

Reviewed-by: Topi Pohjolainen <***@intel.com>
Iago Toral Quiroga
2018-12-04 07:16:57 UTC
Permalink
The implementation of these opcodes in the generator assumes that their
arguments are packed, and it generates register regions based on that
assumption. While this expectation is reasonable for 32-bit, when we
load 16-bit elements from UBOs we get them with a stride of 2 that we
then need to pack with a stride of 1. Copy propagation can see through this
and rewrite ddx/ddy operands to use the original, strided register, breaking
the implementation in the generator.
---
.../compiler/brw_fs_copy_propagation.cpp | 21 +++++++++++++++++++
1 file changed, 21 insertions(+)

diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp
index 58d5080b4e9..c01d4ec4a4f 100644
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@@ -361,6 +361,20 @@ can_take_stride(fs_inst *inst, unsigned arg, unsigned stride,
return true;
}

+static bool
+instruction_requires_packed_data(fs_inst *inst)
+{
+ switch (inst->opcode) {
+ case FS_OPCODE_DDX_FINE:
+ case FS_OPCODE_DDX_COARSE:
+ case FS_OPCODE_DDY_FINE:
+ case FS_OPCODE_DDY_COARSE:
+ return true;
+ default:
+ return false;
+ }
+}
+
bool
fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
{
@@ -407,6 +421,13 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE)
return false;

+ /* Some instructions implemented in the generator backend, such as
+ * derivatives, assume that their operands are packed so we can't
+ * generally propagate strided regions to them.
+ */
+ if (instruction_requires_packed_data(inst) && entry->src.stride > 1)
+ return false;
+
/* Bail if the result of composing both strides would exceed the
* hardware limit.
*/
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:17:00 UTC
Permalink
---
src/compiler/shader_info.h | 1 +
src/compiler/spirv/spirv_to_nir.c | 4 +++-
2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/compiler/shader_info.h b/src/compiler/shader_info.h
index 65bc0588d67..0a3cb37069c 100644
--- a/src/compiler/shader_info.h
+++ b/src/compiler/shader_info.h
@@ -45,6 +45,7 @@ struct spirv_supported_capabilities {
bool variable_pointers;
bool storage_16bit;
bool int16;
+ bool float16;
bool shader_viewport_index_layer;
bool subgroup_arithmetic;
bool subgroup_ballot;
diff --git a/src/compiler/spirv/spirv_to_nir.c b/src/compiler/spirv/spirv_to_nir.c
index a05c4d236ca..6f6673c8fb1 100644
--- a/src/compiler/spirv/spirv_to_nir.c
+++ b/src/compiler/spirv/spirv_to_nir.c
@@ -3415,7 +3415,6 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode,
case SpvCapabilityLinkage:
case SpvCapabilityVector16:
case SpvCapabilityFloat16Buffer:
- case SpvCapabilityFloat16:
case SpvCapabilityInt64Atomics:
case SpvCapabilityStorageImageMultisample:
case SpvCapabilityInt8:
@@ -3432,6 +3431,9 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode,
case SpvCapabilityFloat64:
spv_check_supported(float64, cap);
break;
+ case SpvCapabilityFloat16:
+ spv_check_supported(float16, cap);
+ break;
case SpvCapabilityInt64:
spv_check_supported(int64, cap);
break;
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:16:40 UTC
Permalink
---
src/intel/compiler/brw_fs_nir.cpp | 27 +++++++++++++++++++++------
1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 3eba8a478f5..559b55a0f84 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -938,14 +938,29 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
* Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
* zero.
*/
- bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
+ fs_reg zero, one_mask, sign_mask;
+ brw_reg_type reg_type;
+ if (type_sz(op[0].type) == 4) {
+ zero = brw_imm_f(0.0f);
+ one_mask = brw_imm_ud(0x3f800000);
+ sign_mask = brw_imm_ud(0x80000000);
+ reg_type = BRW_REGISTER_TYPE_UD;
+ } else {
+ assert(type_sz(op[0].type) == 2);
+ zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF);
+ one_mask = brw_imm_uw(0x3c00);
+ sign_mask = brw_imm_uw(0x8000);
+ reg_type = BRW_REGISTER_TYPE_UW;
+ }
+
+ bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ);

- fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
- op[0].type = BRW_REGISTER_TYPE_UD;
- result.type = BRW_REGISTER_TYPE_UD;
- bld.AND(result_int, op[0], brw_imm_ud(0x80000000u));
+ fs_reg result_int = retype(result, reg_type);
+ op[0].type = reg_type;
+ result.type = reg_type;
+ bld.AND(result_int, op[0], sign_mask);

- inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
+ inst = bld.OR(result_int, result_int, one_mask);
inst->predicate = BRW_PREDICATE_NORMAL;
} else {
/* For doubles we do the same but we need to consider:
--
2.17.1
Pohjolainen, Topi
2018-12-04 17:19:25 UTC
Permalink
Post by Iago Toral Quiroga
---
src/intel/compiler/brw_fs_nir.cpp | 27 +++++++++++++++++++++------
1 file changed, 21 insertions(+), 6 deletions(-)
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 3eba8a478f5..559b55a0f84 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -938,14 +938,29 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
* Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
* zero.
*/
- bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
+ fs_reg zero, one_mask, sign_mask;
+ brw_reg_type reg_type;
+ if (type_sz(op[0].type) == 4) {
+ zero = brw_imm_f(0.0f);
+ one_mask = brw_imm_ud(0x3f800000);
+ sign_mask = brw_imm_ud(0x80000000);
+ reg_type = BRW_REGISTER_TYPE_UD;
+ } else {
+ assert(type_sz(op[0].type) == 2);
+ zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF);
+ one_mask = brw_imm_uw(0x3c00);
+ sign_mask = brw_imm_uw(0x8000);
+ reg_type = BRW_REGISTER_TYPE_UW;
+ }
+
+ bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ);
- fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
- op[0].type = BRW_REGISTER_TYPE_UD;
- result.type = BRW_REGISTER_TYPE_UD;
- bld.AND(result_int, op[0], brw_imm_ud(0x80000000u));
+ fs_reg result_int = retype(result, reg_type);
+ op[0].type = reg_type;
+ result.type = reg_type;
+ bld.AND(result_int, op[0], sign_mask);
- inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
+ inst = bld.OR(result_int, result_int, one_mask);
inst->predicate = BRW_PREDICATE_NORMAL;
} else {
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral Quiroga
2018-12-04 07:17:02 UTC
Permalink
---
include/vulkan/vulkan_core.h | 109 +++++++++++++++++++++++++++--
src/vulkan/registry/vk.xml | 130 +++++++++++++++++++++++++++--------
2 files changed, 204 insertions(+), 35 deletions(-)

diff --git a/include/vulkan/vulkan_core.h b/include/vulkan/vulkan_core.h
index 35c06649aa5..50064109ef3 100644
--- a/include/vulkan/vulkan_core.h
+++ b/include/vulkan/vulkan_core.h
@@ -43,7 +43,7 @@ extern "C" {
#define VK_VERSION_MINOR(version) (((uint32_t)(version) >> 12) & 0x3ff)
#define VK_VERSION_PATCH(version) ((uint32_t)(version) & 0xfff)
// Version of this file
-#define VK_HEADER_VERSION 93
+#define VK_HEADER_VERSION 95


#define VK_NULL_HANDLE 0
@@ -327,6 +327,7 @@ typedef enum VkStructureType {
VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT = 1000081000,
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONDITIONAL_RENDERING_FEATURES_EXT = 1000081001,
VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT = 1000081002,
+ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR = 1000082000,
VK_STRUCTURE_TYPE_PRESENT_REGIONS_KHR = 1000084000,
VK_STRUCTURE_TYPE_OBJECT_TABLE_CREATE_INFO_NVX = 1000086000,
VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_CREATE_INFO_NVX = 1000086001,
@@ -442,6 +443,7 @@ typedef enum VkStructureType {
VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT = 1000190001,
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_FEATURES_EXT = 1000190002,
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES_KHR = 1000196000,
+ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES_KHR = 1000197000,
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COMPUTE_SHADER_DERIVATIVES_FEATURES_NV = 1000201000,
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MESH_SHADER_FEATURES_NV = 1000202000,
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MESH_SHADER_PROPERTIES_NV = 1000202001,
@@ -454,6 +456,9 @@ typedef enum VkStructureType {
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_MEMORY_MODEL_FEATURES_KHR = 1000211000,
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PCI_BUS_INFO_PROPERTIES_EXT = 1000212000,
VK_STRUCTURE_TYPE_IMAGEPIPE_SURFACE_CREATE_INFO_FUCHSIA = 1000214000,
+ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_DENSITY_MAP_FEATURES_EXT = 1000218000,
+ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_DENSITY_MAP_PROPERTIES_EXT = 1000218001,
+ VK_STRUCTURE_TYPE_RENDER_PASS_FRAGMENT_DENSITY_MAP_CREATE_INFO_EXT = 1000218002,
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SCALAR_BLOCK_LAYOUT_FEATURES_EXT = 1000221000,
VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO_EXT = 1000246000,
VK_STRUCTURE_TYPE_DEBUG_REPORT_CREATE_INFO_EXT = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT,
@@ -879,6 +884,7 @@ typedef enum VkImageLayout {
VK_IMAGE_LAYOUT_PRESENT_SRC_KHR = 1000001002,
VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR = 1000111000,
VK_IMAGE_LAYOUT_SHADING_RATE_OPTIMAL_NV = 1000164003,
+ VK_IMAGE_LAYOUT_FRAGMENT_DENSITY_MAP_OPTIMAL_EXT = 1000218000,
VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL_KHR = VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL,
VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_OPTIMAL_KHR = VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_OPTIMAL,
VK_IMAGE_LAYOUT_BEGIN_RANGE = VK_IMAGE_LAYOUT_UNDEFINED,
@@ -1326,6 +1332,7 @@ typedef enum VkFormatFeatureFlagBits {
VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT = 0x00800000,
VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_CUBIC_BIT_IMG = 0x00002000,
VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT_EXT = 0x00010000,
+ VK_FORMAT_FEATURE_FRAGMENT_DENSITY_MAP_BIT_EXT = 0x01000000,
VK_FORMAT_FEATURE_TRANSFER_SRC_BIT_KHR = VK_FORMAT_FEATURE_TRANSFER_SRC_BIT,
VK_FORMAT_FEATURE_TRANSFER_DST_BIT_KHR = VK_FORMAT_FEATURE_TRANSFER_DST_BIT,
VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT_KHR = VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT,
@@ -1349,6 +1356,7 @@ typedef enum VkImageUsageFlagBits {
VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT = 0x00000040,
VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT = 0x00000080,
VK_IMAGE_USAGE_SHADING_RATE_IMAGE_BIT_NV = 0x00000100,
+ VK_IMAGE_USAGE_FRAGMENT_DENSITY_MAP_BIT_EXT = 0x00000200,
VK_IMAGE_USAGE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
} VkImageUsageFlagBits;
typedef VkFlags VkImageUsageFlags;
@@ -1368,6 +1376,7 @@ typedef enum VkImageCreateFlagBits {
VK_IMAGE_CREATE_DISJOINT_BIT = 0x00000200,
VK_IMAGE_CREATE_CORNER_SAMPLED_BIT_NV = 0x00002000,
VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT = 0x00001000,
+ VK_IMAGE_CREATE_SUBSAMPLED_BIT_EXT = 0x00004000,
VK_IMAGE_CREATE_SPLIT_INSTANCE_BIND_REGIONS_BIT_KHR = VK_IMAGE_CREATE_SPLIT_INSTANCE_BIND_REGIONS_BIT,
VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT_KHR = VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT,
VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT_KHR = VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT,
@@ -1452,6 +1461,7 @@ typedef enum VkPipelineStageFlagBits {
VK_PIPELINE_STAGE_ACCELERATION_STRUCTURE_BUILD_BIT_NV = 0x02000000,
VK_PIPELINE_STAGE_TASK_SHADER_BIT_NV = 0x00080000,
VK_PIPELINE_STAGE_MESH_SHADER_BIT_NV = 0x00100000,
+ VK_PIPELINE_STAGE_FRAGMENT_DENSITY_PROCESS_BIT_EXT = 0x00800000,
VK_PIPELINE_STAGE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
} VkPipelineStageFlagBits;
typedef VkFlags VkPipelineStageFlags;
@@ -1551,6 +1561,11 @@ typedef enum VkBufferUsageFlagBits {
} VkBufferUsageFlagBits;
typedef VkFlags VkBufferUsageFlags;
typedef VkFlags VkBufferViewCreateFlags;
+
+typedef enum VkImageViewCreateFlagBits {
+ VK_IMAGE_VIEW_CREATE_FRAGMENT_DENSITY_MAP_DYNAMIC_BIT_EXT = 0x00000001,
+ VK_IMAGE_VIEW_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
+} VkImageViewCreateFlagBits;
typedef VkFlags VkImageViewCreateFlags;
typedef VkFlags VkShaderModuleCreateFlags;
typedef VkFlags VkPipelineCacheCreateFlags;
@@ -1617,6 +1632,12 @@ typedef VkFlags VkColorComponentFlags;
typedef VkFlags VkPipelineDynamicStateCreateFlags;
typedef VkFlags VkPipelineLayoutCreateFlags;
typedef VkFlags VkShaderStageFlags;
+
+typedef enum VkSamplerCreateFlagBits {
+ VK_SAMPLER_CREATE_SUBSAMPLED_BIT_EXT = 0x00000001,
+ VK_SAMPLER_CREATE_SUBSAMPLED_COARSE_RECONSTRUCTION_BIT_EXT = 0x00000002,
+ VK_SAMPLER_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
+} VkSamplerCreateFlagBits;
typedef VkFlags VkSamplerCreateFlags;

typedef enum VkDescriptorSetLayoutCreateFlagBits {
@@ -1677,6 +1698,7 @@ typedef enum VkAccessFlagBits {
VK_ACCESS_SHADING_RATE_IMAGE_READ_BIT_NV = 0x00800000,
VK_ACCESS_ACCELERATION_STRUCTURE_READ_BIT_NV = 0x00200000,
VK_ACCESS_ACCELERATION_STRUCTURE_WRITE_BIT_NV = 0x00400000,
+ VK_ACCESS_FRAGMENT_DENSITY_MAP_READ_BIT_EXT = 0x01000000,
VK_ACCESS_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
} VkAccessFlagBits;
typedef VkFlags VkAccessFlags;
@@ -4357,7 +4379,7 @@ typedef struct VkDescriptorUpdateTemplateEntry {

typedef struct VkDescriptorUpdateTemplateCreateInfo {
VkStructureType sType;
- void* pNext;
+ const void* pNext;
VkDescriptorUpdateTemplateCreateFlags flags;
uint32_t descriptorUpdateEntryCount;
const VkDescriptorUpdateTemplateEntry* pDescriptorUpdateEntries;
@@ -4796,6 +4818,7 @@ VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkSwapchainKHR)
typedef enum VkSwapchainCreateFlagBitsKHR {
VK_SWAPCHAIN_CREATE_SPLIT_INSTANCE_BIND_REGIONS_BIT_KHR = 0x00000001,
VK_SWAPCHAIN_CREATE_PROTECTED_BIT_KHR = 0x00000002,
+ VK_SWAPCHAIN_CREATE_MUTABLE_FORMAT_BIT_KHR = 0x00000004,
VK_SWAPCHAIN_CREATE_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF
} VkSwapchainCreateFlagBitsKHR;
typedef VkFlags VkSwapchainCreateFlagsKHR;
@@ -5472,6 +5495,19 @@ VKAPI_ATTR void VKAPI_CALL vkCmdPushDescriptorSetWithTemplateKHR(
const void* pData);
#endif

+#define VK_KHR_shader_float16_int8 1
+#define VK_KHR_SHADER_FLOAT16_INT8_SPEC_VERSION 1
+#define VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME "VK_KHR_shader_float16_int8"
+
+typedef struct VkPhysicalDeviceFloat16Int8FeaturesKHR {
+ VkStructureType sType;
+ void* pNext;
+ VkBool32 shaderFloat16;
+ VkBool32 shaderInt8;
+} VkPhysicalDeviceFloat16Int8FeaturesKHR;
+
+
+
#define VK_KHR_16bit_storage 1
#define VK_KHR_16BIT_STORAGE_SPEC_VERSION 1
#define VK_KHR_16BIT_STORAGE_EXTENSION_NAME "VK_KHR_16bit_storage"
@@ -6128,6 +6164,39 @@ typedef struct VkPhysicalDeviceDriverPropertiesKHR {



+#define VK_KHR_shader_float_controls 1
+#define VK_KHR_SHADER_FLOAT_CONTROLS_SPEC_VERSION 1
+#define VK_KHR_SHADER_FLOAT_CONTROLS_EXTENSION_NAME "VK_KHR_shader_float_controls"
+
+typedef struct VkPhysicalDeviceFloatControlsPropertiesKHR {
+ VkStructureType sType;
+ void* pNext;
+ VkBool32 separateDenormSettings;
+ VkBool32 separateRoundingModeSettings;
+ VkBool32 shaderSignedZeroInfNanPreserveFloat16;
+ VkBool32 shaderSignedZeroInfNanPreserveFloat32;
+ VkBool32 shaderSignedZeroInfNanPreserveFloat64;
+ VkBool32 shaderDenormPreserveFloat16;
+ VkBool32 shaderDenormPreserveFloat32;
+ VkBool32 shaderDenormPreserveFloat64;
+ VkBool32 shaderDenormFlushToZeroFloat16;
+ VkBool32 shaderDenormFlushToZeroFloat32;
+ VkBool32 shaderDenormFlushToZeroFloat64;
+ VkBool32 shaderRoundingModeRTEFloat16;
+ VkBool32 shaderRoundingModeRTEFloat32;
+ VkBool32 shaderRoundingModeRTEFloat64;
+ VkBool32 shaderRoundingModeRTZFloat16;
+ VkBool32 shaderRoundingModeRTZFloat32;
+ VkBool32 shaderRoundingModeRTZFloat64;
+} VkPhysicalDeviceFloatControlsPropertiesKHR;
+
+
+
+#define VK_KHR_swapchain_mutable_format 1
+#define VK_KHR_SWAPCHAIN_MUTABLE_FORMAT_SPEC_VERSION 1
+#define VK_KHR_SWAPCHAIN_MUTABLE_FORMAT_EXTENSION_NAME "VK_KHR_swapchain_mutable_format"
+
+
#define VK_KHR_vulkan_memory_model 1
#define VK_KHR_VULKAN_MEMORY_MODEL_SPEC_VERSION 2
#define VK_KHR_VULKAN_MEMORY_MODEL_EXTENSION_NAME "VK_KHR_vulkan_memory_model"
@@ -7449,11 +7518,11 @@ typedef struct VkDebugUtilsMessengerCallbackDataEXT {
int32_t messageIdNumber;
const char* pMessage;
uint32_t queueLabelCount;
- VkDebugUtilsLabelEXT* pQueueLabels;
+ const VkDebugUtilsLabelEXT* pQueueLabels;
uint32_t cmdBufLabelCount;
- VkDebugUtilsLabelEXT* pCmdBufLabels;
+ const VkDebugUtilsLabelEXT* pCmdBufLabels;
uint32_t objectCount;
- VkDebugUtilsObjectNameInfoEXT* pObjects;
+ const VkDebugUtilsObjectNameInfoEXT* pObjects;
} VkDebugUtilsMessengerCallbackDataEXT;

typedef VkBool32 (VKAPI_PTR *PFN_vkDebugUtilsMessengerCallbackEXT)(
@@ -8117,7 +8186,7 @@ VKAPI_ATTR void VKAPI_CALL vkCmdSetCoarseSampleOrderNV(
#define VK_NV_ray_tracing 1
VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkAccelerationStructureNV)

-#define VK_NV_RAY_TRACING_SPEC_VERSION 2
+#define VK_NV_RAY_TRACING_SPEC_VERSION 3
#define VK_NV_RAY_TRACING_EXTENSION_NAME "VK_NV_ray_tracing"
#define VK_SHADER_UNUSED_NV (~0U)

@@ -8807,6 +8876,34 @@ typedef struct VkPhysicalDevicePCIBusInfoPropertiesEXT {



+#define VK_EXT_fragment_density_map 1
+#define VK_EXT_FRAGMENT_DENSITY_MAP_SPEC_VERSION 1
+#define VK_EXT_FRAGMENT_DENSITY_MAP_EXTENSION_NAME "VK_EXT_fragment_density_map"
+
+typedef struct VkPhysicalDeviceFragmentDensityMapFeaturesEXT {
+ VkStructureType sType;
+ void* pNext;
+ VkBool32 fragmentDensityMap;
+ VkBool32 fragmentDensityMapDynamic;
+ VkBool32 fragmentDensityMapNonSubsampledImages;
+} VkPhysicalDeviceFragmentDensityMapFeaturesEXT;
+
+typedef struct VkPhysicalDeviceFragmentDensityMapPropertiesEXT {
+ VkStructureType sType;
+ void* pNext;
+ VkExtent2D minFragmentDensityTexelSize;
+ VkExtent2D maxFragmentDensityTexelSize;
+ VkBool32 fragmentDensityInvocations;
+} VkPhysicalDeviceFragmentDensityMapPropertiesEXT;
+
+typedef struct VkRenderPassFragmentDensityMapCreateInfoEXT {
+ VkStructureType sType;
+ const void* pNext;
+ VkAttachmentReference fragmentDensityMapAttachment;
+} VkRenderPassFragmentDensityMapCreateInfoEXT;
+
+
+
#define VK_EXT_scalar_block_layout 1
#define VK_EXT_SCALAR_BLOCK_LAYOUT_SPEC_VERSION 1
#define VK_EXT_SCALAR_BLOCK_LAYOUT_EXTENSION_NAME "VK_EXT_scalar_block_layout"
diff --git a/src/vulkan/registry/vk.xml b/src/vulkan/registry/vk.xml
index 6cfa256d3b3..2c9b79976bc 100644
--- a/src/vulkan/registry/vk.xml
+++ b/src/vulkan/registry/vk.xml
@@ -146,7 +146,7 @@ server.
<type category="define">// Vulkan 1.1 version number
#define <name>VK_API_VERSION_1_1</name> <type>VK_MAKE_VERSION</type>(1, 1, 0)// Patch version should always be set to 0</type>
<type category="define">// Version of this file
-#define <name>VK_HEADER_VERSION</name> 93</type>
+#define <name>VK_HEADER_VERSION</name> 95</type>

<type category="define">
#define <name>VK_DEFINE_HANDLE</name>(object) typedef struct object##_T* object;</type>
@@ -187,7 +187,7 @@ server.
<type category="bitmask">typedef <type>VkFlags</type> <name>VkFramebufferCreateFlags</name>;</type>
<type category="bitmask">typedef <type>VkFlags</type> <name>VkQueryPoolCreateFlags</name>;</type>
<type category="bitmask">typedef <type>VkFlags</type> <name>VkRenderPassCreateFlags</name>;</type>
- <type category="bitmask">typedef <type>VkFlags</type> <name>VkSamplerCreateFlags</name>;</type>
+ <type requires="VkSamplerCreateFlagBits" category="bitmask">typedef <type>VkFlags</type> <name>VkSamplerCreateFlags</name>;</type>
<type category="bitmask">typedef <type>VkFlags</type> <name>VkPipelineLayoutCreateFlags</name>;</type>
<type category="bitmask">typedef <type>VkFlags</type> <name>VkPipelineCacheCreateFlags</name>;</type>
<type category="bitmask">typedef <type>VkFlags</type> <name>VkPipelineDepthStencilStateCreateFlags</name>;</type>
@@ -214,7 +214,7 @@ server.
<type requires="VkShaderStageFlagBits" category="bitmask">typedef <type>VkFlags</type> <name>VkShaderStageFlags</name>;</type>
<type requires="VkImageUsageFlagBits" category="bitmask">typedef <type>VkFlags</type> <name>VkImageUsageFlags</name>;</type>
<type requires="VkImageCreateFlagBits" category="bitmask">typedef <type>VkFlags</type> <name>VkImageCreateFlags</name>;</type>
- <type category="bitmask">typedef <type>VkFlags</type> <name>VkImageViewCreateFlags</name>;</type>
+ <type requires="VkImageViewCreateFlagBits" category="bitmask">typedef <type>VkFlags</type> <name>VkImageViewCreateFlags</name>;</type>
<type requires="VkPipelineCreateFlagBits" category="bitmask">typedef <type>VkFlags</type> <name>VkPipelineCreateFlags</name>;</type>
<type requires="VkColorComponentFlagBits" category="bitmask">typedef <type>VkFlags</type> <name>VkColorComponentFlags</name>;</type>
<type requires="VkFenceCreateFlagBits" category="bitmask">typedef <type>VkFlags</type> <name>VkFenceCreateFlags</name>;</type>
@@ -406,6 +406,7 @@ server.
<type name="VkImageTiling" category="enum"/>
<type name="VkImageType" category="enum"/>
<type name="VkImageUsageFlagBits" category="enum"/>
+ <type name="VkImageViewCreateFlagBits" category="enum"/>
<type name="VkImageViewType" category="enum"/>
<type name="VkSharingMode" category="enum"/>
<type name="VkIndexType" category="enum"/>
@@ -2441,7 +2442,7 @@ server.
<type category="struct" name="VkDescriptorUpdateTemplateEntryKHR" alias="VkDescriptorUpdateTemplateEntry"/>
<type category="struct" name="VkDescriptorUpdateTemplateCreateInfo">
<member values="VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO"><type>VkStructureType</type> <name>sType</name></member>
- <member><type>void</type>* <name>pNext</name></member>
+ <member>const <type>void</type>* <name>pNext</name></member>
<member optional="true"><type>VkDescriptorUpdateTemplateCreateFlags</type> <name>flags</name></member>
<member><type>uint32_t</type> <name>descriptorUpdateEntryCount</name><comment>Number of descriptor update entries to use for the update template</comment></member>
<member len="descriptorUpdateEntryCount">const <type>VkDescriptorUpdateTemplateEntry</type>* <name>pDescriptorUpdateEntries</name><comment>Descriptor update entries for the template</comment></member>
@@ -2848,7 +2849,7 @@ server.
<member><type>VkBool32</type> <name>dstPremultiplied</name></member>
<member><type>VkBlendOverlapEXT</type> <name>blendOverlap</name></member>
</type>
- <type category="struct" name="VkPhysicalDeviceInlineUniformBlockFeaturesEXT" returnedonly="true" structextends="VkPhysicalDeviceFeatures2,VkDeviceCreateInfo">
+ <type category="struct" name="VkPhysicalDeviceInlineUniformBlockFeaturesEXT" structextends="VkPhysicalDeviceFeatures2,VkDeviceCreateInfo">
<member values="VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_FEATURES_EXT"><type>VkStructureType</type> <name>sType</name></member>
<member><type>void</type>* <name>pNext</name></member>
<member><type>VkBool32</type> <name>inlineUniformBlock</name></member>
@@ -2883,7 +2884,7 @@ server.
<member><type>uint32_t</type> <name>coverageModulationTableCount</name></member>
<member noautovalidity="true" optional="true" len="coverageModulationTableCount">const <type>float</type>* <name>pCoverageModulationTable</name></member>
</type>
- <type category="struct" name="VkImageFormatListCreateInfoKHR" structextends="VkImageCreateInfo,VkPhysicalDeviceImageFormatInfo2">
+ <type category="struct" name="VkImageFormatListCreateInfoKHR" structextends="VkImageCreateInfo,VkSwapchainCreateInfoKHR,VkPhysicalDeviceImageFormatInfo2">
<member values="VK_STRUCTURE_TYPE_IMAGE_FORMAT_LIST_CREATE_INFO_KHR"><type>VkStructureType</type> <name>sType</name></member>
<member>const <type>void</type>* <name>pNext</name></member>
<member optional="true"><type>uint32_t</type> <name>viewFormatCount</name></member>
@@ -2919,6 +2920,33 @@ server.
<member noautovalidity="true"><type>void</type>* <name>pNext</name></member>
<member><type>VkBool32</type> <name>shaderDrawParameters</name></member>
</type>
+ <type category="struct" name="VkPhysicalDeviceFloat16Int8FeaturesKHR" structextends="VkPhysicalDeviceFeatures2,VkDeviceCreateInfo">
+ <member values="VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR"><type>VkStructureType</type> <name>sType</name></member>
+ <member noautovalidity="true"><type>void</type>* <name>pNext</name></member> <!-- Pointer to next structure -->
+ <member><type>VkBool32</type> <name>shaderFloat16</name></member> <!-- 16-bit floats (halfs) in shaders -->
+ <member><type>VkBool32</type> <name>shaderInt8</name></member> <!-- 8-bit integers in shaders -->
+ </type>
+ <type category="struct" name="VkPhysicalDeviceFloatControlsPropertiesKHR" structextends="VkPhysicalDeviceProperties2">
+ <member values="VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES_KHR"><type>VkStructureType</type> <name>sType</name></member>
+ <member><type>void</type>* <name>pNext</name></member>
+ <member><type>VkBool32</type> <name>separateDenormSettings</name></member>
+ <member><type>VkBool32</type> <name>separateRoundingModeSettings</name></member>
+ <member><type>VkBool32</type> <name>shaderSignedZeroInfNanPreserveFloat16</name></member> <!-- An implementation can preserve signed zero, nan, inf -->
+ <member><type>VkBool32</type> <name>shaderSignedZeroInfNanPreserveFloat32</name></member> <!-- An implementation can preserve signed zero, nan, inf -->
+ <member><type>VkBool32</type> <name>shaderSignedZeroInfNanPreserveFloat64</name></member> <!-- An implementation can preserve signed zero, nan, inf -->
+ <member><type>VkBool32</type> <name>shaderDenormPreserveFloat16</name></member> <!-- An implementation can preserve denormals -->
+ <member><type>VkBool32</type> <name>shaderDenormPreserveFloat32</name></member> <!-- An implementation can preserve denormals -->
+ <member><type>VkBool32</type> <name>shaderDenormPreserveFloat64</name></member> <!-- An implementation can preserve denormals -->
+ <member><type>VkBool32</type> <name>shaderDenormFlushToZeroFloat16</name></member> <!-- An implementation can flush to zero denormals -->
+ <member><type>VkBool32</type> <name>shaderDenormFlushToZeroFloat32</name></member> <!-- An implementation can flush to zero denormals -->
+ <member><type>VkBool32</type> <name>shaderDenormFlushToZeroFloat64</name></member> <!-- An implementation can flush to zero denormals -->
+ <member><type>VkBool32</type> <name>shaderRoundingModeRTEFloat16</name></member> <!-- An implementation can support RTE -->
+ <member><type>VkBool32</type> <name>shaderRoundingModeRTEFloat32</name></member> <!-- An implementation can support RTE -->
+ <member><type>VkBool32</type> <name>shaderRoundingModeRTEFloat64</name></member> <!-- An implementation can support RTE -->
+ <member><type>VkBool32</type> <name>shaderRoundingModeRTZFloat16</name></member> <!-- An implementation can support RTZ -->
+ <member><type>VkBool32</type> <name>shaderRoundingModeRTZFloat32</name></member> <!-- An implementation can support RTZ -->
+ <member><type>VkBool32</type> <name>shaderRoundingModeRTZFloat64</name></member> <!-- An implementation can support RTZ -->
+ </type>
<type category="struct" name="VkNativeBufferANDROID">
<member values="VK_STRUCTURE_TYPE_NATIVE_BUFFER_ANDROID"><type>VkStructureType</type> <name>sType</name></member>
<member>const <type>void</type>* <name>pNext</name></member>
@@ -2987,11 +3015,11 @@ server.
<member optional="true"><type>int32_t</type> <name>messageIdNumber</name></member>
<member len="null-terminated">const <type>char</type>* <name>pMessage</name></member>
<member optional="true"><type>uint32_t</type> <name>queueLabelCount</name></member>
- <member noautovalidity="true" optional="true" len="queueLabelCount"><type>VkDebugUtilsLabelEXT</type>* <name>pQueueLabels</name></member>
+ <member len="queueLabelCount">const <type>VkDebugUtilsLabelEXT</type>* <name>pQueueLabels</name></member>
<member optional="true"><type>uint32_t</type> <name>cmdBufLabelCount</name></member>
- <member noautovalidity="true" optional="true" len="cmdBufLabelCount"><type>VkDebugUtilsLabelEXT</type>* <name>pCmdBufLabels</name></member>
- <member><type>uint32_t</type> <name>objectCount</name></member>
- <member noautovalidity="true" len="objectCount"><type>VkDebugUtilsObjectNameInfoEXT</type>* <name>pObjects</name></member>
+ <member len="cmdBufLabelCount">const <type>VkDebugUtilsLabelEXT</type>* <name>pCmdBufLabels</name></member>
+ <member optional="true"><type>uint32_t</type> <name>objectCount</name></member>
+ <member len="objectCount">const <type>VkDebugUtilsObjectNameInfoEXT</type>* <name>pObjects</name></member>
</type>
<type category="struct" name="VkImportMemoryHostPointerInfoEXT" structextends="VkMemoryAllocateInfo">
<member values="VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT"><type>VkStructureType</type> <name>sType</name></member>
@@ -3268,7 +3296,7 @@ server.
<member><type>VkBool32</type> <name>conditionalRendering</name></member>
<member><type>VkBool32</type> <name>inheritedConditionalRendering</name></member>
</type>
- <type category="struct" name="VkPhysicalDeviceVulkanMemoryModelFeaturesKHR" returnedonly="true" structextends="VkPhysicalDeviceFeatures2,VkDeviceCreateInfo">
+ <type category="struct" name="VkPhysicalDeviceVulkanMemoryModelFeaturesKHR" structextends="VkPhysicalDeviceFeatures2,VkDeviceCreateInfo">
<member values="VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_MEMORY_MODEL_FEATURES_KHR"><type>VkStructureType</type> <name>sType</name></member>
<member noautovalidity="true"><type>void</type>* <name>pNext</name></member>
<member><type>VkBool32</type> <name>vulkanMemoryModel</name></member>
@@ -3595,6 +3623,25 @@ server.
<member>const <type>void</type>* <name>pNext</name></member>
<member><type>VkMemoryOverallocationBehaviorAMD</type> <name>overallocationBehavior</name></member>
</type>
+ <type category="struct" name="VkPhysicalDeviceFragmentDensityMapFeaturesEXT" returnedonly="true" structextends="VkPhysicalDeviceFeatures2,VkDeviceCreateInfo">
+ <member values="VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_DENSITY_MAP_FEATURES_EXT"><type>VkStructureType</type> <name>sType</name></member>
+ <member><type>void</type>* <name>pNext</name></member>
+ <member><type>VkBool32</type> <name>fragmentDensityMap</name></member>
+ <member><type>VkBool32</type> <name>fragmentDensityMapDynamic</name></member>
+ <member><type>VkBool32</type> <name>fragmentDensityMapNonSubsampledImages</name></member>
+ </type>
+ <type category="struct" name="VkPhysicalDeviceFragmentDensityMapPropertiesEXT" returnedonly="true" structextends="VkPhysicalDeviceProperties2">
+ <member values="VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_DENSITY_MAP_PROPERTIES_EXT"><type>VkStructureType</type> <name>sType</name></member>
+ <member><type>void</type>* <name>pNext</name></member>
+ <member><type>VkExtent2D</type> <name>minFragmentDensityTexelSize</name></member>
+ <member><type>VkExtent2D</type> <name>maxFragmentDensityTexelSize</name></member>
+ <member><type>VkBool32</type> <name>fragmentDensityInvocations</name></member>
+ </type>
+ <type category="struct" name="VkRenderPassFragmentDensityMapCreateInfoEXT" structextends="VkRenderPassCreateInfo">
+ <member values="VK_STRUCTURE_TYPE_RENDER_PASS_FRAGMENT_DENSITY_MAP_CREATE_INFO_EXT"><type>VkStructureType</type> <name>sType</name></member>
+ <member>const <type>void</type>* <name>pNext</name></member>
+ <member><type>VkAttachmentReference</type> <name>fragmentDensityMapAttachment</name></member>
+ </type>
<type category="struct" name="VkPhysicalDeviceScalarBlockLayoutFeaturesEXT" structextends="VkPhysicalDeviceFeatures2,VkDeviceCreateInfo">
<member values="VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SCALAR_BLOCK_LAYOUT_FEATURES_EXT"><type>VkStructureType</type> <name>sType</name></member>
<member><type>void</type>* <name>pNext</name></member>
@@ -4256,6 +4303,10 @@ server.
<enum bitpos="3" name="VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT" comment="Allows image views to have different format than the base image"/>
<enum bitpos="4" name="VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT" comment="Allows creating image views with cube type from the created image"/>
</enums>
+ <enums name="VkImageViewCreateFlagBits" type="bitmask">
+ </enums>
+ <enums name="VkSamplerCreateFlagBits" type="bitmask">
+ </enums>
<enums name="VkPipelineCreateFlagBits" type="bitmask">
<enum bitpos="0" name="VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT"/>
<enum bitpos="1" name="VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT"/>
@@ -8458,10 +8509,12 @@ server.
<type name="VkCommandBufferInheritanceConditionalRenderingInfoEXT"/>
</require>
</extension>
- <extension name="VK_KHR_extension_83" number="83" author="KHR" contact="Jan-Harald Fredriksen @janharaldfredriksen-arm" supported="disabled">
+ <extension name="VK_KHR_shader_float16_int8" number="83" type="device" requires="VK_KHR_get_physical_device_properties2" author="KHR" contact="Alexander Galazin @alegal-arm" supported="vulkan">
<require>
- <enum value="0" name="VK_KHR_EXTENSION_83_SPEC_VERSION"/>
- <enum value="&quot;VK_KHR_extension_83&quot;" name="VK_KHR_EXTENSION_83_EXTENSION_NAME"/>
+ <enum value="1" name="VK_KHR_SHADER_FLOAT16_INT8_SPEC_VERSION"/>
+ <enum value="&quot;VK_KHR_shader_float16_int8&quot;" name="VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME"/>
+ <enum offset="0" extends="VkStructureType" name="VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR"/>
+ <type name="VkPhysicalDeviceFloat16Int8FeaturesKHR"/>
</require>
</extension>
<extension name="VK_KHR_16bit_storage" number="84" type="device" requires="VK_KHR_get_physical_device_properties2,VK_KHR_storage_buffer_storage_class" author="KHR" contact="Jan-Harald Fredriksen @janharaldfredriksen-arm" supported="vulkan" promotedto="VK_VERSION_1_1">
@@ -9510,7 +9563,7 @@ server.
</extension>
<extension name="VK_NV_ray_tracing" number="166" type="device" requires="VK_KHR_get_physical_device_properties2,VK_KHR_get_memory_requirements2" author="NV" contact="Eric Werness @ewerness" supported="vulkan">
<require>
- <enum value="2" name="VK_NV_RAY_TRACING_SPEC_VERSION"/>
+ <enum value="3" name="VK_NV_RAY_TRACING_SPEC_VERSION"/>
<enum value="&quot;VK_NV_ray_tracing&quot;" name="VK_NV_RAY_TRACING_EXTENSION_NAME"/>
<enum name="VK_SHADER_UNUSED_NV"/>
<enum offset="0" extends="VkStructureType" name="VK_STRUCTURE_TYPE_RAY_TRACING_PIPELINE_CREATE_INFO_NV"/>
@@ -9820,10 +9873,12 @@ server.
<type name="VkPhysicalDeviceDriverPropertiesKHR"/>
</require>
</extension>
- <extension name="VK_ARM_extension_198" number="198" author="Alexander Galazin" contact="Alexander Galazin @alegal-arm" supported="disabled">
+ <extension name="VK_KHR_shader_float_controls" number="198" type="device" requires="VK_KHR_get_physical_device_properties2" author="KHR" contact="Alexander Galazin @alegal-arm" supported="vulkan">
<require>
- <enum value="0" name="VK_ARM_EXTENSION_198_SPEC_VERSION"/>
- <enum value="&quot;VK_EXT_extension_198&quot;" name="VK_ARM_EXTENSION_198_EXTENSION_NAME"/>
+ <enum value="1" name="VK_KHR_SHADER_FLOAT_CONTROLS_SPEC_VERSION"/>
+ <enum value="&quot;VK_KHR_shader_float_controls&quot;" name="VK_KHR_SHADER_FLOAT_CONTROLS_EXTENSION_NAME"/>
+ <enum offset="0" extends="VkStructureType" name="VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES_KHR"/>
+ <type name="VkPhysicalDeviceFloatControlsPropertiesKHR"/>
</require>
</extension>
<extension name="VK_NV_shader_subgroup_partitioned" number="199" type="device" requiresCore="1.1" author="NV" contact="Jeff Bolz @jeffbolznv" supported="vulkan">
@@ -9839,10 +9894,11 @@ server.
<enum value="&quot;VK_KHR_extension_200&quot;" name="VK_KHR_EXTENSION_200_EXTENSION_NAME"/>
</require>
</extension>
- <extension name="VK_KHR_extension_201" number="201" type="device" author="KHR" contact="Daniel Rakos @drakos-arm" supported="disabled">
+ <extension name="VK_KHR_swapchain_mutable_format" number="201" type="device" author="KHR" requires="VK_KHR_swapchain,VK_KHR_maintenance2,VK_KHR_image_format_list" contact="Daniel Rakos @drakos-arm" supported="vulkan">
<require>
- <enum value="0" name="VK_KHR_EXTENSION_201_SPEC_VERSION"/>
- <enum value="&quot;VK_KHR_extension_201&quot;" name="VK_KHR_EXTENSION_201_EXTENSION_NAME"/>
+ <enum value="1" name="VK_KHR_SWAPCHAIN_MUTABLE_FORMAT_SPEC_VERSION"/>
+ <enum value="&quot;VK_KHR_swapchain_mutable_format&quot;" name="VK_KHR_SWAPCHAIN_MUTABLE_FORMAT_EXTENSION_NAME"/>
+ <enum bitpos="2" extends="VkSwapchainCreateFlagBitsKHR" name="VK_SWAPCHAIN_CREATE_MUTABLE_FORMAT_BIT_KHR"/>
</require>
</extension>
<extension name="VK_NV_compute_shader_derivatives" number="202" type="device" requires="VK_KHR_get_physical_device_properties2" author="NV" contact="Pat Brown @nvpbrown" supported="vulkan">
@@ -9985,15 +10041,25 @@ server.
<enum value="&quot;VK_EXT_macos_ios_window&quot;" name="VK_EXT_MACOS_IOS_WINDOW_EXTENSION_NAME"/>
</require>
</extension>
- <extension name="VK_EXT_extension_219" number="219" type="device" author="EXT" contact="Matthew Netsch @mnetsch" supported="disabled">
+ <extension name="VK_EXT_fragment_density_map" number="219" type="device" requires="VK_KHR_get_physical_device_properties2" author="EXT" contact="Matthew Netsch @mnetsch" supported="vulkan">
<require>
- <enum value="0" name="VK_EXT_EXTENSION_219_SPEC_VERSION"/>
- <enum value="&quot;VK_EXT_extension_219&quot;" name="VK_EXT_EXTENSION_219_EXTENSION_NAME"/>
- <enum bitpos="14" extends="VkImageCreateFlagBits" name="VK_IMAGE_CREATE_RESERVED_14_BIT_EXT"/>
- <enum bitpos="24" extends="VkAccessFlagBits" name="VK_ACCESS_RESERVED_24_BIT_EXT"/>
- <enum bitpos="24" extends="VkFormatFeatureFlagBits" name="VK_FORMAT_FEATURE_RESERVED_24_BIT_EXT"/>
- <enum bitpos="9" extends="VkImageUsageFlagBits" name="VK_IMAGE_USAGE_RESERVED_9_BIT_EXT"/>
- <enum bitpos="23" extends="VkPipelineStageFlagBits" name="VK_PIPELINE_STAGE_RESERVED_23_BIT_EXT"/>
+ <enum value="1" name="VK_EXT_FRAGMENT_DENSITY_MAP_SPEC_VERSION"/>
+ <enum value="&quot;VK_EXT_fragment_density_map&quot;" name="VK_EXT_FRAGMENT_DENSITY_MAP_EXTENSION_NAME"/>
+ <enum offset="0" extends="VkStructureType" name="VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_DENSITY_MAP_FEATURES_EXT"/>
+ <enum offset="1" extends="VkStructureType" name="VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_DENSITY_MAP_PROPERTIES_EXT"/>
+ <enum offset="2" extends="VkStructureType" name="VK_STRUCTURE_TYPE_RENDER_PASS_FRAGMENT_DENSITY_MAP_CREATE_INFO_EXT"/>
+ <enum bitpos="14" extends="VkImageCreateFlagBits" name="VK_IMAGE_CREATE_SUBSAMPLED_BIT_EXT"/>
+ <enum offset="0" extends="VkImageLayout" name="VK_IMAGE_LAYOUT_FRAGMENT_DENSITY_MAP_OPTIMAL_EXT"/>
+ <enum bitpos="24" extends="VkAccessFlagBits" name="VK_ACCESS_FRAGMENT_DENSITY_MAP_READ_BIT_EXT"/>
+ <enum bitpos="24" extends="VkFormatFeatureFlagBits" name="VK_FORMAT_FEATURE_FRAGMENT_DENSITY_MAP_BIT_EXT"/>
+ <enum bitpos="9" extends="VkImageUsageFlagBits" name="VK_IMAGE_USAGE_FRAGMENT_DENSITY_MAP_BIT_EXT"/>
+ <enum bitpos="0" extends="VkImageViewCreateFlagBits" name="VK_IMAGE_VIEW_CREATE_FRAGMENT_DENSITY_MAP_DYNAMIC_BIT_EXT"/>
+ <enum bitpos="23" extends="VkPipelineStageFlagBits" name="VK_PIPELINE_STAGE_FRAGMENT_DENSITY_PROCESS_BIT_EXT"/>
+ <enum bitpos="0" extends="VkSamplerCreateFlagBits" name="VK_SAMPLER_CREATE_SUBSAMPLED_BIT_EXT"/>
+ <enum bitpos="1" extends="VkSamplerCreateFlagBits" name="VK_SAMPLER_CREATE_SUBSAMPLED_COARSE_RECONSTRUCTION_BIT_EXT"/>
+ <type name="VkPhysicalDeviceFragmentDensityMapFeaturesEXT"/>
+ <type name="VkPhysicalDeviceFragmentDensityMapPropertiesEXT"/>
+ <type name="VkRenderPassFragmentDensityMapCreateInfoEXT"/>
</require>
</extension>
<extension name="VK_EXT_extension_220" number="220" author="EXT" contact="Dzmitry Malyshau @kvark" supported="disabled">
@@ -10175,5 +10241,11 @@ server.
<enum value="&quot;VK_EXT_extension_248&quot;" name="VK_EXT_EXTENSION_248_EXTENSION_NAME"/>
</require>
</extension>
+ <extension name="VK_KHR_extension_249" number="249" author="KHR" contact="Keith Packard @keithp" supported="disabled">
+ <require>
+ <enum value="0" name="VK_KHR_EXTENSION_249_SPEC_VERSION"/>
+ <enum value="&quot;VK_KHR_extension_249&quot;" name="VK_KHR_EXTENSION_249_EXTENSION_NAME"/>
+ </require>
+ </extension>
</extensions>
</registry>
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:16:55 UTC
Permalink
We were assuming 32-bit elements.
---
src/intel/compiler/brw_fs_generator.cpp | 34 +++++++++++++------------
1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index 08dd83dded7..bffd9bc4787 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1259,7 +1259,7 @@ fs_generator::generate_ddx(const fs_inst *inst,
struct brw_reg src0 = src;
struct brw_reg src1 = src;

- src0.subnr = sizeof(float);
+ src0.subnr = type_sz(src.type);
src0.vstride = vstride;
src0.width = width;
src0.hstride = BRW_HORIZONTAL_STRIDE_0;
@@ -1278,23 +1278,25 @@ void
fs_generator::generate_ddy(const fs_inst *inst,
struct brw_reg dst, struct brw_reg src)
{
+ const uint32_t type_size = type_sz(src.type);
+
if (inst->opcode == FS_OPCODE_DDY_FINE) {
/* produce accurate derivatives */
if (devinfo->gen >= 11) {
src = stride(src, 0, 2, 1);
- struct brw_reg src_0 = byte_offset(src, 0 * sizeof(float));
- struct brw_reg src_2 = byte_offset(src, 2 * sizeof(float));
- struct brw_reg src_4 = byte_offset(src, 4 * sizeof(float));
- struct brw_reg src_6 = byte_offset(src, 6 * sizeof(float));
- struct brw_reg src_8 = byte_offset(src, 8 * sizeof(float));
- struct brw_reg src_10 = byte_offset(src, 10 * sizeof(float));
- struct brw_reg src_12 = byte_offset(src, 12 * sizeof(float));
- struct brw_reg src_14 = byte_offset(src, 14 * sizeof(float));
-
- struct brw_reg dst_0 = byte_offset(dst, 0 * sizeof(float));
- struct brw_reg dst_4 = byte_offset(dst, 4 * sizeof(float));
- struct brw_reg dst_8 = byte_offset(dst, 8 * sizeof(float));
- struct brw_reg dst_12 = byte_offset(dst, 12 * sizeof(float));
+ struct brw_reg src_0 = byte_offset(src, 0 * type_size);
+ struct brw_reg src_2 = byte_offset(src, 2 * type_size);
+ struct brw_reg src_4 = byte_offset(src, 4 * type_size);
+ struct brw_reg src_6 = byte_offset(src, 6 * type_size);
+ struct brw_reg src_8 = byte_offset(src, 8 * type_size);
+ struct brw_reg src_10 = byte_offset(src, 10 * type_size);
+ struct brw_reg src_12 = byte_offset(src, 12 * type_size);
+ struct brw_reg src_14 = byte_offset(src, 14 * type_size);
+
+ struct brw_reg dst_0 = byte_offset(dst, 0 * type_size);
+ struct brw_reg dst_4 = byte_offset(dst, 4 * type_size);
+ struct brw_reg dst_8 = byte_offset(dst, 8 * type_size);
+ struct brw_reg dst_12 = byte_offset(dst, 12 * type_size);

brw_push_insn_state(p);
brw_set_default_exec_size(p, BRW_EXECUTE_4);
@@ -1323,8 +1325,8 @@ fs_generator::generate_ddy(const fs_inst *inst,
/* replicate the derivative at the top-left pixel to other pixels */
struct brw_reg src0 = stride(src, 4, 4, 0);
struct brw_reg src1 = stride(src, 4, 4, 0);
- src0.subnr = 0 * sizeof(float);
- src1.subnr = 2 * sizeof(float);
+ src0.subnr = 0 * type_size;
+ src1.subnr = 2 * type_size;

brw_ADD(p, dst, negate(src0), src1);
}
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:17:03 UTC
Permalink
---
src/intel/vulkan/anv_device.c | 9 +++++++++
1 file changed, 9 insertions(+)

diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 6b5ba25c6bc..caf25ad8a03 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -966,6 +966,15 @@ void anv_GetPhysicalDeviceFeatures2(
break;
}

+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR: {
+ VkPhysicalDeviceFloat16Int8FeaturesKHR *features = (void *)ext;
+ ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
+
+ features->shaderFloat16 = pdevice->info.gen >= 8;
+ features->shaderInt8 = false;
+ break;
+ }
+
default:
anv_debug_ignored_stype(ext->sType);
break;
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:17:05 UTC
Permalink
This function is used in two different scenarios that for 32-bit
instructions are the same, but for 16-bit instructions are not.

One scenario is that in which we are working at a SIMD8 register
level and we need to know if a register is fully defined or written.
This is useful, for example, in the context of liveness analysis or
register allocation, where we work with units of registers.

The other scenario is that in which we want to know if an instruction
is writing a full scalar component or just some subset of it. This is
useful, for example, in the context of some optimization passes
like copy propagation.

For 32-bit instructions (or larger), a SIMD8 dispatch will always write
at least a full SIMD8 register (32B) if the write is not partial. The
function is_partial_write() checks this to determine if we have a partial
write. However, when we deal with 16-bit instructions, that logic disables
some optimizations that should be safe. For example, a SIMD8 16-bit MOV will
only update half of a SIMD register, but it is still a complete write of the
variable for a SIMD8 dispatch, so we should not prevent copy propagation in
this scenario because we don't write all 32 bytes in the SIMD register
or because the write starts at offset 16B (where we pack components Y or
W of 16-bit vectors).

This is a problem for SIMD8 executions (VS, TCS, TES, GS) of 16-bit
instructions, which lose a number of optimizations because of this, most
important of which is copy-propagation.

This patch splits is_partial_write() into is_partial_reg_write(), which
represents the current is_partial_write(), useful for things like
liveness analysis, and is_partial_var_write(), which considers
the dispatch size to check if we are writing a full variable (rather
than a full register) to decide if the write is partial or not, which
is what we really want in many optimization passes.

Then the patch goes on and rewrites all uses of is_partial_write() to use
one or the other version. Specifically, we use is_partial_var_write()
in the following places: copy propagation, cmod propagation, common
subexpression elimination, saturate propagation and sel peephole.

Notice that the semantics of is_partial_var_write() exactly match the
current implementation of is_partial_write() for anything that is
32-bit or larger, so no changes are expected for 32-bit instructions.

Testing against ~5000 tests involving 16-bit instructions in CTS produced
the following changes in instruction counts:

Patched | Master | % |
================================================
SIMD8 | 621,900 | 706,721 | -12.00% |
================================================
SIMD16 | 93,252 | 93,252 | 0.00% |
================================================

As expected, the change only affects SIMD8 dispatches.
---
src/intel/compiler/brw_fs.cpp | 31 +++++++++++++++----
.../compiler/brw_fs_cmod_propagation.cpp | 20 ++++++------
.../compiler/brw_fs_copy_propagation.cpp | 8 ++---
src/intel/compiler/brw_fs_cse.cpp | 3 +-
.../compiler/brw_fs_dead_code_eliminate.cpp | 2 +-
src/intel/compiler/brw_fs_live_variables.cpp | 2 +-
src/intel/compiler/brw_fs_reg_allocate.cpp | 2 +-
.../compiler/brw_fs_register_coalesce.cpp | 2 +-
.../compiler/brw_fs_saturate_propagation.cpp | 7 +++--
src/intel/compiler/brw_fs_sel_peephole.cpp | 4 +--
src/intel/compiler/brw_ir_fs.h | 3 +-
11 files changed, 54 insertions(+), 30 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 1d5d1dd0d22..9ea67975e1e 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -698,14 +698,33 @@ fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
* it.
*/
bool
-fs_inst::is_partial_write() const
+fs_inst::is_partial_reg_write() const
{
return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
- (this->exec_size * type_sz(this->dst.type)) < 32 ||
!this->dst.is_contiguous() ||
+ (this->exec_size * type_sz(this->dst.type)) < REG_SIZE ||
this->dst.offset % REG_SIZE != 0);
}

+/**
+ * Returns true if the instruction has a flag that means it won't
+ * update an entire variable for the given dispatch width.
+ *
+ * This is only different from is_partial_reg_write() for SIMD8
+ * dispatches of 16-bit (or smaller) instructions.
+ */
+bool
+fs_inst::is_partial_var_write(uint32_t dispatch_width) const
+{
+ const uint32_t type_size = type_sz(this->dst.type);
+ uint32_t var_size = MIN2(REG_SIZE, dispatch_width * type_size);
+
+ return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
+ !this->dst.is_contiguous() ||
+ (this->exec_size * type_sz(this->dst.type)) < var_size ||
+ this->dst.offset % var_size != 0);
+}
+
unsigned
fs_inst::components_read(unsigned i) const
{
@@ -2847,7 +2866,7 @@ fs_visitor::opt_register_renaming()
if (depth == 0 &&
inst->dst.file == VGRF &&
alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written &&
- !inst->is_partial_write()) {
+ !inst->is_partial_reg_write()) {
if (remap[dst] == -1) {
remap[dst] = dst;
} else {
@@ -3050,7 +3069,7 @@ fs_visitor::compute_to_mrf()
next_ip++;

if (inst->opcode != BRW_OPCODE_MOV ||
- inst->is_partial_write() ||
+ inst->is_partial_reg_write() ||
inst->dst.file != MRF || inst->src[0].file != VGRF ||
inst->dst.type != inst->src[0].type ||
inst->src[0].abs || inst->src[0].negate ||
@@ -3083,7 +3102,7 @@ fs_visitor::compute_to_mrf()
* that writes that reg, but it would require smarter
* tracking.
*/
- if (scan_inst->is_partial_write())
+ if (scan_inst->is_partial_reg_write())
break;

/* Handling things not fully contained in the source of the copy
@@ -3395,7 +3414,7 @@ fs_visitor::remove_duplicate_mrf_writes()
if (inst->opcode == BRW_OPCODE_MOV &&
inst->dst.file == MRF &&
inst->src[0].file != ARF &&
- !inst->is_partial_write()) {
+ !inst->is_partial_reg_write()) {
last_mrf_move[inst->dst.nr] = inst;
}
}
diff --git a/src/intel/compiler/brw_fs_cmod_propagation.cpp b/src/intel/compiler/brw_fs_cmod_propagation.cpp
index 5fb522f810f..7bb5c9afbc9 100644
--- a/src/intel/compiler/brw_fs_cmod_propagation.cpp
+++ b/src/intel/compiler/brw_fs_cmod_propagation.cpp
@@ -50,13 +50,13 @@

static bool
cmod_propagate_cmp_to_add(const gen_device_info *devinfo, bblock_t *block,
- fs_inst *inst)
+ fs_inst *inst, unsigned dispatch_width)
{
bool read_flag = false;

foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
if (scan_inst->opcode == BRW_OPCODE_ADD &&
- !scan_inst->is_partial_write() &&
+ !scan_inst->is_partial_var_write(dispatch_width) &&
scan_inst->exec_size == inst->exec_size) {
bool negate;

@@ -126,7 +126,7 @@ cmod_propagate_cmp_to_add(const gen_device_info *devinfo, bblock_t *block,
*/
static bool
cmod_propagate_not(const gen_device_info *devinfo, bblock_t *block,
- fs_inst *inst)
+ fs_inst *inst, unsigned dispatch_width)
{
const enum brw_conditional_mod cond = brw_negate_cmod(inst->conditional_mod);
bool read_flag = false;
@@ -141,7 +141,7 @@ cmod_propagate_not(const gen_device_info *devinfo, bblock_t *block,
scan_inst->opcode != BRW_OPCODE_AND)
break;

- if (scan_inst->is_partial_write() ||
+ if (scan_inst->is_partial_var_write(dispatch_width) ||
scan_inst->dst.offset != inst->src[0].offset ||
scan_inst->exec_size != inst->exec_size)
break;
@@ -166,7 +166,9 @@ cmod_propagate_not(const gen_device_info *devinfo, bblock_t *block,
}

static bool
-opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block)
+opt_cmod_propagation_local(const gen_device_info *devinfo,
+ bblock_t *block,
+ unsigned dispatch_width)
{
bool progress = false;
int ip = block->end_ip + 1;
@@ -219,14 +221,14 @@ opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block)
*/
if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) {
if (brw_reg_type_is_floating_point(inst->src[0].type) &&
- cmod_propagate_cmp_to_add(devinfo, block, inst))
+ cmod_propagate_cmp_to_add(devinfo, block, inst, dispatch_width))
progress = true;

continue;
}

if (inst->opcode == BRW_OPCODE_NOT) {
- progress = cmod_propagate_not(devinfo, block, inst) || progress;
+ progress = cmod_propagate_not(devinfo, block, inst, dispatch_width) || progress;
continue;
}

@@ -234,7 +236,7 @@ opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block)
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
inst->src[0], inst->size_read(0))) {
- if (scan_inst->is_partial_write() ||
+ if (scan_inst->is_partial_var_write(dispatch_width) ||
scan_inst->dst.offset != inst->src[0].offset ||
scan_inst->exec_size != inst->exec_size)
break;
@@ -342,7 +344,7 @@ fs_visitor::opt_cmod_propagation()
bool progress = false;

foreach_block_reverse(block, cfg) {
- progress = opt_cmod_propagation_local(devinfo, block) || progress;
+ progress = opt_cmod_propagation_local(devinfo, block, dispatch_width) || progress;
}

if (progress)
diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp
index c01d4ec4a4f..0f0284115fb 100644
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@@ -505,7 +505,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
/* Compute the first component of the copy that the instruction is
* reading, and the base byte offset within that component.
*/
- assert(entry->dst.offset % REG_SIZE == 0 && entry->dst.stride == 1);
+ assert(entry->dst.stride == 1);
const unsigned component = rel_offset / type_sz(entry->dst.type);
const unsigned suboffset = rel_offset % type_sz(entry->dst.type);

@@ -783,7 +783,7 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
}

static bool
-can_propagate_from(fs_inst *inst)
+can_propagate_from(fs_inst *inst, unsigned dispatch_width)
{
return (inst->opcode == BRW_OPCODE_MOV &&
inst->dst.file == VGRF &&
@@ -794,7 +794,7 @@ can_propagate_from(fs_inst *inst)
inst->src[0].file == UNIFORM ||
inst->src[0].file == IMM) &&
inst->src[0].type == inst->dst.type &&
- !inst->is_partial_write());
+ !inst->is_partial_var_write(dispatch_width));
}

/* Walks a basic block and does copy propagation on it using the acp
@@ -846,7 +846,7 @@ fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block,
/* If this instruction's source could potentially be folded into the
* operand of another instruction, add it to the ACP.
*/
- if (can_propagate_from(inst)) {
+ if (can_propagate_from(inst, dispatch_width)) {
acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
entry->dst = inst->dst;
entry->src = inst->src[0];
diff --git a/src/intel/compiler/brw_fs_cse.cpp b/src/intel/compiler/brw_fs_cse.cpp
index 6859733d58c..56813df2d2a 100644
--- a/src/intel/compiler/brw_fs_cse.cpp
+++ b/src/intel/compiler/brw_fs_cse.cpp
@@ -247,7 +247,8 @@ fs_visitor::opt_cse_local(bblock_t *block)
int ip = block->start_ip;
foreach_inst_in_block(fs_inst, inst, block) {
/* Skip some cases. */
- if (is_expression(this, inst) && !inst->is_partial_write() &&
+ if (is_expression(this, inst) &&
+ !inst->is_partial_var_write(dispatch_width) &&
((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) ||
inst->dst.is_null()))
{
diff --git a/src/intel/compiler/brw_fs_dead_code_eliminate.cpp b/src/intel/compiler/brw_fs_dead_code_eliminate.cpp
index eeb71dd2b92..f24fd0643b8 100644
--- a/src/intel/compiler/brw_fs_dead_code_eliminate.cpp
+++ b/src/intel/compiler/brw_fs_dead_code_eliminate.cpp
@@ -110,7 +110,7 @@ fs_visitor::dead_code_eliminate()
}

if (inst->dst.file == VGRF) {
- if (!inst->is_partial_write()) {
+ if (!inst->is_partial_reg_write()) {
int var = live_intervals->var_from_reg(inst->dst);
for (unsigned i = 0; i < regs_written(inst); i++) {
BITSET_CLEAR(live, var + i);
diff --git a/src/intel/compiler/brw_fs_live_variables.cpp b/src/intel/compiler/brw_fs_live_variables.cpp
index 059f076fa51..30625aa586a 100644
--- a/src/intel/compiler/brw_fs_live_variables.cpp
+++ b/src/intel/compiler/brw_fs_live_variables.cpp
@@ -84,7 +84,7 @@ fs_live_variables::setup_one_write(struct block_data *bd, fs_inst *inst,
* screens off previous updates of that variable (VGRF channel).
*/
if (inst->dst.file == VGRF) {
- if (!inst->is_partial_write() && !BITSET_TEST(bd->use, var))
+ if (!inst->is_partial_reg_write() && !BITSET_TEST(bd->use, var))
BITSET_SET(bd->def, var);

BITSET_SET(bd->defout, var);
diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp
index 73b8b7841f5..5147e8b4426 100644
--- a/src/intel/compiler/brw_fs_reg_allocate.cpp
+++ b/src/intel/compiler/brw_fs_reg_allocate.cpp
@@ -1026,7 +1026,7 @@ fs_visitor::spill_reg(int spill_reg)
* write, there should be no need for the unspill since the
* instruction will be overwriting the whole destination in any case.
*/
- if (inst->is_partial_write() ||
+ if (inst->is_partial_reg_write() ||
(!inst->force_writemask_all && !per_channel))
emit_unspill(ubld, spill_src, subset_spill_offset,
regs_written(inst));
diff --git a/src/intel/compiler/brw_fs_register_coalesce.cpp b/src/intel/compiler/brw_fs_register_coalesce.cpp
index 952276faed8..b27956023c6 100644
--- a/src/intel/compiler/brw_fs_register_coalesce.cpp
+++ b/src/intel/compiler/brw_fs_register_coalesce.cpp
@@ -70,7 +70,7 @@ is_coalesce_candidate(const fs_visitor *v, const fs_inst *inst)
{
if ((inst->opcode != BRW_OPCODE_MOV &&
inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) ||
- inst->is_partial_write() ||
+ inst->is_partial_reg_write() ||
inst->saturate ||
inst->src[0].file != VGRF ||
inst->src[0].negate ||
diff --git a/src/intel/compiler/brw_fs_saturate_propagation.cpp b/src/intel/compiler/brw_fs_saturate_propagation.cpp
index d6cfa79a618..1e1461063ae 100644
--- a/src/intel/compiler/brw_fs_saturate_propagation.cpp
+++ b/src/intel/compiler/brw_fs_saturate_propagation.cpp
@@ -43,7 +43,8 @@
*/

static bool
-opt_saturate_propagation_local(fs_visitor *v, bblock_t *block)
+opt_saturate_propagation_local(fs_visitor *v, bblock_t *block,
+ unsigned dispatch_width)
{
bool progress = false;
int ip = block->end_ip + 1;
@@ -66,7 +67,7 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block)
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
inst->src[0], inst->size_read(0))) {
- if (scan_inst->is_partial_write() ||
+ if (scan_inst->is_partial_var_write(dispatch_width) ||
(scan_inst->dst.type != inst->dst.type &&
!scan_inst->can_change_types()))
break;
@@ -153,7 +154,7 @@ fs_visitor::opt_saturate_propagation()
calculate_live_intervals();

foreach_block (block, cfg) {
- progress = opt_saturate_propagation_local(this, block) || progress;
+ progress = opt_saturate_propagation_local(this, block, dispatch_width) || progress;
}

/* Live intervals are still valid. */
diff --git a/src/intel/compiler/brw_fs_sel_peephole.cpp b/src/intel/compiler/brw_fs_sel_peephole.cpp
index 6395b409b7c..98d640a3bfe 100644
--- a/src/intel/compiler/brw_fs_sel_peephole.cpp
+++ b/src/intel/compiler/brw_fs_sel_peephole.cpp
@@ -167,8 +167,8 @@ fs_visitor::opt_peephole_sel()
then_mov[i]->exec_size != else_mov[i]->exec_size ||
then_mov[i]->group != else_mov[i]->group ||
then_mov[i]->force_writemask_all != else_mov[i]->force_writemask_all ||
- then_mov[i]->is_partial_write() ||
- else_mov[i]->is_partial_write() ||
+ then_mov[i]->is_partial_var_write(dispatch_width) ||
+ else_mov[i]->is_partial_var_write(dispatch_width) ||
then_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE ||
else_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE) {
movs = i;
diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h
index 07e7224e0f8..02c91b246a3 100644
--- a/src/intel/compiler/brw_ir_fs.h
+++ b/src/intel/compiler/brw_ir_fs.h
@@ -349,7 +349,8 @@ public:

bool equals(fs_inst *inst) const;
bool is_send_from_grf() const;
- bool is_partial_write() const;
+ bool is_partial_reg_write() const;
+ bool is_partial_var_write(unsigned dispatch_width) const;
bool is_copy_payload(const brw::simple_allocator &grf_alloc) const;
unsigned components_read(unsigned i) const;
unsigned size_read(int arg) const;
--
2.17.1
Pohjolainen, Topi
2018-12-07 13:30:11 UTC
Permalink
Post by Iago Toral Quiroga
This function is used in two different scenarios that for 32-bit
instructions are the same, but for 16-bit instructions are not.
One scenario is that in which we are working at a SIMD8 register
level and we need to know if a register is fully defined or written.
This is useful, for example, in the context of liveness analysis or
register allocation, where we work with units of registers.
The other scenario is that in which we want to know if an instruction
is writing a full scalar component or just some subset of it. This is
useful, for example, in the context of some optimization passes
like copy propagation.
For 32-bit instructions (or larger), a SIMD8 dispatch will always write
at least a full SIMD8 register (32B) if the write is not partial. The
function is_partial_write() checks this to determine if we have a partial
write. However, when we deal with 16-bit instructions, that logic disables
some optimizations that should be safe. For example, a SIMD8 16-bit MOV will
only update half of a SIMD register, but it is still a complete write of the
variable for a SIMD8 dispatch, so we should not prevent copy propagation in
this scenario because we don't write all 32 bytes in the SIMD register
or because the write starts at offset 16B (where we pack components Y or
W of 16-bit vectors).
This is a problem for SIMD8 executions (VS, TCS, TES, GS) of 16-bit
instructions, which lose a number of optimizations because of this, most
important of which is copy-propagation.
This patch splits is_partial_write() into is_partial_reg_write(), which
represents the current is_partial_write(), useful for things like
liveness analysis, and is_partial_var_write(), which considers
the dispatch size to check if we are writing a full variable (rather
than a full register) to decide if the write is partial or not, which
is what we really want in many optimization passes.
Then the patch goes on and rewrites all uses of is_partial_write() to use
one or the other version. Specifically, we use is_partial_var_write()
in the following places: copy propagation, cmod propagation, common
subexpression elimination, saturate propagation and sel peephole.
Notice that the semantics of is_partial_var_write() exactly match the
current implementation of is_partial_write() for anything that is
32-bit or larger, so no changes are expected for 32-bit instructions.
Tested against ~5000 tests involving 16-bit instructions in CTS produced
Patched | Master | % |
================================================
SIMD8 | 621,900 | 706,721 | -12.00% |
================================================
SIMD16 | 93,252 | 93,252 | 0.00% |
================================================
As expected, the change only affects SIMD8 dispatches.
I like this. But I think I want to try and rebase my fp16 work on top to see
if there are any differences in the final assembly between this and my
"register padding" scheme.
Post by Iago Toral Quiroga
---
src/intel/compiler/brw_fs.cpp | 31 +++++++++++++++----
.../compiler/brw_fs_cmod_propagation.cpp | 20 ++++++------
.../compiler/brw_fs_copy_propagation.cpp | 8 ++---
src/intel/compiler/brw_fs_cse.cpp | 3 +-
.../compiler/brw_fs_dead_code_eliminate.cpp | 2 +-
src/intel/compiler/brw_fs_live_variables.cpp | 2 +-
src/intel/compiler/brw_fs_reg_allocate.cpp | 2 +-
.../compiler/brw_fs_register_coalesce.cpp | 2 +-
.../compiler/brw_fs_saturate_propagation.cpp | 7 +++--
src/intel/compiler/brw_fs_sel_peephole.cpp | 4 +--
src/intel/compiler/brw_ir_fs.h | 3 +-
11 files changed, 54 insertions(+), 30 deletions(-)
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 1d5d1dd0d22..9ea67975e1e 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -698,14 +698,33 @@ fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
* it.
*/
bool
-fs_inst::is_partial_write() const
+fs_inst::is_partial_reg_write() const
{
return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
- (this->exec_size * type_sz(this->dst.type)) < 32 ||
!this->dst.is_contiguous() ||
+ (this->exec_size * type_sz(this->dst.type)) < REG_SIZE ||
this->dst.offset % REG_SIZE != 0);
}
+/**
+ * Returns true if the instruction has a flag that means it won't
+ * update an entire variable for the given dispatch width.
+ *
+ * This is only different from is_partial_reg_write() for SIMD8
+ * dispatches of 16-bit (or smaller) instructions.
+ */
+bool
+fs_inst::is_partial_var_write(uint32_t dispatch_width) const
+{
+ const uint32_t type_size = type_sz(this->dst.type);
+ uint32_t var_size = MIN2(REG_SIZE, dispatch_width * type_size);
+
+ return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
+ !this->dst.is_contiguous() ||
+ (this->exec_size * type_sz(this->dst.type)) < var_size ||
+ this->dst.offset % var_size != 0);
+}
+
unsigned
fs_inst::components_read(unsigned i) const
{
@@ -2847,7 +2866,7 @@ fs_visitor::opt_register_renaming()
if (depth == 0 &&
inst->dst.file == VGRF &&
alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written &&
- !inst->is_partial_write()) {
+ !inst->is_partial_reg_write()) {
if (remap[dst] == -1) {
remap[dst] = dst;
} else {
@@ -3050,7 +3069,7 @@ fs_visitor::compute_to_mrf()
next_ip++;
if (inst->opcode != BRW_OPCODE_MOV ||
- inst->is_partial_write() ||
+ inst->is_partial_reg_write() ||
inst->dst.file != MRF || inst->src[0].file != VGRF ||
inst->dst.type != inst->src[0].type ||
inst->src[0].abs || inst->src[0].negate ||
@@ -3083,7 +3102,7 @@ fs_visitor::compute_to_mrf()
* that writes that reg, but it would require smarter
* tracking.
*/
- if (scan_inst->is_partial_write())
+ if (scan_inst->is_partial_reg_write())
break;
/* Handling things not fully contained in the source of the copy
@@ -3395,7 +3414,7 @@ fs_visitor::remove_duplicate_mrf_writes()
if (inst->opcode == BRW_OPCODE_MOV &&
inst->dst.file == MRF &&
inst->src[0].file != ARF &&
- !inst->is_partial_write()) {
+ !inst->is_partial_reg_write()) {
last_mrf_move[inst->dst.nr] = inst;
}
}
diff --git a/src/intel/compiler/brw_fs_cmod_propagation.cpp b/src/intel/compiler/brw_fs_cmod_propagation.cpp
index 5fb522f810f..7bb5c9afbc9 100644
--- a/src/intel/compiler/brw_fs_cmod_propagation.cpp
+++ b/src/intel/compiler/brw_fs_cmod_propagation.cpp
@@ -50,13 +50,13 @@
static bool
cmod_propagate_cmp_to_add(const gen_device_info *devinfo, bblock_t *block,
- fs_inst *inst)
+ fs_inst *inst, unsigned dispatch_width)
{
bool read_flag = false;
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
if (scan_inst->opcode == BRW_OPCODE_ADD &&
- !scan_inst->is_partial_write() &&
+ !scan_inst->is_partial_var_write(dispatch_width) &&
scan_inst->exec_size == inst->exec_size) {
bool negate;
@@ -126,7 +126,7 @@ cmod_propagate_cmp_to_add(const gen_device_info *devinfo, bblock_t *block,
*/
static bool
cmod_propagate_not(const gen_device_info *devinfo, bblock_t *block,
- fs_inst *inst)
+ fs_inst *inst, unsigned dispatch_width)
{
const enum brw_conditional_mod cond = brw_negate_cmod(inst->conditional_mod);
bool read_flag = false;
@@ -141,7 +141,7 @@ cmod_propagate_not(const gen_device_info *devinfo, bblock_t *block,
scan_inst->opcode != BRW_OPCODE_AND)
break;
- if (scan_inst->is_partial_write() ||
+ if (scan_inst->is_partial_var_write(dispatch_width) ||
scan_inst->dst.offset != inst->src[0].offset ||
scan_inst->exec_size != inst->exec_size)
break;
@@ -166,7 +166,9 @@ cmod_propagate_not(const gen_device_info *devinfo, bblock_t *block,
}
static bool
-opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block)
+opt_cmod_propagation_local(const gen_device_info *devinfo,
+ bblock_t *block,
+ unsigned dispatch_width)
{
bool progress = false;
int ip = block->end_ip + 1;
@@ -219,14 +221,14 @@ opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block)
*/
if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) {
if (brw_reg_type_is_floating_point(inst->src[0].type) &&
- cmod_propagate_cmp_to_add(devinfo, block, inst))
+ cmod_propagate_cmp_to_add(devinfo, block, inst, dispatch_width))
progress = true;
continue;
}
if (inst->opcode == BRW_OPCODE_NOT) {
- progress = cmod_propagate_not(devinfo, block, inst) || progress;
+ progress = cmod_propagate_not(devinfo, block, inst, dispatch_width) || progress;
continue;
}
@@ -234,7 +236,7 @@ opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block)
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
inst->src[0], inst->size_read(0))) {
- if (scan_inst->is_partial_write() ||
+ if (scan_inst->is_partial_var_write(dispatch_width) ||
scan_inst->dst.offset != inst->src[0].offset ||
scan_inst->exec_size != inst->exec_size)
break;
@@ -342,7 +344,7 @@ fs_visitor::opt_cmod_propagation()
bool progress = false;
foreach_block_reverse(block, cfg) {
- progress = opt_cmod_propagation_local(devinfo, block) || progress;
+ progress = opt_cmod_propagation_local(devinfo, block, dispatch_width) || progress;
}
if (progress)
diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp
index c01d4ec4a4f..0f0284115fb 100644
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@@ -505,7 +505,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
/* Compute the first component of the copy that the instruction is
* reading, and the base byte offset within that component.
*/
- assert(entry->dst.offset % REG_SIZE == 0 && entry->dst.stride == 1);
+ assert(entry->dst.stride == 1);
const unsigned component = rel_offset / type_sz(entry->dst.type);
const unsigned suboffset = rel_offset % type_sz(entry->dst.type);
@@ -783,7 +783,7 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
}
static bool
-can_propagate_from(fs_inst *inst)
+can_propagate_from(fs_inst *inst, unsigned dispatch_width)
{
return (inst->opcode == BRW_OPCODE_MOV &&
inst->dst.file == VGRF &&
@@ -794,7 +794,7 @@ can_propagate_from(fs_inst *inst)
inst->src[0].file == UNIFORM ||
inst->src[0].file == IMM) &&
inst->src[0].type == inst->dst.type &&
- !inst->is_partial_write());
+ !inst->is_partial_var_write(dispatch_width));
}
/* Walks a basic block and does copy propagation on it using the acp
@@ -846,7 +846,7 @@ fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block,
/* If this instruction's source could potentially be folded into the
* operand of another instruction, add it to the ACP.
*/
- if (can_propagate_from(inst)) {
+ if (can_propagate_from(inst, dispatch_width)) {
acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
entry->dst = inst->dst;
entry->src = inst->src[0];
diff --git a/src/intel/compiler/brw_fs_cse.cpp b/src/intel/compiler/brw_fs_cse.cpp
index 6859733d58c..56813df2d2a 100644
--- a/src/intel/compiler/brw_fs_cse.cpp
+++ b/src/intel/compiler/brw_fs_cse.cpp
@@ -247,7 +247,8 @@ fs_visitor::opt_cse_local(bblock_t *block)
int ip = block->start_ip;
foreach_inst_in_block(fs_inst, inst, block) {
/* Skip some cases. */
- if (is_expression(this, inst) && !inst->is_partial_write() &&
+ if (is_expression(this, inst) &&
+ !inst->is_partial_var_write(dispatch_width) &&
((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) ||
inst->dst.is_null()))
{
diff --git a/src/intel/compiler/brw_fs_dead_code_eliminate.cpp b/src/intel/compiler/brw_fs_dead_code_eliminate.cpp
index eeb71dd2b92..f24fd0643b8 100644
--- a/src/intel/compiler/brw_fs_dead_code_eliminate.cpp
+++ b/src/intel/compiler/brw_fs_dead_code_eliminate.cpp
@@ -110,7 +110,7 @@ fs_visitor::dead_code_eliminate()
}
if (inst->dst.file == VGRF) {
- if (!inst->is_partial_write()) {
+ if (!inst->is_partial_reg_write()) {
int var = live_intervals->var_from_reg(inst->dst);
for (unsigned i = 0; i < regs_written(inst); i++) {
BITSET_CLEAR(live, var + i);
diff --git a/src/intel/compiler/brw_fs_live_variables.cpp b/src/intel/compiler/brw_fs_live_variables.cpp
index 059f076fa51..30625aa586a 100644
--- a/src/intel/compiler/brw_fs_live_variables.cpp
+++ b/src/intel/compiler/brw_fs_live_variables.cpp
@@ -84,7 +84,7 @@ fs_live_variables::setup_one_write(struct block_data *bd, fs_inst *inst,
* screens off previous updates of that variable (VGRF channel).
*/
if (inst->dst.file == VGRF) {
- if (!inst->is_partial_write() && !BITSET_TEST(bd->use, var))
+ if (!inst->is_partial_reg_write() && !BITSET_TEST(bd->use, var))
BITSET_SET(bd->def, var);
BITSET_SET(bd->defout, var);
diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp
index 73b8b7841f5..5147e8b4426 100644
--- a/src/intel/compiler/brw_fs_reg_allocate.cpp
+++ b/src/intel/compiler/brw_fs_reg_allocate.cpp
@@ -1026,7 +1026,7 @@ fs_visitor::spill_reg(int spill_reg)
* write, there should be no need for the unspill since the
* instruction will be overwriting the whole destination in any case.
*/
- if (inst->is_partial_write() ||
+ if (inst->is_partial_reg_write() ||
(!inst->force_writemask_all && !per_channel))
emit_unspill(ubld, spill_src, subset_spill_offset,
regs_written(inst));
diff --git a/src/intel/compiler/brw_fs_register_coalesce.cpp b/src/intel/compiler/brw_fs_register_coalesce.cpp
index 952276faed8..b27956023c6 100644
--- a/src/intel/compiler/brw_fs_register_coalesce.cpp
+++ b/src/intel/compiler/brw_fs_register_coalesce.cpp
@@ -70,7 +70,7 @@ is_coalesce_candidate(const fs_visitor *v, const fs_inst *inst)
{
if ((inst->opcode != BRW_OPCODE_MOV &&
inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) ||
- inst->is_partial_write() ||
+ inst->is_partial_reg_write() ||
inst->saturate ||
inst->src[0].file != VGRF ||
inst->src[0].negate ||
diff --git a/src/intel/compiler/brw_fs_saturate_propagation.cpp b/src/intel/compiler/brw_fs_saturate_propagation.cpp
index d6cfa79a618..1e1461063ae 100644
--- a/src/intel/compiler/brw_fs_saturate_propagation.cpp
+++ b/src/intel/compiler/brw_fs_saturate_propagation.cpp
@@ -43,7 +43,8 @@
*/
static bool
-opt_saturate_propagation_local(fs_visitor *v, bblock_t *block)
+opt_saturate_propagation_local(fs_visitor *v, bblock_t *block,
+ unsigned dispatch_width)
{
bool progress = false;
int ip = block->end_ip + 1;
@@ -66,7 +67,7 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block)
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
inst->src[0], inst->size_read(0))) {
- if (scan_inst->is_partial_write() ||
+ if (scan_inst->is_partial_var_write(dispatch_width) ||
(scan_inst->dst.type != inst->dst.type &&
!scan_inst->can_change_types()))
break;
@@ -153,7 +154,7 @@ fs_visitor::opt_saturate_propagation()
calculate_live_intervals();
foreach_block (block, cfg) {
- progress = opt_saturate_propagation_local(this, block) || progress;
+ progress = opt_saturate_propagation_local(this, block, dispatch_width) || progress;
}
/* Live intervals are still valid. */
diff --git a/src/intel/compiler/brw_fs_sel_peephole.cpp b/src/intel/compiler/brw_fs_sel_peephole.cpp
index 6395b409b7c..98d640a3bfe 100644
--- a/src/intel/compiler/brw_fs_sel_peephole.cpp
+++ b/src/intel/compiler/brw_fs_sel_peephole.cpp
@@ -167,8 +167,8 @@ fs_visitor::opt_peephole_sel()
then_mov[i]->exec_size != else_mov[i]->exec_size ||
then_mov[i]->group != else_mov[i]->group ||
then_mov[i]->force_writemask_all != else_mov[i]->force_writemask_all ||
- then_mov[i]->is_partial_write() ||
- else_mov[i]->is_partial_write() ||
+ then_mov[i]->is_partial_var_write(dispatch_width) ||
+ else_mov[i]->is_partial_var_write(dispatch_width) ||
then_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE ||
else_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE) {
movs = i;
diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h
index 07e7224e0f8..02c91b246a3 100644
--- a/src/intel/compiler/brw_ir_fs.h
+++ b/src/intel/compiler/brw_ir_fs.h
bool equals(fs_inst *inst) const;
bool is_send_from_grf() const;
- bool is_partial_write() const;
+ bool is_partial_reg_write() const;
+ bool is_partial_var_write(unsigned dispatch_width) const;
bool is_copy_payload(const brw::simple_allocator &grf_alloc) const;
unsigned components_read(unsigned i) const;
unsigned size_read(int arg) const;
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral Quiroga
2018-12-04 07:16:59 UTC
Permalink
Broadwell hardware has a bug that manifests in SIMD8 executions of
16-bit MAD instructions when any of the sources is a Y or W component.
We pack these components in the same SIMD register as components X and
Z respectively, but starting at offset 16B (so they live in the second
half of the register). The problem does not exist in SKL or later.

We work around this issue by moving any such sources to a temporary
starting at offset 0B. We want to do this after the main optimization loop
to prevent copy-propagation and friends from undoing the fix.
---
src/intel/compiler/brw_fs.cpp | 48 +++++++++++++++++++++++++++++++++++
src/intel/compiler/brw_fs.h | 1 +
2 files changed, 49 insertions(+)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index a9ddafc05d1..1d5d1dd0d22 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6451,6 +6451,48 @@ fs_visitor::optimize()
validate();
}

+/**
+ * Broadwell hardware has a bug that manifests in SIMD8 executions of 16-bit
+ * MAD instructions when any of the sources is a Y or W component. We pack
+ * these components in the same SIMD register as components X and Z
+ * respectively, but starting at offset 16B (so they live in the second half
+ * of the register).
+ *
+ * We work around this issue by moving any such sources to a temporary
+ * starting at offset 0B. We want to do this after the main optimization loop
+ * to prevent copy-propagation and friends from undoing the fix.
+ */
+void
+fs_visitor::fixup_hf_mad()
+{
+ if (devinfo->gen > 8)
+ return;
+
+ bool progress = false;
+
+ foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+ if (inst->opcode != BRW_OPCODE_MAD ||
+ inst->dst.type != BRW_REGISTER_TYPE_HF ||
+ inst->exec_size > 8)
+ continue;
+
+ for (int i = 0; i < 3; i++) {
+ if (inst->src[i].offset > 0) {
+ assert(inst->src[i].type == BRW_REGISTER_TYPE_HF);
+ const fs_builder ibld =
+ bld.at(block, inst).exec_all().group(inst->exec_size, 0);
+ fs_reg tmp = ibld.vgrf(inst->src[i].type);
+ ibld.MOV(tmp, inst->src[i]);
+ inst->src[i] = tmp;
+ progress = true;
+ }
+ }
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+}
+
/**
* Three source instruction must have a GRF/MRF destination register.
* ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
@@ -6609,6 +6651,7 @@ fs_visitor::run_vs()
assign_curb_setup();
assign_vs_urb_setup();

+ fixup_hf_mad();
fixup_3src_null_dest();
allocate_registers(8, true);

@@ -6693,6 +6736,7 @@ fs_visitor::run_tcs_single_patch()
assign_curb_setup();
assign_tcs_single_patch_urb_setup();

+ fixup_hf_mad();
fixup_3src_null_dest();
allocate_registers(8, true);

@@ -6727,6 +6771,7 @@ fs_visitor::run_tes()
assign_curb_setup();
assign_tes_urb_setup();

+ fixup_hf_mad();
fixup_3src_null_dest();
allocate_registers(8, true);

@@ -6776,6 +6821,7 @@ fs_visitor::run_gs()
assign_curb_setup();
assign_gs_urb_setup();

+ fixup_hf_mad();
fixup_3src_null_dest();
allocate_registers(8, true);

@@ -6876,6 +6922,7 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)

assign_urb_setup();

+ fixup_hf_mad();
fixup_3src_null_dest();
allocate_registers(8, allow_spilling);

@@ -6920,6 +6967,7 @@ fs_visitor::run_cs(unsigned min_dispatch_width)

assign_curb_setup();

+ fixup_hf_mad();
fixup_3src_null_dest();
allocate_registers(min_dispatch_width, true);

diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 163c0008820..f79f8554fb9 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -103,6 +103,7 @@ public:
void setup_vs_payload();
void setup_gs_payload();
void setup_cs_payload();
+ void fixup_hf_mad();
void fixup_3src_null_dest();
void assign_curb_setup();
void calculate_urb_setup();
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:17:08 UTC
Permalink
There are hardware restrictions that need to be considered.
---
src/intel/compiler/brw_fs_nir.cpp | 23 ++++++++++++++++++-----
1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index c1ba799d59c..4f815fef891 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -854,6 +854,22 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
case nir_op_i2i64:
case nir_op_u2f64:
case nir_op_u2u64:
+ /* SKL PRM, vol 02a, Command Reference: Instructions, Move:
+ *
+ * "There is no direct conversion from B/UB to DF or DF to B/UB. Use
+ * two instructions and a word or DWord intermediate type."
+ *
+ * "There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
+ * Use two instructions and a word or DWord intermediate integer
+ * type."
+ */
+ if (nir_src_bit_size(instr->src[0].src) == 8) {
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D, 1);
+ inst = bld.MOV(tmp, op[0]);
+ inst->saturate = instr->dest.saturate;
+ op[0] = tmp;
+ }
+
/* CHV PRM, vol07, 3D Media GPGPU Engine, Register Region Restrictions:
*
* "When source or destination is 64b (...), regioning in Align1
@@ -867,15 +883,12 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
* 64-bit need to have the source data elements aligned to 64-bit.
* This restriction does not apply to BDW and later.
*/
- if (nir_dest_bit_size(instr->dest.dest) == 64 &&
- nir_src_bit_size(instr->src[0].src) < 64 &&
+ if (type_sz(result.type) == 8 && type_sz(op[0].type) < 8 &&
(devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
fs_reg tmp = bld.vgrf(result.type, 1);
tmp = subscript(tmp, op[0].type, 0);
inst = bld.MOV(tmp, op[0]);
- inst = bld.MOV(result, tmp);
- inst->saturate = instr->dest.saturate;
- break;
+ op[0] = tmp;
}
inst = bld.MOV(result, op[0]);
inst->saturate = instr->dest.saturate;
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:17:23 UTC
Permalink
---
src/intel/compiler/brw_fs_copy_propagation.cpp | 17 ++++++++---------
1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp
index 0f0284115fb..3d0fe883324 100644
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@@ -762,16 +762,15 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)

case BRW_OPCODE_MAD:
case BRW_OPCODE_LRP:
- /* 3-src instructions can't take IMM registers, however, for 32-bit
- * floating instructions we rely on the combine constants pass to fix
- * it up. For anything else, we shouldn't be promoting immediates
- * until we can make the pass capable of combining constants of
- * different sizes.
+ /* 3-src instructions can't take IMM registers, but we allow this
+ * here in the hope that it can activate algebraic optimizations and
+ * then we rely on the combine constants pass to fix up the remaining
+ * cases.
*/
- if (val.type == BRW_REGISTER_TYPE_F) {
- inst->src[i] = val;
- progress = true;
- }
+ assert(val.type == BRW_REGISTER_TYPE_F ||
+ val.type == BRW_REGISTER_TYPE_HF);
+ inst->src[i] = val;
+ progress = true;
break;

default:
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:16:45 UTC
Permalink
Reviewed-by: Jason Ekstrand <***@jlekstrand.net>
---
src/intel/compiler/brw_compiler.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c
index f885e79c3e6..04a1a7cac4e 100644
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@@ -33,6 +33,7 @@
.lower_sub = true, \
.lower_fdiv = true, \
.lower_scmp = true, \
+ .lower_flrp16 = true, \
.lower_fmod16 = true, \
.lower_fmod32 = true, \
.lower_fmod64 = false, \
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:17:15 UTC
Permalink
---
src/intel/vulkan/anv_pipeline.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 5b583c28582..d55e51adcbb 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -147,6 +147,7 @@ anv_shader_compile_to_nir(struct anv_pipeline *pipeline,
.storage_16bit = device->instance->physicalDevice.info.gen >= 8,
.int16 = device->instance->physicalDevice.info.gen >= 8,
.float16 = device->instance->physicalDevice.info.gen >= 8,
+ .int8 = device->instance->physicalDevice.info.gen >= 8,
.shader_viewport_index_layer = true,
.subgroup_arithmetic = true,
.subgroup_basic = true,
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:17:10 UTC
Permalink
Section Register Region Restriction of the 3D Media GPGPU chapter states:

"Conversion between Integer and HF (Half Float) must be DWord
aligned and strided by a DWord on the destination."

The same restriction shows up in all hardware platforms that support
half-float, however, empirical testing suggests that only atom
platforms are affected.
---
src/intel/compiler/brw_fs_nir.cpp | 41 +++++++++++++++++++++++++++++--
1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 3f98c6a4474..db3a8812ae3 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -917,6 +917,25 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
inst->saturate = instr->dest.saturate;
break;
}
+
+ /* CHV PRM, 3D Media GPGPU Engine, Register Region Restrictions,
+ * Special Restrictions:
+ *
+ * "Conversion between Integer and HF (Half Float) must be DWord
+ * aligned and strided by a DWord on the destination."
+ *
+ * The same restriction is listed for other hardware platforms, however,
+ * empirical testing suggests that only atom platforms are affected.
+ */
+ if ((devinfo->is_cherryview || gen_device_info_is_9lp(devinfo)) &&
+ nir_dest_bit_size(instr->dest.dest) == 16) {
+ assert(result.type == BRW_REGISTER_TYPE_HF);
+ fs_reg tmp =
+ horiz_stride(retype(bld.vgrf(BRW_REGISTER_TYPE_F, 1), result.type), 2);
+ bld.MOV(tmp, op[0]);
+ op[0] = tmp;
+ }
+
inst = bld.MOV(result, op[0]);
inst->saturate = instr->dest.saturate;
break;
@@ -939,11 +958,29 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
}
/* Fallthrough */

+ case nir_op_f2i16:
+ case nir_op_f2u16:
+ /* CHV PRM, 3D Media GPGPU Engine, Register Region Restrictions,
+ * Special Restrictions:
+ *
+ * "Conversion between Integer and HF (Half Float) must be DWord
+ * aligned and strided by a DWord on the destination."
+ *
+ * The same restriction is listed for other hardware platforms, however,
+ * empirical testing suggests that only atom platforms are affected.
+ */
+ if ((devinfo->is_cherryview || gen_device_info_is_9lp(devinfo)) &&
+ nir_src_bit_size(instr->src[0].src) == 16) {
+ fs_reg tmp =
+ horiz_stride(retype(bld.vgrf(BRW_REGISTER_TYPE_D, 1), result.type), 2);
+ bld.MOV(tmp, op[0]);
+ op[0] = tmp;
+ }
+ /* Fallthrough */
+
case nir_op_f2f32:
case nir_op_f2i32:
case nir_op_f2u32:
- case nir_op_f2i16:
- case nir_op_f2u16:
case nir_op_i2i32:
case nir_op_u2u32:
case nir_op_i2i16:
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:16:51 UTC
Permalink
---
src/intel/compiler/brw_eu_emit.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 5f066d17a1f..2c9fc9a5c7c 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -755,7 +755,8 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
assert(dest.type == BRW_REGISTER_TYPE_F ||
dest.type == BRW_REGISTER_TYPE_DF ||
dest.type == BRW_REGISTER_TYPE_D ||
- dest.type == BRW_REGISTER_TYPE_UD);
+ dest.type == BRW_REGISTER_TYPE_UD ||
+ (dest.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 8));
if (devinfo->gen == 6) {
brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
dest.file == BRW_MESSAGE_REGISTER_FILE);
--
2.17.1
Pohjolainen, Topi
2018-12-05 13:13:00 UTC
Permalink
Post by Iago Toral Quiroga
---
src/intel/compiler/brw_eu_emit.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 5f066d17a1f..2c9fc9a5c7c 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -755,7 +755,8 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
assert(dest.type == BRW_REGISTER_TYPE_F ||
dest.type == BRW_REGISTER_TYPE_DF ||
dest.type == BRW_REGISTER_TYPE_D ||
- dest.type == BRW_REGISTER_TYPE_UD);
+ dest.type == BRW_REGISTER_TYPE_UD ||
+ (dest.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 8));
if (devinfo->gen == 6) {
brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
dest.file == BRW_MESSAGE_REGISTER_FILE);
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral Quiroga
2018-12-04 07:16:31 UTC
Permalink
---
src/compiler/spirv/vtn_glsl450.c | 29 +++++++++++++++++++----------
1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/src/compiler/spirv/vtn_glsl450.c b/src/compiler/spirv/vtn_glsl450.c
index 8bdef9db822..85851755aab 100644
--- a/src/compiler/spirv/vtn_glsl450.c
+++ b/src/compiler/spirv/vtn_glsl450.c
@@ -672,7 +672,7 @@ handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint,
case GLSLstd450Sinh:
/* 0.5 * (e^x - e^(-x)) */
val->ssa->def =
- nir_fmul(nb, nir_imm_float(nb, 0.5f),
+ nir_fmul(nb, nir_imm_floatN_t(nb, 0.5f, src[0]->bit_size),
nir_fsub(nb, build_exp(nb, src[0]),
build_exp(nb, nir_fneg(nb, src[0]))));
return;
@@ -680,7 +680,7 @@ handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint,
case GLSLstd450Cosh:
/* 0.5 * (e^x + e^(-x)) */
val->ssa->def =
- nir_fmul(nb, nir_imm_float(nb, 0.5f),
+ nir_fmul(nb, nir_imm_floatN_t(nb, 0.5f, src[0]->bit_size),
nir_fadd(nb, build_exp(nb, src[0]),
build_exp(nb, nir_fneg(nb, src[0]))));
return;
@@ -693,11 +693,20 @@ handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint,
* We clamp x to (-inf, +10] to avoid precision problems. When x > 10,
* e^2x is so much larger than 1.0 that 1.0 gets flushed to zero in the
* computation e^2x +/- 1 so it can be ignored.
+ *
+ * For 16-bit precision we clamp x to (-inf, +4.2] since the maximum
+ * representable number is only 65,504 and e^(2*x) exceeds that for
+ * x > ~5.5. Also, if x > 4.2, tanh(x) will return 1.0 in fp16.
*/
- nir_ssa_def *x = nir_fmin(nb, src[0], nir_imm_float(nb, 10));
- nir_ssa_def *exp2x = build_exp(nb, nir_fmul(nb, x, nir_imm_float(nb, 2)));
- val->ssa->def = nir_fdiv(nb, nir_fsub(nb, exp2x, nir_imm_float(nb, 1)),
- nir_fadd(nb, exp2x, nir_imm_float(nb, 1)));
+ const uint32_t bit_size = src[0]->bit_size;
+ const double clamped_x = bit_size > 16 ? 10.0 : 4.2;
+ nir_ssa_def *x = nir_fmin(nb, src[0],
+ nir_imm_floatN_t(nb, clamped_x, bit_size));
+ nir_ssa_def *one = nir_imm_floatN_t(nb, 1.0, bit_size);
+ nir_ssa_def *two = nir_imm_floatN_t(nb, 2.0, bit_size);
+ nir_ssa_def *exp2x = build_exp(nb, nir_fmul(nb, x, two));
+ val->ssa->def = nir_fdiv(nb, nir_fsub(nb, exp2x, one),
+ nir_fadd(nb, exp2x, one));
return;
}

@@ -705,16 +714,16 @@ handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint,
val->ssa->def = nir_fmul(nb, nir_fsign(nb, src[0]),
build_log(nb, nir_fadd(nb, nir_fabs(nb, src[0]),
nir_fsqrt(nb, nir_fadd(nb, nir_fmul(nb, src[0], src[0]),
- nir_imm_float(nb, 1.0f))))));
+ nir_imm_floatN_t(nb, 1.0f, src[0]->bit_size))))));
return;
case GLSLstd450Acosh:
val->ssa->def = build_log(nb, nir_fadd(nb, src[0],
nir_fsqrt(nb, nir_fsub(nb, nir_fmul(nb, src[0], src[0]),
- nir_imm_float(nb, 1.0f)))));
+ nir_imm_floatN_t(nb, 1.0f, src[0]->bit_size)))));
return;
case GLSLstd450Atanh: {
- nir_ssa_def *one = nir_imm_float(nb, 1.0);
- val->ssa->def = nir_fmul(nb, nir_imm_float(nb, 0.5f),
+ nir_ssa_def *one = nir_imm_floatN_t(nb, 1.0, src[0]->bit_size);
+ val->ssa->def = nir_fmul(nb, nir_imm_floatN_t(nb, 0.5f, src[0]->bit_size),
build_log(nb, nir_fdiv(nb, nir_fadd(nb, one, src[0]),
nir_fsub(nb, one, src[0]))));
return;
--
2.17.1
Jason Ekstrand
2018-12-07 15:21:58 UTC
Permalink
My comment earlier, I think, applies to all of the first 7. Let's just add
nir_fadd_imm and nir_fmul_imm and rewrite them to use those. That'll make
them handle doubles as well if we ever need it.
Post by Iago Toral Quiroga
---
src/compiler/spirv/vtn_glsl450.c | 29 +++++++++++++++++++----------
1 file changed, 19 insertions(+), 10 deletions(-)
diff --git a/src/compiler/spirv/vtn_glsl450.c
b/src/compiler/spirv/vtn_glsl450.c
index 8bdef9db822..85851755aab 100644
--- a/src/compiler/spirv/vtn_glsl450.c
+++ b/src/compiler/spirv/vtn_glsl450.c
@@ -672,7 +672,7 @@ handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint,
/* 0.5 * (e^x - e^(-x)) */
val->ssa->def =
- nir_fmul(nb, nir_imm_float(nb, 0.5f),
+ nir_fmul(nb, nir_imm_floatN_t(nb, 0.5f, src[0]->bit_size),
nir_fsub(nb, build_exp(nb, src[0]),
build_exp(nb, nir_fneg(nb, src[0]))));
return;
@@ -680,7 +680,7 @@ handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint,
/* 0.5 * (e^x + e^(-x)) */
val->ssa->def =
- nir_fmul(nb, nir_imm_float(nb, 0.5f),
+ nir_fmul(nb, nir_imm_floatN_t(nb, 0.5f, src[0]->bit_size),
nir_fadd(nb, build_exp(nb, src[0]),
build_exp(nb, nir_fneg(nb, src[0]))));
return;
@@ -693,11 +693,20 @@ handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint,
* We clamp x to (-inf, +10] to avoid precision problems. When x > 10,
* e^2x is so much larger than 1.0 that 1.0 gets flushed to zero in the
* computation e^2x +/- 1 so it can be ignored.
+ *
+ * For 16-bit precision we clamp x to (-inf, +4.2] since the maximum
+ * representable number is only 65,504 and e^(2*6) exceeds that. Also,
+ * if x > 4.2, tanh(x) will return 1.0 in fp16.
*/
- nir_ssa_def *x = nir_fmin(nb, src[0], nir_imm_float(nb, 10));
- nir_ssa_def *exp2x = build_exp(nb, nir_fmul(nb, x,
nir_imm_float(nb, 2)));
- val->ssa->def = nir_fdiv(nb, nir_fsub(nb, exp2x, nir_imm_float(nb, 1)),
- nir_fadd(nb, exp2x, nir_imm_float(nb, 1)));
+ const uint32_t bit_size = src[0]->bit_size;
+ const double clamped_x = bit_size > 16 ? 10.0 : 4.2;
+ nir_ssa_def *x = nir_fmin(nb, src[0],
+ nir_imm_floatN_t(nb, clamped_x, bit_size));
+ nir_ssa_def *one = nir_imm_floatN_t(nb, 1.0, bit_size);
+ nir_ssa_def *two = nir_imm_floatN_t(nb, 2.0, bit_size);
+ nir_ssa_def *exp2x = build_exp(nb, nir_fmul(nb, x, two));
+ val->ssa->def = nir_fdiv(nb, nir_fsub(nb, exp2x, one),
+ nir_fadd(nb, exp2x, one));
return;
}
@@ -705,16 +714,16 @@ handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint,
val->ssa->def = nir_fmul(nb, nir_fsign(nb, src[0]),
build_log(nb, nir_fadd(nb, nir_fabs(nb, src[0]),
nir_fsqrt(nb, nir_fadd(nb, nir_fmul(nb, src[0], src[0]),
- nir_imm_float(nb, 1.0f))))));
+ nir_imm_floatN_t(nb,
1.0f, src[0]->bit_size))))));
return;
val->ssa->def = build_log(nb, nir_fadd(nb, src[0],
nir_fsqrt(nb, nir_fsub(nb, nir_fmul(nb, src[0], src[0]),
- nir_imm_float(nb, 1.0f)))));
+ nir_imm_floatN_t(nb, 1.0f, src[0]->bit_size)))));
return;
case GLSLstd450Atanh: {
- nir_ssa_def *one = nir_imm_float(nb, 1.0);
- val->ssa->def = nir_fmul(nb, nir_imm_float(nb, 0.5f),
+ nir_ssa_def *one = nir_imm_floatN_t(nb, 1.0, src[0]->bit_size);
+ val->ssa->def = nir_fmul(nb, nir_imm_floatN_t(nb, 0.5f,
src[0]->bit_size),
build_log(nb, nir_fdiv(nb, nir_fadd(nb, one, src[0]),
nir_fsub(nb, one, src[0]))));
return;
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral Quiroga
2018-12-04 07:16:35 UTC
Permalink
From: Samuel Iglesias Gonsálvez <***@igalia.com>

It is not supported directly in the HW, we need to convert to a 32-bit
type first as intermediate step.

v2 (Iago): handle conversions from 64-bit integers as well

Signed-off-by: Samuel Iglesias Gonsálvez <***@igalia.com>
---
src/intel/compiler/brw_fs_nir.cpp | 42 ++++++++++++++++++++++++++++---
1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 7294f49ddc0..9f3d3bf9762 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -784,6 +784,19 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
*/

case nir_op_f2f16:
+ /* BDW PRM, vol02, Command Reference Instructions, mov - MOVE:
+ *
+ * "There is no direct conversion from HF to DF or DF to HF.
+ * Use two instructions and F (Float) as an intermediate type."
+ */
+ if (nir_src_bit_size(instr->src[0].src) == 64) {
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
+ inst = bld.MOV(tmp, op[0]);
+ inst->saturate = instr->dest.saturate;
+ inst = bld.MOV(result, tmp);
+ inst->saturate = instr->dest.saturate;
+ break;
+ }
inst = bld.MOV(result, op[0]);
inst->saturate = instr->dest.saturate;
break;
@@ -864,7 +877,32 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
inst->saturate = instr->dest.saturate;
break;
}
- /* fallthrough */
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_i2f16:
+ case nir_op_u2f16:
+ /* BDW PRM, vol02, Command Reference Instructions, mov - MOVE:
+ *
+ * "There is no direct conversion from HF to Q/UQ or Q/UQ to HF.
+ * Use two instructions and F (Float) or a word integer type or a
+ * DWord integer type as an intermediate type."
+ */
+ if (nir_src_bit_size(instr->src[0].src) == 64) {
+ brw_reg_type reg_type = instr->op == nir_op_i2f16 ?
+ BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD;
+ fs_reg tmp = bld.vgrf(reg_type, 1);
+ inst = bld.MOV(tmp, op[0]);
+ inst->saturate = instr->dest.saturate;
+ inst = bld.MOV(result, tmp);
+ inst->saturate = instr->dest.saturate;
+ break;
+ }
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
case nir_op_f2f32:
case nir_op_f2i32:
case nir_op_f2u32:
@@ -874,8 +912,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
case nir_op_u2u32:
case nir_op_i2i16:
case nir_op_u2u16:
- case nir_op_i2f16:
- case nir_op_u2f16:
case nir_op_i2i8:
case nir_op_u2u8:
inst = bld.MOV(result, op[0]);
--
2.17.1
Pohjolainen, Topi
2018-12-04 12:57:51 UTC
Permalink
Post by Iago Toral Quiroga
It is not supported directly in the HW, we need to convert to a 32-bit
type first as intermediate step.
v2 (Iago): handle conversions from 64-bit integers as well
---
src/intel/compiler/brw_fs_nir.cpp | 42 ++++++++++++++++++++++++++++---
1 file changed, 39 insertions(+), 3 deletions(-)
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 7294f49ddc0..9f3d3bf9762 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -784,6 +784,19 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
*/
+ *
+ * "There is no direct conversion from HF to DF or DF to HF.
+ * Use two instructions and F (Float) as an intermediate type.
+ */
+ if (nir_src_bit_size(instr->src[0].src) == 64) {
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
+ inst = bld.MOV(tmp, op[0]);
+ inst->saturate = instr->dest.saturate;
+ inst = bld.MOV(result, tmp);
+ inst->saturate = instr->dest.saturate;
+ break;
+ }
inst = bld.MOV(result, op[0]);
inst->saturate = instr->dest.saturate;
break;
@@ -864,7 +877,32 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
inst->saturate = instr->dest.saturate;
break;
}
- /* fallthrough */
This is more or less nit-picking but I thought I'd ask anyway. The fallthru
comment gets now dropped also for other cases than "i2f16" and "u2f16". And if
we added the logic for nir_op_i2f16/nir_op_u2f16 cases just after the f2f16
case that would yield a diff without the following three copy-paste lines as
well. Or am I missing something?
Post by Iago Toral Quiroga
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ *
+ * "There is no direct conversion from HF to Q/UQ or Q/UQ to HF.
+ * Use two instructions and F (Float) or a word integer type or a
+ * DWord integer type as an intermediate type."
+ */
+ if (nir_src_bit_size(instr->src[0].src) == 64) {
+ brw_reg_type reg_type = instr->op == nir_op_i2f16 ?
+ BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD;
+ fs_reg tmp = bld.vgrf(reg_type, 1);
+ inst = bld.MOV(tmp, op[0]);
+ inst->saturate = instr->dest.saturate;
+ inst = bld.MOV(result, tmp);
+ inst->saturate = instr->dest.saturate;
+ break;
+ }
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
@@ -874,8 +912,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
inst = bld.MOV(result, op[0]);
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral
2018-12-05 08:49:29 UTC
Permalink
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
It is not supported directly in the HW, we need to convert to a 32-
bit
type first as intermediate step.
v2 (Iago): handle conversions from 64-bit integers as well
---
src/intel/compiler/brw_fs_nir.cpp | 42
++++++++++++++++++++++++++++---
1 file changed, 39 insertions(+), 3 deletions(-)
diff --git a/src/intel/compiler/brw_fs_nir.cpp
b/src/intel/compiler/brw_fs_nir.cpp
index 7294f49ddc0..9f3d3bf9762 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -784,6 +784,19 @@ fs_visitor::nir_emit_alu(const fs_builder
&bld, nir_alu_instr *instr)
*/
+ *
+ * "There is no direct conversion from HF to DF or DF to HF.
+ * Use two instructions and F (Float) as an intermediate type.
+ */
+ if (nir_src_bit_size(instr->src[0].src) == 64) {
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
+ inst = bld.MOV(tmp, op[0]);
+ inst->saturate = instr->dest.saturate;
+ inst = bld.MOV(result, tmp);
+ inst->saturate = instr->dest.saturate;
+ break;
+ }
inst = bld.MOV(result, op[0]);
inst->saturate = instr->dest.saturate;
break;
@@ -864,7 +877,32 @@ fs_visitor::nir_emit_alu(const fs_builder
&bld, nir_alu_instr *instr)
inst->saturate = instr->dest.saturate;
break;
}
- /* fallthrough */
This is more or less nit-picking but I thought I ask anyway. The fallthru
comment gets now dropped also for other cases than "i2f16" and
"u2f16". And if
we added the logic for nir_op_i2f16/nir_op_u2f16 cases just after the f2f16
case that would yield a diff without the following three copy-paste lines as
well. Or amd I missing something?
Yes, I think you're right and if you look at this patch standalone I
think it would make sense to do that. The thing is that later on in the
series we have to change this further to incorporate more restrictions
for conversions to/from integer and half-float for atom platforms, so
having the f2f16 case separated from the {i,u}2f16 cases will make more
sense. That would be patch 46 in the series, which comes later because
that is when we addressed integer conversions from 8-bit and noticed
this whole thing on atom.

I can still make the change you suggest in this patch and then do the
split later on if you think that helps though. I could also try to move
the fix for atom earlier in the series, that will lead to conflicts and
I'd need to slightly rewrite other patches in the series to accomodate
to that, but it is certainly doable if you that makes the commit
history better.

Iago
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ *
+ * "There is no direct conversion from HF to Q/UQ or Q/UQ to HF.
+ * Use two instructions and F (Float) or a word integer type or a
+ * DWord integer type as an intermediate type."
+ */
+ if (nir_src_bit_size(instr->src[0].src) == 64) {
+ brw_reg_type reg_type = instr->op == nir_op_i2f16 ?
+ BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD;
+ fs_reg tmp = bld.vgrf(reg_type, 1);
+ inst = bld.MOV(tmp, op[0]);
+ inst->saturate = instr->dest.saturate;
+ inst = bld.MOV(result, tmp);
+ inst->saturate = instr->dest.saturate;
+ break;
+ }
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
@@ -874,8 +912,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld,
nir_alu_instr *instr)
inst = bld.MOV(result, op[0]);
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Pohjolainen, Topi
2018-12-05 09:08:23 UTC
Permalink
Post by Iago Toral
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
It is not supported directly in the HW, we need to convert to a 32-
bit
type first as intermediate step.
v2 (Iago): handle conversions from 64-bit integers as well
---
src/intel/compiler/brw_fs_nir.cpp | 42
++++++++++++++++++++++++++++---
1 file changed, 39 insertions(+), 3 deletions(-)
diff --git a/src/intel/compiler/brw_fs_nir.cpp
b/src/intel/compiler/brw_fs_nir.cpp
index 7294f49ddc0..9f3d3bf9762 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -784,6 +784,19 @@ fs_visitor::nir_emit_alu(const fs_builder
&bld, nir_alu_instr *instr)
*/
+ *
+ * "There is no direct conversion from HF to DF or DF to HF.
+ * Use two instructions and F (Float) as an intermediate type.
+ */
+ if (nir_src_bit_size(instr->src[0].src) == 64) {
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
+ inst = bld.MOV(tmp, op[0]);
+ inst->saturate = instr->dest.saturate;
+ inst = bld.MOV(result, tmp);
+ inst->saturate = instr->dest.saturate;
+ break;
+ }
inst = bld.MOV(result, op[0]);
inst->saturate = instr->dest.saturate;
break;
@@ -864,7 +877,32 @@ fs_visitor::nir_emit_alu(const fs_builder
&bld, nir_alu_instr *instr)
inst->saturate = instr->dest.saturate;
break;
}
- /* fallthrough */
This is more or less nit-picking but I thought I ask anyway. The fallthru
comment gets now dropped also for other cases than "i2f16" and "u2f16". And if
we added the logic for nir_op_i2f16/nir_op_u2f16 cases just after the f2f16
case that would yield a diff without the following three copy-paste lines as
well. Or amd I missing something?
Yes, I think you're right and if you look at this patch standalone I
think it would make sense to do that. The thing is that later on in the
series we have to change this further to incorporate more restrictions
for conversions to/from integer and half-float for atom platforms, so
having the f2f16 case separated from the {i,u}2f16 cases will make more
sense. That would be patch 46 in the series, which comes later because
that is when we addressed integer conversions from 8-bit and noticed
this whole thing on atom.
I can still make the change you suggest in this patch and then do the
split later on if you think that helps though. I could also try to move
the fix for atom earlier in the series, that will lead to conflicts and
I'd need to slightly rewrite other patches in the series to accomodate
to that, but it is certainly doable if you that makes the commit
history better.
I'm not sure if I understood correctly your answer but I didn't suggest to
merge f2f16 case with {i,u}2f16 cases. I thought that having:

case nir_op_f2f16:
...
break;

case nir_op_i2f16:
case nir_op_u2f16:
...
break;

case nir_op_b2i:
...


would have yielded smaller diff than:

case nir_op_f2f16:
...
break;

case nir_op_b2i:
...

case nir_op_u2u64:
...
/* fallthrough */

case nir_op_i2f16:
case nir_op_u2f16:
...
break;

case nir_op_f2f32
...
break;
Post by Iago Toral
Iago
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ *
+ * "There is no direct conversion from HF to Q/UQ or Q/UQ to HF.
+ * Use two instructions and F (Float) or a word integer type or a
+ * DWord integer type as an intermediate type."
+ */
+ if (nir_src_bit_size(instr->src[0].src) == 64) {
+ brw_reg_type reg_type = instr->op == nir_op_i2f16 ?
+ BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD;
+ fs_reg tmp = bld.vgrf(reg_type, 1);
+ inst = bld.MOV(tmp, op[0]);
+ inst->saturate = instr->dest.saturate;
+ inst = bld.MOV(result, tmp);
+ inst->saturate = instr->dest.saturate;
+ break;
+ }
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
@@ -874,8 +912,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld,
nir_alu_instr *instr)
inst = bld.MOV(result, op[0]);
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral
2018-12-05 09:56:06 UTC
Permalink
Post by Pohjolainen, Topi
Post by Iago Toral
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
It is not supported directly in the HW, we need to convert to a 32-
bit
type first as intermediate step.
v2 (Iago): handle conversions from 64-bit integers as well
---
src/intel/compiler/brw_fs_nir.cpp | 42
++++++++++++++++++++++++++++---
1 file changed, 39 insertions(+), 3 deletions(-)
diff --git a/src/intel/compiler/brw_fs_nir.cpp
b/src/intel/compiler/brw_fs_nir.cpp
index 7294f49ddc0..9f3d3bf9762 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -784,6 +784,19 @@ fs_visitor::nir_emit_alu(const fs_builder
&bld, nir_alu_instr *instr)
*/
+ *
+ * "There is no direct conversion from HF to DF or DF
to
HF.
+ * Use two instructions and F (Float) as an
intermediate
type.
+ */
+ if (nir_src_bit_size(instr->src[0].src) == 64) {
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
+ inst = bld.MOV(tmp, op[0]);
+ inst->saturate = instr->dest.saturate;
+ inst = bld.MOV(result, tmp);
+ inst->saturate = instr->dest.saturate;
+ break;
+ }
inst = bld.MOV(result, op[0]);
inst->saturate = instr->dest.saturate;
break;
@@ -864,7 +877,32 @@ fs_visitor::nir_emit_alu(const fs_builder
&bld, nir_alu_instr *instr)
inst->saturate = instr->dest.saturate;
break;
}
- /* fallthrough */
This is more or less nit-picking but I thought I ask anyway. The fallthru
comment gets now dropped also for other cases than "i2f16" and "u2f16". And if
we added the logic for nir_op_i2f16/nir_op_u2f16 cases just after
the
f2f16
case that would yield a diff without the following three copy-
paste
lines as
well. Or am I missing something?
Yes, I think you're right and if you look at this patch standalone I
think it would make sense to do that. The thing is that later on in the
series we have to change this further to incorporate more
restrictions
for conversions to/from integer and half-float for atom platforms, so
having the f2f16 case separated from the {i,u}2f16 cases will make more
sense. That would be patch 46 in the series, which comes later because
that is when we addressed integer conversions from 8-bit and
noticed
this whole thing on atom.
I can still make the change you suggest in this patch and then do the
split later on if you think that helps though. I could also try to move
the fix for atom earlier in the series, that will lead to conflicts and
I'd need to slightly rewrite other patches in the series to
accommodate
that, but it is certainly doable if you think that makes the commit
history better.
I'm not sure if I understood correctly your answer but I didn't suggest to
...
break;
...
break;
...
...
break;
...
...
/* fallthrough */
...
break;
case nir_op_f2f32
...
break;
Ah, yes, I see what you mean now. I guess this is very subjective in
the end but in general my preference was to separate the cases and try
to avoid too many fallthroughs and specially for large blocks of
opcodes, at least for the cases where the main benefit was to reuse
that 3-line block which is basically the MOV instruction that is going
to be there for all conversion cases. I found that as we added more
types (there is also 8-bit conversions coming up later in the series)
and some of these conversions come with additional restrictions for
specific platforms or source/destination types, the combination of
fallthroughs and conditionals started to be a real pain and made it more
difficult to immediately identify which cases were relevant to a
particular piece of code inside a particular block of case statements. We
actually had a couple of bugs during development about this, so I ended
up thinking that trying less hard to fall through was better.

With that being said, if you feel different about this I have no
problem switching to what you suggest.
Post by Pohjolainen, Topi
Post by Iago Toral
Iago
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ *
+ * "There is no direct conversion from HF to Q/UQ or
Q/UQ
to HF.
+ * Use two instructions and F (Float) or a word
integer
type or a
+ * DWord integer type as an intermediate type."
+ */
+ if (nir_src_bit_size(instr->src[0].src) == 64) {
+ brw_reg_type reg_type = instr->op == nir_op_i2f16 ?
+ BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD;
+ fs_reg tmp = bld.vgrf(reg_type, 1);
+ inst = bld.MOV(tmp, op[0]);
+ inst->saturate = instr->dest.saturate;
+ inst = bld.MOV(result, tmp);
+ inst->saturate = instr->dest.saturate;
+ break;
+ }
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
@@ -874,8 +912,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld,
nir_alu_instr *instr)
inst = bld.MOV(result, op[0]);
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Jason Ekstrand
2018-12-07 15:34:36 UTC
Permalink
Now I'm wondering even more about my previous question about just splitting
it into two instructions in NIR.
Post by Iago Toral Quiroga
It is not supported directly in the HW, we need to convert to a 32-bit
type first as intermediate step.
v2 (Iago): handle conversions from 64-bit integers as well
---
src/intel/compiler/brw_fs_nir.cpp | 42 ++++++++++++++++++++++++++++---
1 file changed, 39 insertions(+), 3 deletions(-)
diff --git a/src/intel/compiler/brw_fs_nir.cpp
b/src/intel/compiler/brw_fs_nir.cpp
index 7294f49ddc0..9f3d3bf9762 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -784,6 +784,19 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
*/
+ *
+ * "There is no direct conversion from HF to DF or DF to HF.
+ * Use two instructions and F (Float) as an intermediate type.
+ */
+ if (nir_src_bit_size(instr->src[0].src) == 64) {
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
+ inst = bld.MOV(tmp, op[0]);
+ inst->saturate = instr->dest.saturate;
+ inst = bld.MOV(result, tmp);
+ inst->saturate = instr->dest.saturate;
I don't think you need both saturates. Just the first one should be
sufficient as it clamps to [0, 1] which is representable in 16-bit float.
Post by Iago Toral Quiroga
+ break;
+ }
inst = bld.MOV(result, op[0]);
inst->saturate = instr->dest.saturate;
break;
@@ -864,7 +877,32 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
inst->saturate = instr->dest.saturate;
break;
}
- /* fallthrough */
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ *
+ * "There is no direct conversion from HF to Q/UQ or Q/UQ to HF.
+ * Use two instructions and F (Float) or a word integer type or a
+ * DWord integer type as an intermediate type."
+ */
+ if (nir_src_bit_size(instr->src[0].src) == 64) {
+ brw_reg_type reg_type = instr->op == nir_op_i2f16 ?
+ BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD;
+ fs_reg tmp = bld.vgrf(reg_type, 1);
+ inst = bld.MOV(tmp, op[0]);
+ inst->saturate = instr->dest.saturate;
+ inst = bld.MOV(result, tmp);
+ inst->saturate = instr->dest.saturate;
+ break;
+ }
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
@@ -874,8 +912,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
inst = bld.MOV(result, op[0]);
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral Quiroga
2018-12-04 07:17:14 UTC
Permalink
---
src/compiler/shader_info.h | 1 +
src/compiler/spirv/spirv_to_nir.c | 4 +++-
2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/compiler/shader_info.h b/src/compiler/shader_info.h
index 0a3cb37069c..e745cc15fc5 100644
--- a/src/compiler/shader_info.h
+++ b/src/compiler/shader_info.h
@@ -46,6 +46,7 @@ struct spirv_supported_capabilities {
bool storage_16bit;
bool int16;
bool float16;
+ bool int8;
bool shader_viewport_index_layer;
bool subgroup_arithmetic;
bool subgroup_ballot;
diff --git a/src/compiler/spirv/spirv_to_nir.c b/src/compiler/spirv/spirv_to_nir.c
index 6f6673c8fb1..47b11b6ddc3 100644
--- a/src/compiler/spirv/spirv_to_nir.c
+++ b/src/compiler/spirv/spirv_to_nir.c
@@ -3417,7 +3417,6 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode,
case SpvCapabilityFloat16Buffer:
case SpvCapabilityInt64Atomics:
case SpvCapabilityStorageImageMultisample:
- case SpvCapabilityInt8:
case SpvCapabilitySparseResidency:
case SpvCapabilityMinLod:
vtn_warn("Unsupported SPIR-V capability: %s",
@@ -3440,6 +3439,9 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode,
case SpvCapabilityInt16:
spv_check_supported(int16, cap);
break;
+ case SpvCapabilityInt8:
+ spv_check_supported(int8, cap);
+ break;

case SpvCapabilityTransformFeedback:
spv_check_supported(transform_feedback, cap);
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:16:54 UTC
Permalink
This optimization depends on two other optimization passes: the
constant propagation pass, which allows immediate propagation
on MAD/LRP instructions even though the hardware can't do it,
and the combine constants pass to fix this up afterwards for the
cases that we could not optimize here.

Also, the optimization can generate cases for MUL/ADD that we
should not find otherwise, which are then implemented building
on that assumption, so better documenting these is useful.
---
src/intel/compiler/brw_fs.cpp | 22 ++++++++++++++++++++++
1 file changed, 22 insertions(+)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 509c6febf38..a9ddafc05d1 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -2461,6 +2461,11 @@ fs_visitor::opt_algebraic()
}

if (inst->src[0].file == IMM) {
+ /* We produce these from the MAD optimization below, which
+ * should only be happening for 32-bit float because we
+ * prevent constant propagation to MAD sources for other
+ * bit-sizes.
+ */
assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
inst->opcode = BRW_OPCODE_MOV;
inst->src[0].f *= inst->src[1].f;
@@ -2482,6 +2487,11 @@ fs_visitor::opt_algebraic()
}

if (inst->src[0].file == IMM) {
+ /* We produce these from the MAD optimization below, which
+ * should only be happening for 32-bit float because we
+ * prevent constant propagation to MAD sources for other
+ * bit-sizes.
+ */
assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
inst->opcode = BRW_OPCODE_MOV;
inst->src[0].f += inst->src[1].f;
@@ -2565,6 +2575,11 @@ fs_visitor::opt_algebraic()
}
break;
case BRW_OPCODE_MAD:
+ /* ALign16 MAD can't do immediate sources, however we allow constant
+ * propagation to these instructions to enable these algebraic
+ * optimizations. For the cases that we can't optmize here, we
+ * rely on the combine constants pass to fix it up later.
+ */
if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
inst->opcode = BRW_OPCODE_MOV;
inst->src[1] = reg_undef;
@@ -2585,6 +2600,13 @@ fs_visitor::opt_algebraic()
inst->src[2] = reg_undef;
progress = true;
} else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
+ /* We should not be getting here for anything other than 32-bit
+ * float since we prevent constant-propagation to MAD instructions
+ * for everything else.
+ */
+ assert(inst->src[1].type == inst->src[2].type &&
+ inst->src[1].type == BRW_REGISTER_TYPE_F);
+
inst->opcode = BRW_OPCODE_ADD;
inst->src[1].f *= inst->src[2].f;
inst->src[2] = reg_undef;
--
2.17.1
Pohjolainen, Topi
2018-12-07 12:14:59 UTC
Permalink
Post by Iago Toral Quiroga
This optimization depends on two other optimization passes: the
constant propagation pass, which allows immediate propagation
on MAD/LRP instructions even though the hardware can't do it,
and the combine constants pass to fix this up afterwards for the
cases that we could not optimize here.
Also, the optimization can generate cases for MUL/ADD that we
should not find otherwise, which are then implemented building
on that assumption, so better documenting these is useful.
---
src/intel/compiler/brw_fs.cpp | 22 ++++++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 509c6febf38..a9ddafc05d1 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -2461,6 +2461,11 @@ fs_visitor::opt_algebraic()
}
if (inst->src[0].file == IMM) {
+ /* We produce these from the MAD optimization below, which
+ * should only be happening for 32-bit float because we
+ * prevent constant propagation to MAD sources for other
+ * bit-sizes.
+ */
assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
inst->opcode = BRW_OPCODE_MOV;
inst->src[0].f *= inst->src[1].f;
@@ -2482,6 +2487,11 @@ fs_visitor::opt_algebraic()
}
if (inst->src[0].file == IMM) {
+ /* We produce these from the MAD optimization below, which
+ * should only be happening for 32-bit float because we
+ * prevent constant propagation to MAD sources for other
+ * bit-sizes.
+ */
assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
inst->opcode = BRW_OPCODE_MOV;
inst->src[0].f += inst->src[1].f;
@@ -2565,6 +2575,11 @@ fs_visitor::opt_algebraic()
}
break;
+ /* ALign16 MAD can't do immediate sources, however we allow constant
+ * propagation to these instructions to enable these algebraic
+ * optimizations. For the cases that we can't optmize here, we
+ * rely on the combine constants pass to fix it up later.
+ */
if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
inst->opcode = BRW_OPCODE_MOV;
inst->src[1] = reg_undef;
@@ -2585,6 +2600,13 @@ fs_visitor::opt_algebraic()
inst->src[2] = reg_undef;
progress = true;
} else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
+ /* We should not be getting here for anything other than 32-bit
+ * float since we prevent constant-propagation to MAD instructions
+ * for everything else.
+ */
+ assert(inst->src[1].type == inst->src[2].type &&
+ inst->src[1].type == BRW_REGISTER_TYPE_F);
+
inst->opcode = BRW_OPCODE_ADD;
inst->src[1].f *= inst->src[2].f;
inst->src[2] = reg_undef;
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral Quiroga
2018-12-04 07:17:11 UTC
Permalink
The hardware only has two bits to specify the horizontal stride, so the
maximum horizontal stride we can use is 4. The pass calculates strides
based on the sizes of the types involved, and for conversions between
64-bit and 8-bit types that can lead to strides of 8.

The compiler should make sure that such conversions are handled in two
steps to avoid that situation. If we fail to do this properly, the
generated assembly will be invalid and validation will fail, but
asserting here makes debugging easier.
---
src/intel/compiler/brw_fs_lower_conversions.cpp | 7 +++++++
1 file changed, 7 insertions(+)

diff --git a/src/intel/compiler/brw_fs_lower_conversions.cpp b/src/intel/compiler/brw_fs_lower_conversions.cpp
index 145fb55f995..00781e824e8 100644
--- a/src/intel/compiler/brw_fs_lower_conversions.cpp
+++ b/src/intel/compiler/brw_fs_lower_conversions.cpp
@@ -90,6 +90,13 @@ fs_visitor::lower_conversions()
fs_reg temp = ibld.vgrf(get_exec_type(inst));
fs_reg strided_temp = subscript(temp, dst.type, 0);

+ /* Make sure we don't exceed hardware limits here. If we have code
+ * that hits this assertion it means that we need to split the
+ * instruction in two, using intermediary types (see for
+ * example nir_op_i2i8).
+ */
+ assert(strided_temp.stride <= 4);
+
assert(inst->size_written == inst->dst.component_size(inst->exec_size));
inst->dst = strided_temp;
inst->saturate = false;
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:16:37 UTC
Permalink
Now that this case only handles 64-bit destinations we can simplify
a bit the code.
---
src/intel/compiler/brw_fs_nir.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 6c765fc2661..3eba8a478f5 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -833,7 +833,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
* 64-bit need to have the source data elements aligned to 64-bit.
* This restriction does not apply to BDW and later.
*/
- if (type_sz(result.type) == 8 && type_sz(op[0].type) < 8 &&
+ if (type_sz(op[0].type) < 8 &&
(devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
fs_reg tmp = bld.vgrf(result.type, 1);
tmp = subscript(tmp, op[0].type, 0);
--
2.17.1
Jason Ekstrand
2018-12-07 15:39:22 UTC
Permalink
Post by Iago Toral Quiroga
Now that this case only handles 64-bit destinations we can simplify
a bit the code.
"the code a bit". Sorry, English is hard....
Post by Iago Toral Quiroga
---
src/intel/compiler/brw_fs_nir.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/intel/compiler/brw_fs_nir.cpp
b/src/intel/compiler/brw_fs_nir.cpp
index 6c765fc2661..3eba8a478f5 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -833,7 +833,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
* 64-bit need to have the source data elements aligned to 64-bit.
* This restriction does not apply to BDW and later.
*/
- if (type_sz(result.type) == 8 && type_sz(op[0].type) < 8 &&
+ if (type_sz(op[0].type) < 8 &&
(devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
fs_reg tmp = bld.vgrf(result.type, 1);
tmp = subscript(tmp, op[0].type, 0);
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral Quiroga
2018-12-04 07:16:47 UTC
Permalink
From the Skylake PRM, Extended Math Function:

"The execution size must be no more than 8 when half-floats
are used in source or destination operand."

Earlier generations do not support Extended Math with half-float.
---
src/intel/compiler/brw_fs.cpp | 30 +++++++++++++++++++++++-------
1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 43b920ae33d..509c6febf38 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -5386,18 +5386,34 @@ get_lowered_simd_width(const struct gen_device_info *devinfo,
case SHADER_OPCODE_EXP2:
case SHADER_OPCODE_LOG2:
case SHADER_OPCODE_SIN:
- case SHADER_OPCODE_COS:
+ case SHADER_OPCODE_COS: {
/* Unary extended math instructions are limited to SIMD8 on Gen4 and
* Gen6.
*/
- return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
- devinfo->gen == 5 || devinfo->is_g4x ? MIN2(16, inst->exec_size) :
- MIN2(8, inst->exec_size));
+ unsigned max_width =
+ (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
+ devinfo->gen == 5 || devinfo->is_g4x ? MIN2(16, inst->exec_size) :
+ MIN2(8, inst->exec_size));

- case SHADER_OPCODE_POW:
+ /* Extended Math Function is limited to SIMD8 with half-float */
+ if (inst->dst.type == BRW_REGISTER_TYPE_HF)
+ max_width = MIN2(max_width, 8);
+
+ return max_width;
+ }
+
+ case SHADER_OPCODE_POW: {
/* SIMD16 is only allowed on Gen7+. */
- return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
- MIN2(8, inst->exec_size));
+ unsigned max_width =
+ (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
+ MIN2(8, inst->exec_size));
+
+ /* Extended Math Function is limited to SIMD8 with half-float */
+ if (inst->dst.type == BRW_REGISTER_TYPE_HF)
+ max_width = MIN2(max_width, 8);
+
+ return max_width;
+ }

case SHADER_OPCODE_INT_QUOTIENT:
case SHADER_OPCODE_INT_REMAINDER:
--
2.17.1
Pohjolainen, Topi
2018-12-05 09:43:37 UTC
Permalink
Post by Iago Toral Quiroga
"The execution size must be no more than 8 when half-floats
are used in source or destination operand."
Earlier generations do not support Extended Math with half-float.
---
src/intel/compiler/brw_fs.cpp | 30 +++++++++++++++++++++++-------
1 file changed, 23 insertions(+), 7 deletions(-)
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 43b920ae33d..509c6febf38 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -5386,18 +5386,34 @@ get_lowered_simd_width(const struct gen_device_info *devinfo,
+ case SHADER_OPCODE_COS: {
/* Unary extended math instructions are limited to SIMD8 on Gen4 and
* Gen6.
*/
- MIN2(8, inst->exec_size));
+ unsigned max_width =
+ MIN2(8, inst->exec_size));
+ /* Extended Math Function is limited to SIMD8 with half-float */
+ if (inst->dst.type == BRW_REGISTER_TYPE_HF)
+ max_width = MIN2(max_width, 8);
+
+ return max_width;
+ }
+
+ case SHADER_OPCODE_POW: {
/* SIMD16 is only allowed on Gen7+. */
- MIN2(8, inst->exec_size));
+ unsigned max_width =
+ MIN2(8, inst->exec_size));
+
+ /* Extended Math Function is limited to SIMD8 with half-float */
+ if (inst->dst.type == BRW_REGISTER_TYPE_HF)
+ max_width = MIN2(max_width, 8);
+
+ return max_width;
+ }
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral Quiroga
2018-12-04 07:16:53 UTC
Permalink
3-src instructions don't support immediates, but since 36bc5f06dd22,
we allow them on MAD and LRP relying on the combine constants pass to
fix it up later. However, that pass is specialized for 32-bit float
immediates and can't handle HF constants at present, so this patch
ensures that copy-propagation only does this for 32-bit constants.
---
src/intel/compiler/brw_fs_copy_propagation.cpp | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp
index ab34b63748e..58d5080b4e9 100644
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@@ -741,8 +741,16 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)

case BRW_OPCODE_MAD:
case BRW_OPCODE_LRP:
- inst->src[i] = val;
- progress = true;
+ /* 3-src instructions can't take IMM registers, however, for 32-bit
+ * floating instructions we rely on the combine constants pass to fix
+ * it up. For anything else, we shouldn't be promoting immediates
+ * until we can make the pass capable of combining constants of
+ * different sizes.
+ */
+ if (val.type == BRW_REGISTER_TYPE_F) {
+ inst->src[i] = val;
+ progress = true;
+ }
break;

default:
--
2.17.1
Pohjolainen, Topi
2018-12-07 09:43:50 UTC
Permalink
Post by Iago Toral Quiroga
3-src instructions don't support immediates, but since 36bc5f06dd22,
we allow them on MAD and LRP relying on the combine constants pass to
fix it up later. However, that pass is specialized for 32-bit float
immediates and can't handle HF constants at present, so this patch
ensures that copy-propagation only does this for 32-bit constants.
---
src/intel/compiler/brw_fs_copy_propagation.cpp | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp
index ab34b63748e..58d5080b4e9 100644
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@@ -741,8 +741,16 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
- inst->src[i] = val;
- progress = true;
+ /* 3-src instructions can't take IMM registers, however, for 32-bit
+ * floating instructions we rely on the combine constants pass to fix
+ * it up. For anything else, we shouldn't be promoting immediates
+ * until we can make the pass capable of combining constants of
+ * different sizes.
+ */
+ if (val.type == BRW_REGISTER_TYPE_F) {
+ inst->src[i] = val;
+ progress = true;
+ }
break;
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral Quiroga
2018-12-04 07:16:39 UTC
Permalink
Extended math doesn't support half-float on these generations.
---
src/intel/compiler/brw_nir.c | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index e0027f5179c..0b3094724c4 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -614,6 +614,8 @@ lower_bit_size_callback(const nir_alu_instr *alu, UNUSED void *data)
if (alu->dest.dest.ssa.bit_size != 16)
return 0;

+ const struct brw_compiler *compiler = (const struct brw_compiler *) data;
+
switch (alu->op) {
case nir_op_idiv:
case nir_op_imod:
@@ -626,6 +628,15 @@ lower_bit_size_callback(const nir_alu_instr *alu, UNUSED void *data)
case nir_op_fround_even:
case nir_op_ftrunc:
return 32;
+ case nir_op_frcp:
+ case nir_op_frsq:
+ case nir_op_fsqrt:
+ case nir_op_fpow:
+ case nir_op_fexp2:
+ case nir_op_flog2:
+ case nir_op_fsin:
+ case nir_op_fcos:
+ return compiler->devinfo->gen < 9 ? 32 : 0;
default:
return 0;
}
@@ -692,7 +703,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir)
OPT(nir_opt_large_constants, NULL, 32);
}

- OPT(nir_lower_bit_size, lower_bit_size_callback, NULL);
+ OPT(nir_lower_bit_size, lower_bit_size_callback, (void *)compiler);

if (is_scalar) {
OPT(nir_lower_load_const_to_scalar);
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:17:22 UTC
Permalink
---
src/intel/compiler/brw_fs.cpp | 46 +++++++++++++++++++++++++----------
1 file changed, 33 insertions(+), 13 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 9ea67975e1e..32e0817ce02 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -2481,13 +2481,19 @@ fs_visitor::opt_algebraic()

if (inst->src[0].file == IMM) {
/* We produce these from the MAD optimization below, which
- * should only be happening for 32-bit float because we
- * prevent constant propagation to MAD sources for other
- * bit-sizes.
+ * should only be happening for 16/32-bit float
*/
- assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
+ assert(inst->src[0].type == BRW_REGISTER_TYPE_F ||
+ inst->src[0].type == BRW_REGISTER_TYPE_HF);
inst->opcode = BRW_OPCODE_MOV;
- inst->src[0].f *= inst->src[1].f;
+ if (inst->src[0].type == BRW_REGISTER_TYPE_F) {
+ inst->src[0].f *= inst->src[1].f;
+ } else {
+ float val1_f = _mesa_half_to_float(inst->src[0].d & 0xffff);
+ float val2_f = _mesa_half_to_float(inst->src[1].d & 0xffff);
+ uint16_t res_hf = _mesa_float_to_half(val1_f * val2_f);
+ inst->src[0] = retype(brw_imm_uw(res_hf), BRW_REGISTER_TYPE_HF);
+ }
inst->src[1] = reg_undef;
progress = true;
break;
@@ -2507,13 +2513,19 @@ fs_visitor::opt_algebraic()

if (inst->src[0].file == IMM) {
/* We produce these from the MAD optimization below, which
- * should only be happening for 32-bit float because we
- * prevent constant propagation to MAD sources for other
- * bit-sizes.
+ * should only be happening for 16/32-bit float
*/
- assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
+ assert(inst->src[0].type == BRW_REGISTER_TYPE_F ||
+ inst->src[0].type == BRW_REGISTER_TYPE_HF);
inst->opcode = BRW_OPCODE_MOV;
- inst->src[0].f += inst->src[1].f;
+ if (inst->src[0].type == BRW_REGISTER_TYPE_F) {
+ inst->src[0].f += inst->src[1].f;
+ } else {
+ float val1_f = _mesa_half_to_float(inst->src[0].d & 0xffff);
+ float val2_f = _mesa_half_to_float(inst->src[1].d & 0xffff);
+ uint16_t res_hf = _mesa_float_to_half(val1_f + val2_f);
+ inst->src[0] = retype(brw_imm_uw(res_hf), BRW_REGISTER_TYPE_HF);
+ }
inst->src[1] = reg_undef;
progress = true;
break;
@@ -2619,15 +2631,23 @@ fs_visitor::opt_algebraic()
inst->src[2] = reg_undef;
progress = true;
} else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
- /* We should not be getting here for anything other than 32-bit
+ /* We should not be getting here for anything other than 16/32-bit
* float since we prevent constant-propagation to MAD instructions
* for everything else.
*/
assert(inst->src[1].type == inst->src[2].type &&
- inst->src[1].type == BRW_REGISTER_TYPE_F);
+ (inst->src[1].type == BRW_REGISTER_TYPE_F ||
+ inst->src[1].type == BRW_REGISTER_TYPE_HF));

inst->opcode = BRW_OPCODE_ADD;
- inst->src[1].f *= inst->src[2].f;
+ if (inst->src[1].type == BRW_REGISTER_TYPE_F) {
+ inst->src[1].f *= inst->src[2].f;
+ } else {
+ float val1_f = _mesa_half_to_float(inst->src[1].d & 0xffff);
+ float val2_f = _mesa_half_to_float(inst->src[2].d & 0xffff);
+ uint16_t res_hf = _mesa_float_to_half(val1_f * val2_f);
+ inst->src[1] = retype(brw_imm_uw(res_hf), BRW_REGISTER_TYPE_HF);
+ }
inst->src[2] = reg_undef;
progress = true;
}
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:16:36 UTC
Permalink
Since we handle booleans as integers this makes more sense.
---
src/intel/compiler/brw_fs_nir.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 9f3d3bf9762..6c765fc2661 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -801,11 +801,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
inst->saturate = instr->dest.saturate;
break;

- case nir_op_b2i:
- case nir_op_b2f:
- op[0].type = BRW_REGISTER_TYPE_D;
- op[0].negate = !op[0].negate;
- /* fallthrough */
case nir_op_f2f64:
case nir_op_f2i64:
case nir_op_f2u64:
@@ -850,6 +845,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
inst->saturate = instr->dest.saturate;
break;

+ case nir_op_b2i:
+ case nir_op_b2f:
+ op[0].type = BRW_REGISTER_TYPE_D;
+ op[0].negate = !op[0].negate;
+ /* fallthrough */
case nir_op_i2f64:
case nir_op_i2i64:
case nir_op_u2f64:
--
2.17.1
Pohjolainen, Topi
2018-12-04 16:16:34 UTC
Permalink
Post by Iago Toral Quiroga
Since we handle booleans as integers this makes more sense.
If this is applied before patch 10, can we merge 10 and 13?
Post by Iago Toral Quiroga
---
src/intel/compiler/brw_fs_nir.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 9f3d3bf9762..6c765fc2661 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -801,11 +801,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
inst->saturate = instr->dest.saturate;
break;
- op[0].type = BRW_REGISTER_TYPE_D;
- op[0].negate = !op[0].negate;
- /* fallthrough */
@@ -850,6 +845,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
inst->saturate = instr->dest.saturate;
break;
+ op[0].type = BRW_REGISTER_TYPE_D;
+ op[0].negate = !op[0].negate;
+ /* fallthrough */
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral
2018-12-05 10:23:06 UTC
Permalink
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
Since we handle booleans as integers this makes more sense.
If this is applied before patch 10, can we merge 10 and 13?
We can't apply this before patch 10 because patch 10 is the one that
splits the f2f64 and {i,u}2f64 opcodes. However, we could merge this and
patch 13 into patch 10 if that looks better to you.

Iago
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
---
src/intel/compiler/brw_fs_nir.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/src/intel/compiler/brw_fs_nir.cpp
b/src/intel/compiler/brw_fs_nir.cpp
index 9f3d3bf9762..6c765fc2661 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -801,11 +801,6 @@ fs_visitor::nir_emit_alu(const fs_builder
&bld, nir_alu_instr *instr)
inst->saturate = instr->dest.saturate;
break;
- op[0].type = BRW_REGISTER_TYPE_D;
- op[0].negate = !op[0].negate;
- /* fallthrough */
@@ -850,6 +845,11 @@ fs_visitor::nir_emit_alu(const fs_builder
&bld, nir_alu_instr *instr)
inst->saturate = instr->dest.saturate;
break;
+ op[0].type = BRW_REGISTER_TYPE_D;
+ op[0].negate = !op[0].negate;
+ /* fallthrough */
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Pohjolainen, Topi
2018-12-05 11:25:14 UTC
Permalink
Post by Iago Toral
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
Since we handle booleans as integers this makes more sense.
If this is applied before patch 10, can we merge 10 and 13?
We can't apply this before patch 10 because patch 10 is the one that
splits the f264 and {i,u}264 opcodes. However, we could merge this and
patch 13 into patch 10 if that looks better to you.
What you have is just fine. I just didn't see all the corners involved.
Post by Iago Toral
Iago
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
---
src/intel/compiler/brw_fs_nir.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/src/intel/compiler/brw_fs_nir.cpp
b/src/intel/compiler/brw_fs_nir.cpp
index 9f3d3bf9762..6c765fc2661 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -801,11 +801,6 @@ fs_visitor::nir_emit_alu(const fs_builder
&bld, nir_alu_instr *instr)
inst->saturate = instr->dest.saturate;
break;
- op[0].type = BRW_REGISTER_TYPE_D;
- op[0].negate = !op[0].negate;
- /* fallthrough */
@@ -850,6 +845,11 @@ fs_visitor::nir_emit_alu(const fs_builder
&bld, nir_alu_instr *instr)
inst->saturate = instr->dest.saturate;
break;
+ op[0].type = BRW_REGISTER_TYPE_D;
+ op[0].negate = !op[0].negate;
+ /* fallthrough */
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Jason Ekstrand
2018-12-07 15:37:13 UTC
Permalink
This'll have to be rebased on dca6cd9ce6510 but otherwise looks fine. I've
been a bit annoyed by this myself.

Reviewed-by: Jason Ekstrand <***@jlekstrand.net>

Incidentally, this could also be lowered in NIR.... Not sure if we want to
but there it is.
Post by Iago Toral Quiroga
Since we handle booleans as integers this makes more sense.
---
src/intel/compiler/brw_fs_nir.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/src/intel/compiler/brw_fs_nir.cpp
b/src/intel/compiler/brw_fs_nir.cpp
index 9f3d3bf9762..6c765fc2661 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -801,11 +801,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
inst->saturate = instr->dest.saturate;
break;
- op[0].type = BRW_REGISTER_TYPE_D;
- op[0].negate = !op[0].negate;
- /* fallthrough */
@@ -850,6 +845,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
inst->saturate = instr->dest.saturate;
break;
+ op[0].type = BRW_REGISTER_TYPE_D;
+ op[0].negate = !op[0].negate;
+ /* fallthrough */
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral Quiroga
2018-12-04 07:16:49 UTC
Permalink
This is available since gen8.
---
src/intel/compiler/brw_reg_type.c | 35 +++++++++++++++++++++++++++----
1 file changed, 31 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_reg_type.c b/src/intel/compiler/brw_reg_type.c
index 60240ba1513..72295a2bd75 100644
--- a/src/intel/compiler/brw_reg_type.c
+++ b/src/intel/compiler/brw_reg_type.c
@@ -138,6 +138,7 @@ enum hw_3src_reg_type {
GEN7_3SRC_TYPE_D = 1,
GEN7_3SRC_TYPE_UD = 2,
GEN7_3SRC_TYPE_DF = 3,
+ GEN8_3SRC_TYPE_HF = 4,

/** When ExecutionDatatype is 1: @{ */
GEN10_ALIGN1_3SRC_REG_TYPE_HF = 0b000,
@@ -166,6 +167,14 @@ static const struct hw_3src_type {
[BRW_REGISTER_TYPE_D] = { GEN7_3SRC_TYPE_D },
[BRW_REGISTER_TYPE_UD] = { GEN7_3SRC_TYPE_UD },
[BRW_REGISTER_TYPE_DF] = { GEN7_3SRC_TYPE_DF },
+}, gen8_hw_3src_type[] = {
+ [0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
+
+ [BRW_REGISTER_TYPE_F] = { GEN7_3SRC_TYPE_F },
+ [BRW_REGISTER_TYPE_D] = { GEN7_3SRC_TYPE_D },
+ [BRW_REGISTER_TYPE_UD] = { GEN7_3SRC_TYPE_UD },
+ [BRW_REGISTER_TYPE_DF] = { GEN7_3SRC_TYPE_DF },
+ [BRW_REGISTER_TYPE_HF] = { GEN8_3SRC_TYPE_HF },
}, gen10_hw_3src_align1_type[] = {
#define E(x) BRW_ALIGN1_3SRC_EXEC_TYPE_##x
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
@@ -249,6 +258,20 @@ brw_hw_type_to_reg_type(const struct gen_device_info *devinfo,
unreachable("not reached");
}

+static inline const struct hw_3src_type *
+get_hw_3src_type_map(const struct gen_device_info *devinfo, uint32_t *size)
+{
+ if (devinfo->gen < 8) {
+ if (size)
+ *size = ARRAY_SIZE(gen7_hw_3src_type);
+ return gen7_hw_3src_type;
+ } else {
+ if (size)
+ *size = ARRAY_SIZE(gen8_hw_3src_type);
+ return gen8_hw_3src_type;
+ }
+}
+
/**
* Convert a brw_reg_type enumeration value into the hardware representation
* for a 3-src align16 instruction
@@ -257,9 +280,11 @@ unsigned
brw_reg_type_to_a16_hw_3src_type(const struct gen_device_info *devinfo,
enum brw_reg_type type)
{
- assert(type < ARRAY_SIZE(gen7_hw_3src_type));
- assert(gen7_hw_3src_type[type].reg_type != (enum hw_3src_reg_type)INVALID);
- return gen7_hw_3src_type[type].reg_type;
+ uint32_t map_size;
+ const struct hw_3src_type *hw_3src_type_map =
+ get_hw_3src_type_map(devinfo, &map_size);
+ assert(hw_3src_type_map[type].reg_type != (enum hw_3src_reg_type)INVALID);
+ return hw_3src_type_map[type].reg_type;
}

/**
@@ -283,8 +308,10 @@ enum brw_reg_type
brw_a16_hw_3src_type_to_reg_type(const struct gen_device_info *devinfo,
unsigned hw_type)
{
+ const struct hw_3src_type *hw_3src_type_map =
+ get_hw_3src_type_map(devinfo, NULL);
for (enum brw_reg_type i = 0; i <= BRW_REGISTER_TYPE_LAST; i++) {
- if (gen7_hw_3src_type[i].reg_type == hw_type) {
+ if (hw_3src_type_map[i].reg_type == hw_type) {
return i;
}
}
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:17:13 UTC
Permalink
The hardware only allows a stride of 1 on a Byte destination for raw
byte MOV instructions. This is required even when the destination
is the NULL register.

Rather than making sure that we emit a proper NULL:B destination
every time we need one, just fix it at emission time.
---
src/intel/compiler/brw_eu_emit.c | 11 +++++++++++
1 file changed, 11 insertions(+)

diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 66edfb43baf..eef36705c7b 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -94,6 +94,17 @@ brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
assert(dest.nr < 128);

+ /* The hardware has a restriction where if the destination is Byte,
+ * the instruction needs to have a stride of 2 (except for packed byte
+ * MOV). This seems to be required even if the destination is the NULL
+ * register.
+ */
+ if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+ dest.nr == BRW_ARF_NULL &&
+ type_sz(dest.type) == 1) {
+ dest.hstride = BRW_HORIZONTAL_STRIDE_2;
+ }
+
gen7_convert_mrf_to_grf(p, &dest);

brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:16:25 UTC
Permalink
Reviewed-by: Jason Ekstrand <***@jlekstrand.net>
---
src/compiler/nir/nir_builtin_builder.h | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/compiler/nir/nir_builtin_builder.h b/src/compiler/nir/nir_builtin_builder.h
index 0e5b9db462a..1f5a1a12533 100644
--- a/src/compiler/nir/nir_builtin_builder.h
+++ b/src/compiler/nir/nir_builtin_builder.h
@@ -60,7 +60,9 @@ nir_uclamp(nir_builder *b,
static inline nir_ssa_def *
nir_degrees(nir_builder *b, nir_ssa_def *val)
{
- return nir_fmul(b, val, nir_imm_float(b, 57.2957795131));
+ return nir_fmul(b, val,
+ nir_imm_floatN_t(b, 57.2957795131,
+ val->bit_size));
}

static inline nir_ssa_def *
@@ -78,7 +80,9 @@ nir_fast_normalize(nir_builder *b, nir_ssa_def *vec)
static inline nir_ssa_def *
nir_radians(nir_builder *b, nir_ssa_def *val)
{
- return nir_fmul(b, val, nir_imm_float(b, 0.01745329251));
+ return nir_fmul(b, val,
+ nir_imm_floatN_t(b, 0.01745329251,
+ val->bit_size));
}

#endif /* NIR_BUILTIN_BUILDER_H */
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:17:16 UTC
Permalink
---
src/intel/vulkan/anv_device.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index caf25ad8a03..17b73c115cd 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -971,7 +971,7 @@ void anv_GetPhysicalDeviceFeatures2(
ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);

features->shaderFloat16 = pdevice->info.gen >= 8;
- features->shaderInt8 = false;
+ features->shaderInt8 = pdevice->info.gen >= 8;
break;
}
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:17:01 UTC
Permalink
---
src/intel/vulkan/anv_pipeline.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index ee37685fccf..5b583c28582 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -146,6 +146,7 @@ anv_shader_compile_to_nir(struct anv_pipeline *pipeline,
.variable_pointers = true,
.storage_16bit = device->instance->physicalDevice.info.gen >= 8,
.int16 = device->instance->physicalDevice.info.gen >= 8,
+ .float16 = device->instance->physicalDevice.info.gen >= 8,
.shader_viewport_index_layer = true,
.subgroup_arithmetic = true,
.subgroup_basic = true,
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:16:32 UTC
Permalink
---
src/compiler/spirv/vtn_glsl450.c | 48 ++++++++++++++++++++++++++++++--
1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/src/compiler/spirv/vtn_glsl450.c b/src/compiler/spirv/vtn_glsl450.c
index 85851755aab..bb340c87416 100644
--- a/src/compiler/spirv/vtn_glsl450.c
+++ b/src/compiler/spirv/vtn_glsl450.c
@@ -389,6 +389,45 @@ build_atan2(nir_builder *b, nir_ssa_def *y, nir_ssa_def *x)
nir_fneg(b, arc), arc);
}

+static nir_ssa_def *
+build_frexp16(nir_builder *b, nir_ssa_def *x, nir_ssa_def **exponent)
+{
+ assert(x->bit_size == 16);
+
+ nir_ssa_def *abs_x = nir_fabs(b, x);
+ nir_ssa_def *zero = nir_imm_floatN_t(b, 0, 16);
+
+ /* Half-precision floating-point values are stored as
+ * 1 sign bit;
+ * 5 exponent bits;
+ * 10 mantissa bits.
+ *
+ * An exponent shift of 10 will shift the mantissa out, leaving only the
+ * exponent and sign bit (which itself may be zero, if the absolute value
+ * was taken before the bitcast and shift).
+ */
+ nir_ssa_def *exponent_shift = nir_imm_int(b, 10);
+ nir_ssa_def *exponent_bias = nir_imm_intN_t(b, -14, 16);
+
+ nir_ssa_def *sign_mantissa_mask = nir_imm_intN_t(b, 0x83ffu, 16);
+
+ /* Exponent of floating-point values in the range [0.5, 1.0). */
+ nir_ssa_def *exponent_value = nir_imm_intN_t(b, 0x3800u, 16);
+
+ nir_ssa_def *is_not_zero = nir_fne(b, abs_x, zero);
+
+ /* Significand return must be of the same type as the input, but the
+ * exponent must be a 32-bit integer.
+ */
+ *exponent =
+ nir_i2i32(b,
+ nir_iadd(b, nir_ushr(b, abs_x, exponent_shift),
+ nir_bcsel(b, is_not_zero, exponent_bias, zero)));
+
+ return nir_ior(b, nir_iand(b, x, sign_mantissa_mask),
+ nir_bcsel(b, is_not_zero, exponent_value, zero));
+}
+
static nir_ssa_def *
build_frexp32(nir_builder *b, nir_ssa_def *x, nir_ssa_def **exponent)
{
@@ -751,8 +790,10 @@ handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint,
nir_ssa_def *exponent;
if (src[0]->bit_size == 64)
val->ssa->def = build_frexp64(nb, src[0], &exponent);
- else
+ else if (src[0]->bit_size == 32)
val->ssa->def = build_frexp32(nb, src[0], &exponent);
+ else
+ val->ssa->def = build_frexp16(nb, src[0], &exponent);
nir_store_deref(nb, vtn_nir_deref(b, w[6]), exponent, 0xf);
return;
}
@@ -762,9 +803,12 @@ handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint,
if (src[0]->bit_size == 64)
val->ssa->elems[0]->def = build_frexp64(nb, src[0],
&val->ssa->elems[1]->def);
- else
+ else if (src[0]->bit_size == 32)
val->ssa->elems[0]->def = build_frexp32(nb, src[0],
&val->ssa->elems[1]->def);
+ else
+ val->ssa->elems[0]->def = build_frexp16(nb, src[0],
+ &val->ssa->elems[1]->def);
return;
}
--
2.17.1
Jason Ekstrand
2018-12-07 15:20:45 UTC
Permalink
Looks the same as what we do for the others.
Post by Iago Toral Quiroga
---
src/compiler/spirv/vtn_glsl450.c | 48 ++++++++++++++++++++++++++++++--
1 file changed, 46 insertions(+), 2 deletions(-)
diff --git a/src/compiler/spirv/vtn_glsl450.c
b/src/compiler/spirv/vtn_glsl450.c
index 85851755aab..bb340c87416 100644
--- a/src/compiler/spirv/vtn_glsl450.c
+++ b/src/compiler/spirv/vtn_glsl450.c
@@ -389,6 +389,45 @@ build_atan2(nir_builder *b, nir_ssa_def *y, nir_ssa_def *x)
nir_fneg(b, arc), arc);
}
+static nir_ssa_def *
+build_frexp16(nir_builder *b, nir_ssa_def *x, nir_ssa_def **exponent)
+{
+ assert(x->bit_size == 16);
+
+ nir_ssa_def *abs_x = nir_fabs(b, x);
+ nir_ssa_def *zero = nir_imm_floatN_t(b, 0, 16);
+
+ /* Half-precision floating-point values are stored as
+ * 1 sign bit;
+ * 5 exponent bits;
+ * 10 mantissa bits.
+ *
+ * An exponent shift of 10 will shift the mantissa out, leaving only the
+ * exponent and sign bit (which itself may be zero, if the absolute value
+ * was taken before the bitcast and shift).
+ */
+ nir_ssa_def *exponent_shift = nir_imm_int(b, 10);
+ nir_ssa_def *exponent_bias = nir_imm_intN_t(b, -14, 16);
+
+ nir_ssa_def *sign_mantissa_mask = nir_imm_intN_t(b, 0x83ffu, 16);
+
+ /* Exponent of floating-point values in the range [0.5, 1.0). */
+ nir_ssa_def *exponent_value = nir_imm_intN_t(b, 0x3800u, 16);
+
+ nir_ssa_def *is_not_zero = nir_fne(b, abs_x, zero);
+
+ /* Significand return must be of the same type as the input, but the
+ * exponent must be a 32-bit integer.
+ */
+ *exponent =
+ nir_i2i32(b,
+ nir_iadd(b, nir_ushr(b, abs_x, exponent_shift),
+ nir_bcsel(b, is_not_zero, exponent_bias, zero)));
+
+ return nir_ior(b, nir_iand(b, x, sign_mantissa_mask),
+ nir_bcsel(b, is_not_zero, exponent_value, zero));
+}
+
static nir_ssa_def *
build_frexp32(nir_builder *b, nir_ssa_def *x, nir_ssa_def **exponent)
{
@@ -751,8 +790,10 @@ handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint,
nir_ssa_def *exponent;
if (src[0]->bit_size == 64)
val->ssa->def = build_frexp64(nb, src[0], &exponent);
- else
+ else if (src[0]->bit_size == 32)
val->ssa->def = build_frexp32(nb, src[0], &exponent);
+ else
+ val->ssa->def = build_frexp16(nb, src[0], &exponent);
nir_store_deref(nb, vtn_nir_deref(b, w[6]), exponent, 0xf);
return;
}
@@ -762,9 +803,12 @@ handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint,
if (src[0]->bit_size == 64)
val->ssa->elems[0]->def = build_frexp64(nb, src[0],
&val->ssa->elems[1]->def);
- else
+ else if (src[0]->bit_size == 32)
val->ssa->elems[0]->def = build_frexp32(nb, src[0],
&val->ssa->elems[1]->def);
+ else
+ val->ssa->elems[0]->def = build_frexp16(nb, src[0],
+
&val->ssa->elems[1]->def);
return;
}
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral Quiroga
2018-12-04 07:16:48 UTC
Permalink
The original SrcType is a 3-bit field that takes a subset of the types
supported for the hardware for 3-source instructions. Since gen8,
when the half-float type was added, 3-source floating point operations
can use mixed precision mode, where not all the operands have the
same floating-point precision. While the precision for the first operand
is taken from the type in SrcType, the bits in Src1Type (bit 36) and
Src2Type (bit 35) define the precision for the other operands
(0: normal precision, 1: half precision).
---
src/intel/compiler/brw_inst.h | 2 ++
1 file changed, 2 insertions(+)

diff --git a/src/intel/compiler/brw_inst.h b/src/intel/compiler/brw_inst.h
index ce89bbba72f..c45697eaa3a 100644
--- a/src/intel/compiler/brw_inst.h
+++ b/src/intel/compiler/brw_inst.h
@@ -222,6 +222,8 @@ F8(3src_src1_negate, 39, 39, 40, 40)
F8(3src_src1_abs, 38, 38, 39, 39)
F8(3src_src0_negate, 37, 37, 38, 38)
F8(3src_src0_abs, 36, 36, 37, 37)
+F8(3src_a16_src1_type, -1, -1, 36, 36)
+F8(3src_a16_src2_type, -1, -1, 35, 35)
F8(3src_a16_flag_reg_nr, 34, 34, 33, 33)
F8(3src_a16_flag_subreg_nr, 33, 33, 32, 32)
FF(3src_a16_dst_reg_file,
--
2.17.1
Pohjolainen, Topi
2018-12-05 12:42:16 UTC
Permalink
Post by Iago Toral Quiroga
The original SrcType is a 3-bit field that takes a subset of the types
supported for the hardware for 3-source instructions. Since gen8,
when the half-float type was added, 3-source floating point operations
can use use mixed precision mode, where not all the operands have the
same floating-point precision. While the precision for the first operand
is taken from the type in SrcType, the bits in Src1Type (bit 36) and
Src2Type (bit 35) define the precision for the other operands
(0: normal precision, 1: half precision).
---
src/intel/compiler/brw_inst.h | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/intel/compiler/brw_inst.h b/src/intel/compiler/brw_inst.h
index ce89bbba72f..c45697eaa3a 100644
--- a/src/intel/compiler/brw_inst.h
+++ b/src/intel/compiler/brw_inst.h
@@ -222,6 +222,8 @@ F8(3src_src1_negate, 39, 39, 40, 40)
F8(3src_src1_abs, 38, 38, 39, 39)
F8(3src_src0_negate, 37, 37, 38, 38)
F8(3src_src0_abs, 36, 36, 37, 37)
+F8(3src_a16_src1_type, -1, -1, 36, 36)
+F8(3src_a16_src2_type, -1, -1, 35, 35)
F8(3src_a16_flag_reg_nr, 34, 34, 33, 33)
F8(3src_a16_flag_subreg_nr, 33, 33, 32, 32)
FF(3src_a16_dst_reg_file,
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral Quiroga
2018-12-04 07:17:17 UTC
Permalink
There are no 8-bit immediates, so assert in that case.
16-bit immediates are replicated in each word of a 32-bit immediate, so
we only need to check the lower 16-bits.
---
src/intel/compiler/brw_shader.cpp | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)

diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp
index b77bd798d17..9999adbb52f 100644
--- a/src/intel/compiler/brw_shader.cpp
+++ b/src/intel/compiler/brw_shader.cpp
@@ -708,11 +708,18 @@ backend_reg::is_zero() const
if (file != IMM)
return false;

+ assert(type_sz(type) > 1);
+
switch (type) {
+ case BRW_REGISTER_TYPE_HF:
+ return (d & 0xffff) == 0;
case BRW_REGISTER_TYPE_F:
return f == 0;
case BRW_REGISTER_TYPE_DF:
return df == 0;
+ case BRW_REGISTER_TYPE_W:
+ case BRW_REGISTER_TYPE_UW:
+ return (d & 0xffff) == 0;
case BRW_REGISTER_TYPE_D:
case BRW_REGISTER_TYPE_UD:
return d == 0;
@@ -730,11 +737,18 @@ backend_reg::is_one() const
if (file != IMM)
return false;

+ assert(type_sz(type) > 1);
+
switch (type) {
+ case BRW_REGISTER_TYPE_HF:
+ return (d & 0xffff) == 0x3c00;
case BRW_REGISTER_TYPE_F:
return f == 1.0f;
case BRW_REGISTER_TYPE_DF:
return df == 1.0;
+ case BRW_REGISTER_TYPE_W:
+ case BRW_REGISTER_TYPE_UW:
+ return (d & 0xffff) == 1;
case BRW_REGISTER_TYPE_D:
case BRW_REGISTER_TYPE_UD:
return d == 1;
@@ -752,11 +766,17 @@ backend_reg::is_negative_one() const
if (file != IMM)
return false;

+ assert(type_sz(type) > 1);
+
switch (type) {
+ case BRW_REGISTER_TYPE_HF:
+ return (d & 0xffff) == 0xbc00;
case BRW_REGISTER_TYPE_F:
return f == -1.0;
case BRW_REGISTER_TYPE_DF:
return df == -1.0;
+ case BRW_REGISTER_TYPE_W:
+ return (d & 0xffff) == -1;
case BRW_REGISTER_TYPE_D:
return d == -1;
case BRW_REGISTER_TYPE_Q:
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:17:19 UTC
Permalink
---
src/intel/compiler/brw_fs_cmod_propagation.cpp | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/intel/compiler/brw_fs_cmod_propagation.cpp b/src/intel/compiler/brw_fs_cmod_propagation.cpp
index 7bb5c9afbc9..dfef9d720a2 100644
--- a/src/intel/compiler/brw_fs_cmod_propagation.cpp
+++ b/src/intel/compiler/brw_fs_cmod_propagation.cpp
@@ -244,8 +244,7 @@ opt_cmod_propagation_local(const gen_device_info *devinfo,
/* CMP's result is the same regardless of dest type. */
if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
scan_inst->opcode == BRW_OPCODE_CMP &&
- (inst->dst.type == BRW_REGISTER_TYPE_D ||
- inst->dst.type == BRW_REGISTER_TYPE_UD)) {
+ brw_reg_type_is_integer(inst->dst.type)) {
inst->remove(block);
progress = true;
break;
@@ -258,9 +257,8 @@ opt_cmod_propagation_local(const gen_device_info *devinfo,
break;

/* Comparisons operate differently for ints and floats */
- if (scan_inst->dst.type != inst->dst.type &&
- (scan_inst->dst.type == BRW_REGISTER_TYPE_F ||
- inst->dst.type == BRW_REGISTER_TYPE_F))
+ if (brw_reg_type_is_floating_point(scan_inst->dst.type) !=
+ brw_reg_type_is_floating_point(inst->dst.type))
break;

/* If the instruction generating inst's source also wrote the
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:17:18 UTC
Permalink
---
src/intel/compiler/brw_reg_type.h | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)

diff --git a/src/intel/compiler/brw_reg_type.h b/src/intel/compiler/brw_reg_type.h
index ffbec90d3fe..a3365b7e34c 100644
--- a/src/intel/compiler/brw_reg_type.h
+++ b/src/intel/compiler/brw_reg_type.h
@@ -82,6 +82,24 @@ brw_reg_type_is_floating_point(enum brw_reg_type type)
}
}

+static inline bool
+brw_reg_type_is_integer(enum brw_reg_type type)
+{
+ switch (type) {
+ case BRW_REGISTER_TYPE_Q:
+ case BRW_REGISTER_TYPE_UQ:
+ case BRW_REGISTER_TYPE_D:
+ case BRW_REGISTER_TYPE_UD:
+ case BRW_REGISTER_TYPE_W:
+ case BRW_REGISTER_TYPE_UW:
+ case BRW_REGISTER_TYPE_B:
+ case BRW_REGISTER_TYPE_UV:
+ return true;
+ default:
+ return false;
+ }
+}
+
unsigned
brw_reg_type_to_hw_type(const struct gen_device_info *devinfo,
enum brw_reg_file file, enum brw_reg_type type);
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:16:58 UTC
Permalink
We use Align16 mode for this, since it is more convenient, but the PRM
for Broadwell states in Volume 3D Media GPGPU, Chapter 'Register region
restrictions', Section '1. Special Restrictions':

"In Align16 mode, the channel selects and channel enables apply to a
pair of half-floats, because these parameters are defined for DWord
elements ONLY. This is applicable when both source and destination
are half-floats."

This means that we cannot select individual HF elements using swizzles
like we do with 32-bit floats so we can't implement the required
regioning for this.

Use the gen11 path for this instead, which uses Align1 mode.

The restriction is not present in gen9 or gen10, where the Align16
implementation seems to work just fine.
---
src/intel/compiler/brw_fs_generator.cpp | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index d8e4bae17e0..ba7ed07e692 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1281,8 +1281,14 @@ fs_generator::generate_ddy(const fs_inst *inst,
const uint32_t type_size = type_sz(src.type);

if (inst->opcode == FS_OPCODE_DDY_FINE) {
- /* produce accurate derivatives */
- if (devinfo->gen >= 11) {
+ /* produce accurate derivatives. We can do this easily in Align16
+ * but this is not supported in gen11+ and gen8 Align16 swizzles
+ * for Half-Float operands work in units of 32-bit and always
+ * select pairs of consecutive half-float elements, so we can't use
+ * use it for this.
+ */
+ if (devinfo->gen >= 11 ||
+ (devinfo->gen == 8 && src.type == BRW_REGISTER_TYPE_HF)) {
src = stride(src, 0, 2, 1);
struct brw_reg src_0 = byte_offset(src, 0 * type_size);
struct brw_reg src_2 = byte_offset(src, 2 * type_size);
--
2.17.1
Pohjolainen, Topi
2018-12-07 13:06:17 UTC
Permalink
Post by Iago Toral Quiroga
We use ALign16 mode for this, since it is more convenient, but the PRM
for Broadwell states in Volume 3D Media GPGPU, Chapter 'Register region
"In Align16 mode, the channel selects and channel enables apply to a
pair of half-floats, because these parameters are defined for DWord
elements ONLY. This is applicable when both source and destination
are half-floats."
This means that we cannot select individual HF elements using swizzles
like we do with 32-bit floats so we can't implement the required
regioning for this.
Use the gen11 path for this instead, which uses Align1 mode.
The restriction is not present in gen9 of gen10, where the Align16
or?
Post by Iago Toral Quiroga
implementation seems to work just fine.
---
src/intel/compiler/brw_fs_generator.cpp | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index d8e4bae17e0..ba7ed07e692 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1281,8 +1281,14 @@ fs_generator::generate_ddy(const fs_inst *inst,
const uint32_t type_size = type_sz(src.type);
if (inst->opcode == FS_OPCODE_DDY_FINE) {
- /* produce accurate derivatives */
- if (devinfo->gen >= 11) {
+ /* produce accurate derivatives. We can do this easily in Align16
+ * but this is not supported in gen11+ and gen8 Align16 swizzles
+ * for Half-Float operands work in units of 32-bit and always
+ * select pairs of consecutive half-float elements, so we can't use
+ * use it for this.
+ */
+ if (devinfo->gen >= 11 ||
+ (devinfo->gen == 8 && src.type == BRW_REGISTER_TYPE_HF)) {
src = stride(src, 0, 2, 1);
struct brw_reg src_0 = byte_offset(src, 0 * type_size);
struct brw_reg src_2 = byte_offset(src, 2 * type_size);
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral
2018-12-07 14:03:41 UTC
Permalink
Post by Iago Toral Quiroga
We use ALign16 mode for this, since it is more convenient, but the PRM
for Broadwell states in Volume 3D Media GPGPU, Chapter 'Register region
"In Align16 mode, the channel selects and channel enables apply to a
pair of half-floats, because these parameters are defined for DWord
elements ONLY. This is applicable when both source and
destination
are half-floats."
This means that we cannot select individual HF elements using swizzles
like we do with 32-bit floats so we can't implement the required
regioning for this.
Use the gen11 path for this instead, which uses Align1 mode.
The restriction is not present in gen9 of gen10, where the Align16
or?
Right, the issue is exclusive to gen8.

Iago
Post by Iago Toral Quiroga
implementation seems to work just fine.
---
src/intel/compiler/brw_fs_generator.cpp | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/src/intel/compiler/brw_fs_generator.cpp
b/src/intel/compiler/brw_fs_generator.cpp
index d8e4bae17e0..ba7ed07e692 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1281,8 +1281,14 @@ fs_generator::generate_ddy(const fs_inst *inst,
const uint32_t type_size = type_sz(src.type);
if (inst->opcode == FS_OPCODE_DDY_FINE) {
- /* produce accurate derivatives */
- if (devinfo->gen >= 11) {
+ /* produce accurate derivatives. We can do this easily in Align16
+ * but this is not supported in gen11+ and gen8 Align16 swizzles
+ * for Half-Float operands work in units of 32-bit and always
+ * select pairs of consecutive half-float elements, so we can't use
+ * use it for this.
+ */
+ if (devinfo->gen >= 11 ||
+ (devinfo->gen == 8 && src.type == BRW_REGISTER_TYPE_HF)) {
src = stride(src, 0, 2, 1);
struct brw_reg src_0 = byte_offset(src, 0 * type_size);
struct brw_reg src_2 = byte_offset(src, 2 * type_size);
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral Quiroga
2018-12-04 07:17:07 UTC
Permalink
These need to be implemented in two steps using an intermediary type.
---
src/intel/compiler/brw_fs_nir.cpp | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 559b55a0f84..c1ba799d59c 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -881,6 +881,8 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
inst->saturate = instr->dest.saturate;
break;

+ case nir_op_i2i8:
+ case nir_op_u2u8:
case nir_op_i2f16:
case nir_op_u2f16:
/* BDW PRM, vol02, Command Reference Instructions, mov - MOVE:
@@ -888,10 +890,13 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
* "There is no direct conversion from HF to Q/UQ or Q/UQ to HF.
* Use two instructions and F (Float) or a word integer type or a
* DWord integer type as an intermediate type."
+ *
+ * Similar text exists for conversions between Q/UQ and B/UB.
*/
if (nir_src_bit_size(instr->src[0].src) == 64) {
- brw_reg_type reg_type = instr->op == nir_op_i2f16 ?
- BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD;
+ brw_reg_type reg_type =
+ (instr->op == nir_op_i2f16 || instr->op == nir_op_i2i8) ?
+ BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD;
fs_reg tmp = bld.vgrf(reg_type, 1);
inst = bld.MOV(tmp, op[0]);
inst->saturate = instr->dest.saturate;
@@ -912,8 +917,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
case nir_op_u2u32:
case nir_op_i2i16:
case nir_op_u2u16:
- case nir_op_i2i8:
- case nir_op_u2u8:
inst = bld.MOV(result, op[0]);
inst->saturate = instr->dest.saturate;
break;
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:17:04 UTC
Permalink
---
src/intel/vulkan/anv_extensions.py | 1 +
1 file changed, 1 insertion(+)

diff --git a/src/intel/vulkan/anv_extensions.py b/src/intel/vulkan/anv_extensions.py
index 7c81228f705..9ca42d998ef 100644
--- a/src/intel/vulkan/anv_extensions.py
+++ b/src/intel/vulkan/anv_extensions.py
@@ -104,6 +104,7 @@ EXTENSIONS = [
Extension('VK_KHR_sampler_mirror_clamp_to_edge', 1, True),
Extension('VK_KHR_sampler_ycbcr_conversion', 1, True),
Extension('VK_KHR_shader_draw_parameters', 1, True),
+ Extension('VK_KHR_shader_float16_int8', 1, 'device->info.gen >= 8'),
Extension('VK_KHR_storage_buffer_storage_class', 1, True),
Extension('VK_KHR_surface', 25, 'ANV_HAS_SURFACE'),
Extension('VK_KHR_swapchain', 68, 'ANV_HAS_SURFACE'),
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:17:06 UTC
Permalink
Particularly, we need the same lowerings we use for 16-bit
integers.
---
src/intel/compiler/brw_nir.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 0b3094724c4..0a5aa35c700 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -611,7 +611,7 @@ static unsigned
lower_bit_size_callback(const nir_alu_instr *alu, UNUSED void *data)
{
assert(alu->dest.dest.is_ssa);
- if (alu->dest.dest.ssa.bit_size != 16)
+ if (alu->dest.dest.ssa.bit_size >= 32)
return 0;

const struct brw_compiler *compiler = (const struct brw_compiler *) data;
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:16:46 UTC
Permalink
---
src/compiler/nir/nir_opt_algebraic.py | 5 +++++
1 file changed, 5 insertions(+)

diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 6c3b77c9b6e..747f1751086 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -778,6 +778,8 @@ def fexp2i(exp, bits):
return ('ishl', ('iadd', exp, 127), 23)
elif bits == 64:
return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
+ elif bits == 16:
+ return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
else:
assert False

@@ -796,6 +798,8 @@ def ldexp(f, exp, bits):
exp = ('imin', ('imax', exp, -252), 254)
elif bits == 64:
exp = ('imin', ('imax', exp, -2044), 2046)
+ elif bits == 16:
+ exp = ('imin', ('imax', exp, -30), 30)
else:
assert False

@@ -814,6 +818,7 @@ def ldexp(f, exp, bits):
optimizations += [
(('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'),
(('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'),
+ (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'),
]

# Unreal Engine 4 demo applications open-codes bitfieldReverse()
--
2.17.1
Pohjolainen, Topi
2018-12-05 09:39:18 UTC
Permalink
I remember people preferring to order things 16, 32, 64 before. Should
we follow that here as well?
Post by Iago Toral Quiroga
---
src/compiler/nir/nir_opt_algebraic.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 6c3b77c9b6e..747f1751086 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
return ('ishl', ('iadd', exp, 127), 23)
return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
+ return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
assert False
exp = ('imin', ('imax', exp, -252), 254)
exp = ('imin', ('imax', exp, -2044), 2046)
+ exp = ('imin', ('imax', exp, -30), 30)
I expected this to be:

exp = ('imin', ('imax', exp, -29), 30)
Post by Iago Toral Quiroga
assert False
optimizations += [
]
# Unreal Engine 4 demo applications open-codes bitfieldReverse()
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral
2018-12-05 10:53:44 UTC
Permalink
Post by Pohjolainen, Topi
I remember people preferring to order things 16, 32, 64 before. Should
we follow that here as well?
Yes, it makes sense. I'll change that.
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
---
src/compiler/nir/nir_opt_algebraic.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/compiler/nir/nir_opt_algebraic.py
b/src/compiler/nir/nir_opt_algebraic.py
index 6c3b77c9b6e..747f1751086 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
return ('ishl', ('iadd', exp, 127), 23)
return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
+ return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
assert False
exp = ('imin', ('imax', exp, -252), 254)
exp = ('imin', ('imax', exp, -2044), 2046)
+ exp = ('imin', ('imax', exp, -30), 30)
exp = ('imin', ('imax', exp, -29), 30)
Actually, I think this should be -28, since the minimum exponent value
is -14.
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
assert False
optimizations += [
Post by Iago Toral Quiroga
lower_ldexp'),
lower_ldexp'),
lower_ldexp'),
]
# Unreal Engine 4 demo applications open-codes bitfieldReverse()
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Pohjolainen, Topi
2018-12-05 11:20:42 UTC
Permalink
Post by Iago Toral
Post by Pohjolainen, Topi
I remember people preferring to order things 16, 32, 64 before. Should
we follow that here as well?
Yes, it makes sense. I'll change that.
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
---
src/compiler/nir/nir_opt_algebraic.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/compiler/nir/nir_opt_algebraic.py
b/src/compiler/nir/nir_opt_algebraic.py
index 6c3b77c9b6e..747f1751086 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
return ('ishl', ('iadd', exp, 127), 23)
return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
+ return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
assert False
exp = ('imin', ('imax', exp, -252), 254)
exp = ('imin', ('imax', exp, -2044), 2046)
+ exp = ('imin', ('imax', exp, -30), 30)
exp = ('imin', ('imax', exp, -29), 30)
Actually, I think this should be -28, since the minimum exponent value
is -14.
I kept wondering about this. The offset is 15 and -14 - 15 yields -29. But -28
in turn would be more in line with the 32- and 64-bit cases.
Post by Iago Toral
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
assert False
optimizations += [
Post by Iago Toral Quiroga
lower_ldexp'),
lower_ldexp'),
lower_ldexp'),
]
# Unreal Engine 4 demo applications open-codes bitfieldReverse()
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Iago Toral
2018-12-05 11:26:06 UTC
Permalink
Post by Pohjolainen, Topi
Post by Iago Toral
Post by Pohjolainen, Topi
I remember people preferring to order things 16, 32, 64 before. Should
we follow that here as well?
Yes, it makes sense. I'll change that.
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
---
src/compiler/nir/nir_opt_algebraic.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/compiler/nir/nir_opt_algebraic.py
b/src/compiler/nir/nir_opt_algebraic.py
index 6c3b77c9b6e..747f1751086 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
return ('ishl', ('iadd', exp, 127), 23)
return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
+ return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
assert False
exp = ('imin', ('imax', exp, -252), 254)
exp = ('imin', ('imax', exp, -2044), 2046)
+ exp = ('imin', ('imax', exp, -30), 30)
exp = ('imin', ('imax', exp, -29), 30)
Actually, I think this should be -28, since the minimum exponent value
is -14.
I kept wondering about. The offset is 15 and -14 - 15 yields -29. But -28
in turn would be more in line with the 32- and 64-bit cases.
I think the idea is to have this be 2x the minimum (and maximum)
exponents we can represent, since below we are dividing it by two and
emitting two exponentials, each with half that exponent. That way we
ensure that when we divide the exponent by 2 we still produce a
representable exponent for the bit-size.

Iago
Post by Pohjolainen, Topi
Post by Iago Toral
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
assert False
optimizations += [
Post by Iago Toral Quiroga
lower_ldexp'),
lower_ldexp'),
lower_ldexp'),
]
# Unreal Engine 4 demo applications open-codes
bitfieldReverse()
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Pohjolainen, Topi
2018-12-05 11:32:18 UTC
Permalink
Post by Iago Toral
Post by Pohjolainen, Topi
Post by Iago Toral
Post by Pohjolainen, Topi
I remember people preferring to order things 16, 32, 64 before. Should
we follow that here as well?
Yes, it makes sense. I'll change that.
Post by Pohjolainen, Topi
Post by Iago Toral Quiroga
---
src/compiler/nir/nir_opt_algebraic.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/compiler/nir/nir_opt_algebraic.py
b/src/compiler/nir/nir_opt_algebraic.py
index 6c3b77c9b6e..747f1751086 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
return ('ishl', ('iadd', exp, 127), 23)
return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
+ return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
assert False
exp = ('imin', ('imax', exp, -252), 254)
exp = ('imin', ('imax', exp, -2044), 2046)
+ exp = ('imin', ('imax', exp, -30), 30)
exp = ('imin', ('imax', exp, -29), 30)
Actually, I think this should be -28, since the minimum exponent value
is -14.
I kept wondering about. The offset is 15 and -14 - 15 yields -29. But -28
in turn would be more in line with the 32- and 64-bit cases.
I think the idea is to have this be 2x the minimum (and maximum)
exponents we can represent, since below we are dividing it by two and
emitting two exponentials, each with half that exponent. That way we
ensure that when we divide the exponent by 2 we still produce a
representable exponent for the bit-size.
Ah, right. I should have checked the context, -28 makes sense now.

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Iago Toral Quiroga
2018-12-04 07:17:20 UTC
Permalink
---
.../compiler/brw_fs_combine_constants.cpp | 60 +++++++++++++++----
1 file changed, 49 insertions(+), 11 deletions(-)

diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp b/src/intel/compiler/brw_fs_combine_constants.cpp
index e0c95d379b8..24307e365ab 100644
--- a/src/intel/compiler/brw_fs_combine_constants.cpp
+++ b/src/intel/compiler/brw_fs_combine_constants.cpp
@@ -36,6 +36,7 @@

#include "brw_fs.h"
#include "brw_cfg.h"
+#include "util/half_float.h"

using namespace brw;

@@ -114,8 +115,9 @@ struct imm {
*/
exec_list *uses;

- /** The immediate value. We currently only handle floats. */
+ /** The immediate value. We currently only handle float and half-float. */
float val;
+ brw_reg_type type;

/**
* The GRF register and subregister number where we've decided to store the
@@ -145,10 +147,10 @@ struct table {
};

static struct imm *
-find_imm(struct table *table, float val)
+find_imm(struct table *table, float val, brw_reg_type type)
{
for (int i = 0; i < table->len; i++) {
- if (table->imm[i].val == val) {
+ if (table->imm[i].val == val && table->imm[i].type == type) {
return &table->imm[i];
}
}
@@ -190,6 +192,20 @@ compare(const void *_a, const void *_b)
return a->first_use_ip - b->first_use_ip;
}

+static bool
+needs_negate(float reg_val, float imm_val, brw_reg_type type)
+{
+ /* reg_val represents the immediate value in the register in its original
+ * bit-size, while imm_val is always a valid 32-bit float value.
+ */
+ if (type == BRW_REGISTER_TYPE_HF) {
+ uint32_t reg_val_ud = *((uint32_t *) &reg_val);
+ reg_val = _mesa_half_to_float(reg_val_ud & 0xffff);
+ }
+
+ return signbit(imm_val) != signbit(reg_val);
+}
+
bool
fs_visitor::opt_combine_constants()
{
@@ -215,12 +231,20 @@ fs_visitor::opt_combine_constants()

for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].file != IMM ||
- inst->src[i].type != BRW_REGISTER_TYPE_F)
+ (inst->src[i].type != BRW_REGISTER_TYPE_F &&
+ inst->src[i].type != BRW_REGISTER_TYPE_HF))
continue;

- float val = !inst->can_do_source_mods(devinfo) ? inst->src[i].f :
- fabs(inst->src[i].f);
- struct imm *imm = find_imm(&table, val);
+ float val;
+ if (inst->src[i].type == BRW_REGISTER_TYPE_F) {
+ val = !inst->can_do_source_mods(devinfo) ? inst->src[i].f :
+ fabs(inst->src[i].f);
+ } else {
+ val = !inst->can_do_source_mods(devinfo) ?
+ _mesa_half_to_float(inst->src[i].d & 0xffff) :
+ fabs(_mesa_half_to_float(inst->src[i].d & 0xffff));
+ }
+ struct imm *imm = find_imm(&table, val, inst->src[i].type);

if (imm) {
bblock_t *intersection = cfg_t::intersect(block, imm->block);
@@ -238,6 +262,7 @@ fs_visitor::opt_combine_constants()
imm->uses = new(const_ctx) exec_list();
imm->uses->push_tail(link(const_ctx, &inst->src[i]));
imm->val = val;
+ imm->type = inst->src[i].type;
imm->uses_by_coissue = could_coissue(devinfo, inst);
imm->must_promote = must_promote_imm(devinfo, inst);
imm->first_use_ip = ip;
@@ -278,12 +303,23 @@ fs_visitor::opt_combine_constants()
imm->block->last_non_control_flow_inst()->next);
const fs_builder ibld = bld.at(imm->block, n).exec_all().group(1, 0);

- ibld.MOV(reg, brw_imm_f(imm->val));
+ reg = retype(reg, imm->type);
+ if (imm->type == BRW_REGISTER_TYPE_F) {
+ ibld.MOV(reg, brw_imm_f(imm->val));
+ } else {
+ const uint16_t val_hf = _mesa_float_to_half(imm->val);
+ ibld.MOV(reg, retype(brw_imm_uw(val_hf), BRW_REGISTER_TYPE_HF));
+ }
imm->nr = reg.nr;
imm->subreg_offset = reg.offset;

+ /* Keep offsets 32-bit aligned since we are mixing 32-bit and 16-bit
+ * constants into the same register
+ *
+ * TODO: try to pack pairs of HF constants into each 32-bit slot
+ */
reg.offset += sizeof(float);
- if (reg.offset == 8 * sizeof(float)) {
+ if (reg.offset == REG_SIZE) {
reg.nr = alloc.allocate(1);
reg.offset = 0;
}
@@ -296,11 +332,13 @@ fs_visitor::opt_combine_constants()
fs_reg *reg = link->reg;
reg->file = VGRF;
reg->nr = table.imm[i].nr;
+ reg->type = table.imm[i].type;
reg->offset = table.imm[i].subreg_offset;
reg->stride = 0;
- reg->negate = signbit(reg->f) != signbit(table.imm[i].val);
+ reg->negate = needs_negate(reg->f, table.imm[i].val, table.imm[i].type);
assert((isnan(reg->f) && isnan(table.imm[i].val)) ||
- fabsf(reg->f) == fabs(table.imm[i].val));
+ fabsf(reg->f) == fabs(table.imm[i].val) ||
+ table.imm[i].type == BRW_REGISTER_TYPE_HF);
}
}
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:17:12 UTC
Permalink
---
src/intel/compiler/brw_fs_nir.cpp | 25 +++++++++++++++++++++----
1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index db3a8812ae3..7a4594a24ac 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -1063,11 +1063,28 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
* Predicated OR sets 1 if val is positive.
*/
uint32_t bit_size = nir_dest_bit_size(instr->dest.dest);
- assert(bit_size == 32 || bit_size == 16);

- fs_reg zero = bit_size == 32 ? brw_imm_d(0) : brw_imm_w(0);
- fs_reg one = bit_size == 32 ? brw_imm_d(1) : brw_imm_w(1);
- fs_reg shift = bit_size == 32 ? brw_imm_d(31) : brw_imm_w(15);
+ fs_reg zero, one, shift;
+ switch (bit_size) {
+ case 32:
+ zero = brw_imm_d(0);
+ one = brw_imm_d(1);
+ shift = brw_imm_d(31);
+ break;
+ case 16:
+ zero = brw_imm_w(0);
+ one = brw_imm_w(1);
+ shift = brw_imm_w(15);
+ break;
+ case 8: {
+ zero = setup_imm_b(bld, 0);
+ one = setup_imm_b(bld, 1);
+ shift = setup_imm_b(bld, 7);
+ break;
+ }
+ default:
+ unreachable("unsupported bit-size");
+ };

bld.CMP(bld.null_reg_d(), op[0], zero, BRW_CONDITIONAL_G);
bld.ASR(result, op[0], shift);
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:17:09 UTC
Permalink
---
src/intel/compiler/brw_fs_nir.cpp | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 4f815fef891..3f98c6a4474 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -921,6 +921,24 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
inst->saturate = instr->dest.saturate;
break;

+ case nir_op_f2i8:
+ case nir_op_f2u8:
+ /* BDW PRM, vol02, Command Reference Instructions, mov - MOVE:
+ *
+ * "There is no direct conversion from B/UB to DF or DF to B/UB. Use
+ * two instructions and a word or DWord intermediate type."
+ */
+ if (nir_src_bit_size(instr->src[0].src) == 64 &&
+ nir_dest_bit_size(instr->dest.dest) == 8) {
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
+ inst = bld.MOV(tmp, op[0]);
+ inst->saturate = instr->dest.saturate;
+ inst = bld.MOV(result, tmp);
+ inst->saturate = instr->dest.saturate;
+ break;
+ }
+ /* Fallthrough */
+
case nir_op_f2f32:
case nir_op_f2i32:
case nir_op_f2u32:
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:17:21 UTC
Permalink
There is a hardware restriction where <0,1,0>:HF in Align16 doesn't replicate
a single 16-bit channel, but instead it replicates a full 32-bit channel.
---
.../compiler/brw_fs_combine_constants.cpp | 24 +++++++++++++++++--
1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp b/src/intel/compiler/brw_fs_combine_constants.cpp
index 24307e365ab..082aa536906 100644
--- a/src/intel/compiler/brw_fs_combine_constants.cpp
+++ b/src/intel/compiler/brw_fs_combine_constants.cpp
@@ -301,7 +301,26 @@ fs_visitor::opt_combine_constants()
*/
exec_node *n = (imm->inst ? imm->inst :
imm->block->last_non_control_flow_inst()->next);
- const fs_builder ibld = bld.at(imm->block, n).exec_all().group(1, 0);
+
+ /* Prior to gen9 we also have to deal with this restriction:
+ *
+ * "In Align16 mode, the channel selects and channel enables apply to a
+ * pair of half-floats, because these parameters are defined for DWord
+ * elements ONLY. This is applicable when both source and destination
+ * are half-floats."
+ *
+ * This means that when we emit a 3-src instruction such as MAD or LRP,
+ * for which we use Align16, if we need to promote an HF constant to a
+ * register we need to be aware that the <0,1,0>:HF region would still
+ * read 2 HF slots and not replicate the single one like we want.
+ * We fix this by populating both HF slots with the constant we need to
+ * read.
+ */
+ const uint32_t width =
+ devinfo->gen < 9 &&
+ imm->type == BRW_REGISTER_TYPE_HF &&
+ (!imm->inst || imm->inst->is_3src(devinfo)) ? 2 : 1;
+ const fs_builder ibld = bld.at(imm->block, n).exec_all().group(width, 0);

reg = retype(reg, imm->type);
if (imm->type == BRW_REGISTER_TYPE_F) {
@@ -314,7 +333,8 @@ fs_visitor::opt_combine_constants()
imm->subreg_offset = reg.offset;

/* Keep offsets 32-bit aligned since we are mixing 32-bit and 16-bit
- * constants into the same register
+ * constants into the same register (and we are writing 32-bit slots
+ * prior to gen9 for HF constants anyway).
*
* TODO: try to pack pairs of HF constants into each 32-bit slot
*/
--
2.17.1
Iago Toral Quiroga
2018-12-04 07:16:50 UTC
Permalink
We are now using these bits, so don't assert that they are not set, just
avoid compaction in that case.
---
src/intel/compiler/brw_eu_compact.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_eu_compact.c b/src/intel/compiler/brw_eu_compact.c
index ae14ef10ec0..20fed254331 100644
--- a/src/intel/compiler/brw_eu_compact.c
+++ b/src/intel/compiler/brw_eu_compact.c
@@ -928,8 +928,11 @@ has_3src_unmapped_bits(const struct gen_device_info *devinfo,
assert(!brw_inst_bits(src, 127, 126) &&
!brw_inst_bits(src, 105, 105) &&
!brw_inst_bits(src, 84, 84) &&
- !brw_inst_bits(src, 36, 35) &&
!brw_inst_bits(src, 7, 7));
+
+ /* Src1Type and Src2Type, used for mixed-precision floating point */
+ if (brw_inst_bits(src, 36, 35))
+ return true;
}

return false;
--
2.17.1
Pohjolainen, Topi
2018-12-07 09:55:09 UTC
Permalink
Post by Iago Toral Quiroga
We are now using these bits, so don't assert that they are not set, just
avoid compaction in that case.
---
src/intel/compiler/brw_eu_compact.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/src/intel/compiler/brw_eu_compact.c b/src/intel/compiler/brw_eu_compact.c
index ae14ef10ec0..20fed254331 100644
--- a/src/intel/compiler/brw_eu_compact.c
+++ b/src/intel/compiler/brw_eu_compact.c
@@ -928,8 +928,11 @@ has_3src_unmapped_bits(const struct gen_device_info *devinfo,
assert(!brw_inst_bits(src, 127, 126) &&
!brw_inst_bits(src, 105, 105) &&
!brw_inst_bits(src, 84, 84) &&
- !brw_inst_bits(src, 36, 35) &&
!brw_inst_bits(src, 7, 7));
+
+ /* Src1Type and Src2Type, used for mixed-precision floating point */
+ if (brw_inst_bits(src, 36, 35))
+ return true;
}
return false;
--
2.17.1
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Loading...