Discussion:
[PATCH 2/4] i965: Use I915_EXEC_NO_RELOC
Chris Wilson
2017-06-19 10:06:48 UTC
If we correctly fill the batch with the right relocation value, and that
matches the expected location of the object, we can then tell the kernel
it can forgo checking each individual relocation by only checking
whether the object moved.
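
Roughly, the contract the kernel expects looks like the following (an
illustrative sketch only, not code from this patch; the helper and its
arguments are made up):

#include <stdint.h>
#include <drm/i915_drm.h>

/* Hypothetical helper: write one pointer into the batch while keeping the
 * three values the kernel compares under NO_RELOC in sync. */
static void
emit_pointer_no_reloc(uint32_t *batch_map, uint32_t batch_offset,
                      const struct drm_i915_gem_exec_object2 *target_exec,
                      struct drm_i915_gem_relocation_entry *reloc,
                      uint32_t target_handle, uint32_t target_offset)
{
   /* 1. The dword in the batch carries the presumed address
    *    (low 32 bits on pre-gen8). */
   batch_map[batch_offset / 4] = target_exec->offset + target_offset;

   /* 2. The relocation entry repeats it as presumed_offset... */
   reloc->offset = batch_offset;
   reloc->delta = target_offset;
   reloc->target_handle = target_handle;
   reloc->presumed_offset = target_exec->offset;

   /* 3. ...and execobject.offset is what the kernel checks against. If all
    *    three agree, I915_EXEC_NO_RELOC lets it skip the relocation unless
    *    the object has actually been moved. */
}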

v2: Rebase to apply ahead of I915_EXEC_HANDLE_LUT

Signed-off-by: Chris Wilson <***@chris-wilson.co.uk>
Cc: Kenneth Graunke <***@whitecape.org>
Cc: Matt Turner <***@gmail.com>
Cc: Jason Ekstrand <***@intel.com>
---
src/mesa/drivers/dri/i965/intel_batchbuffer.c | 53 +++++++++++++++++++++------
1 file changed, 41 insertions(+), 12 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index ca7d6b81b1..7129209c26 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -517,19 +517,19 @@ throttle(struct brw_context *brw)

#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

-static void
+static unsigned int
add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
{
if (bo != batch->bo) {
unsigned int index = READ_ONCE(bo->index);

if (index < batch->exec_count && batch->exec_bos[index] == bo)
- return;
+ return index;

/* May have been shared between multiple active batches */
for (index = 0; index < batch->exec_count; index++) {
if (batch->exec_bos[index] == bo)
- return;
+ return index;
}

brw_bo_reference(bo);
@@ -563,8 +563,9 @@ add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)

bo->index = batch->exec_count;
batch->exec_bos[batch->exec_count] = bo;
- batch->exec_count++;
batch->aperture_space += bo->size;
+
+ return batch->exec_count++;
}

static int
@@ -642,12 +643,25 @@ do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
}

if (!brw->screen->no_hw) {
- int flags;
-
+ unsigned int flags;
+
+ /* The requirements for using I915_EXEC_NO_RELOC are:
+ *
+ * The addresses written in the objects must match the corresponding
+ * reloc.presumed_offset, which in turn must match the corresponding
+ * execobject.offset.
+ *
+ * Any render targets written to in the batch must be flagged with
+ * EXEC_OBJECT_WRITE.
+ *
+ * To avoid stalling, execobject.offset should match the current
+ * address of that object within the active context.
+ */
+ flags = I915_EXEC_NO_RELOC;
if (brw->gen >= 6 && batch->ring == BLT_RING) {
- flags = I915_EXEC_BLT;
+ flags |= I915_EXEC_BLT;
} else {
- flags = I915_EXEC_RENDER;
+ flags |= I915_EXEC_RENDER;
}
if (batch->needs_sol_reset)
flags |= I915_EXEC_GEN7_SOL_RESET;
@@ -779,16 +793,31 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
assert(batch_offset <= BATCH_SZ - sizeof(uint32_t));
assert(_mesa_bitcount(write_domain) <= 1);

- if (target != batch->bo)
- add_exec_bo(batch, target);
+ if (target != batch->bo) {
+ unsigned int index = add_exec_bo(batch, target);
+ struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];
+
+ if (write_domain) {
+ exec->flags |= EXEC_OBJECT_WRITE;
+
+ /* PIPECONTROL needs a w/a on gen6 */
+ if (write_domain == I915_GEM_DOMAIN_INSTRUCTION) {
+ struct brw_context *brw = container_of(batch, brw, batch);
+ if (brw->gen == 6)
+ exec->flags |= EXEC_OBJECT_NEEDS_GTT;
+ }
+ }
+
+ offset64 = exec->offset;
+ } else {
+ offset64 = target->offset64;
+ }

struct drm_i915_gem_relocation_entry *reloc =
&batch->relocs[batch->reloc_count];

batch->reloc_count++;

- /* ensure gcc doesn't reload */
- offset64 = *((volatile uint64_t *)&target->offset64);
reloc->offset = batch_offset;
reloc->delta = target_offset;
reloc->target_handle = target->gem_handle;
--
2.11.0
Chris Wilson
2017-06-19 10:06:49 UTC
To avoid a forward declaration in the next patch, move the definition of
add_exec_bo() earlier.

Signed-off-by: Chris Wilson <***@chris-wilson.co.uk>
Cc: Kenneth Graunke <***@whitecape.org>
Cc: Matt Turner <***@gmail.com>
Cc: Jason Ekstrand <***@intel.com>
---
src/mesa/drivers/dri/i965/intel_batchbuffer.c | 106 +++++++++++++-------------
1 file changed, 53 insertions(+), 53 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 7129209c26..15aaf01e52 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -87,6 +87,59 @@ intel_batchbuffer_init(struct intel_batchbuffer *batch,
}
}

+#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
+
+static unsigned int
+add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
+{
+ if (bo != batch->bo) {
+ unsigned int index = READ_ONCE(bo->index);
+
+ if (index < batch->exec_count && batch->exec_bos[index] == bo)
+ return index;
+
+ /* May have been shared between multiple active batches */
+ for (index = 0; index < batch->exec_count; index++) {
+ if (batch->exec_bos[index] == bo)
+ return index;
+ }
+
+ brw_bo_reference(bo);
+ }
+
+ if (batch->exec_count == batch->exec_array_size) {
+ batch->exec_array_size *= 2;
+ batch->exec_bos =
+ realloc(batch->exec_bos,
+ batch->exec_array_size * sizeof(batch->exec_bos[0]));
+ batch->exec_objects =
+ realloc(batch->exec_objects,
+ batch->exec_array_size * sizeof(batch->exec_objects[0]));
+ }
+
+ struct drm_i915_gem_exec_object2 *validation_entry =
+ &batch->exec_objects[batch->exec_count];
+ validation_entry->handle = bo->gem_handle;
+ if (bo == batch->bo) {
+ validation_entry->relocation_count = batch->reloc_count;
+ validation_entry->relocs_ptr = (uintptr_t) batch->relocs;
+ } else {
+ validation_entry->relocation_count = 0;
+ validation_entry->relocs_ptr = 0;
+ }
+ validation_entry->alignment = bo->align;
+ validation_entry->offset = bo->offset64;
+ validation_entry->flags = bo->kflags;
+ validation_entry->rsvd1 = 0;
+ validation_entry->rsvd2 = 0;
+
+ bo->index = batch->exec_count;
+ batch->exec_bos[batch->exec_count] = bo;
+ batch->aperture_space += bo->size;
+
+ return batch->exec_count++;
+}
+
static void
intel_batchbuffer_reset(struct intel_batchbuffer *batch,
struct brw_bufmgr *bufmgr,
@@ -515,59 +568,6 @@ throttle(struct brw_context *brw)
}
}

-#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
-
-static unsigned int
-add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
-{
- if (bo != batch->bo) {
- unsigned int index = READ_ONCE(bo->index);
-
- if (index < batch->exec_count && batch->exec_bos[index] == bo)
- return index;
-
- /* May have been shared between multiple active batches */
- for (index = 0; index < batch->exec_count; index++) {
- if (batch->exec_bos[index] == bo)
- return index;
- }
-
- brw_bo_reference(bo);
- }
-
- if (batch->exec_count == batch->exec_array_size) {
- batch->exec_array_size *= 2;
- batch->exec_bos =
- realloc(batch->exec_bos,
- batch->exec_array_size * sizeof(batch->exec_bos[0]));
- batch->exec_objects =
- realloc(batch->exec_objects,
- batch->exec_array_size * sizeof(batch->exec_objects[0]));
- }
-
- struct drm_i915_gem_exec_object2 *validation_entry =
- &batch->exec_objects[batch->exec_count];
- validation_entry->handle = bo->gem_handle;
- if (bo == batch->bo) {
- validation_entry->relocation_count = batch->reloc_count;
- validation_entry->relocs_ptr = (uintptr_t) batch->relocs;
- } else {
- validation_entry->relocation_count = 0;
- validation_entry->relocs_ptr = 0;
- }
- validation_entry->alignment = bo->align;
- validation_entry->offset = bo->offset64;
- validation_entry->flags = bo->kflags;
- validation_entry->rsvd1 = 0;
- validation_entry->rsvd2 = 0;
-
- bo->index = batch->exec_count;
- batch->exec_bos[batch->exec_count] = bo;
- batch->aperture_space += bo->size;
-
- return batch->exec_count++;
-}
-
static int
execbuffer(int fd,
struct intel_batchbuffer *batch,
--
2.11.0
Chris Wilson
2017-06-19 10:06:50 UTC
Passing the index of the target buffer via the reloc.target_handle is
marginally more efficient for the kernel (it can avoid some allocations,
and can use a direct lookup rather than a hash or search). It is also
useful for ourselves as we can use the index into our exec_bos for other
tasks.
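
Schematically, the only change on our side is what gets written into
reloc.target_handle (a sketch for illustration, not code from this patch;
the helper is made up):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <drm/i915_drm.h>

/* With I915_EXEC_HANDLE_LUT the kernel treats reloc.target_handle as an
 * index into the execobject array, so the lookup is a direct array access
 * instead of a handle lookup. */
static void
set_reloc_target(struct drm_i915_gem_relocation_entry *reloc,
                 const struct drm_i915_gem_exec_object2 *exec_objects,
                 unsigned int index, uint32_t gem_handle,
                 bool has_handle_lut)
{
   reloc->target_handle = has_handle_lut ? index : gem_handle;

   /* Either form must name the same object. */
   if (has_handle_lut)
      assert(exec_objects[index].handle == gem_handle);
}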

v2: Only enable HANDLE_LUT if we can use BATCH_FIRST and thereby avoid
a post-processing loop to fixup the relocations.

Signed-off-by: Chris Wilson <***@chris-wilson.co.uk>
Cc: Kenneth Graunke <***@whitecape.org>
Cc: Matt Turner <***@gmail.com>
Cc: Jason Ekstrand <***@intel.com>
---
src/mesa/drivers/dri/i965/brw_context.h | 1 +
src/mesa/drivers/dri/i965/intel_batchbuffer.c | 82 ++++++++++++++++++++-------
2 files changed, 61 insertions(+), 22 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 2fb2cab918..93ddd0825a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -450,6 +450,7 @@ struct intel_batchbuffer {

uint32_t state_batch_offset;
enum brw_gpu_ring ring;
+ bool has_batch_first;
bool needs_sol_reset;
bool state_base_address_emitted;

diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 15aaf01e52..5fa849c5a5 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -40,6 +40,10 @@

#define FILE_DEBUG_FLAG DEBUG_BUFMGR

+#define DBG_NO_BATCH_FIRST 0
+#define I915_PARAM_HAS_EXEC_BATCH_FIRST 48
+#define I915_EXEC_BATCH_FIRST (1 << 18)
+
static void
intel_batchbuffer_reset(struct intel_batchbuffer *batch,
struct brw_bufmgr *bufmgr,
@@ -57,13 +61,33 @@ uint_key_hash(const void *key)
return (uintptr_t) key;
}

+static int gem_param(int fd, int name)
+{
+ drm_i915_getparam_t gp;
+ int v = -1; /* No param uses (yet) the sign bit, reserve it for errors */
+
+ memset(&gp, 0, sizeof(gp));
+ gp.param = name;
+ gp.value = &v;
+ if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
+ return -1;
+
+ return v;
+}
+
+static bool test_has_batch_first(int fd)
+{
+ if (DBG_NO_BATCH_FIRST)
+ return DBG_NO_BATCH_FIRST < 0;
+
+ return gem_param(fd, I915_PARAM_HAS_EXEC_BATCH_FIRST) > 0;
+}
+
void
intel_batchbuffer_init(struct intel_batchbuffer *batch,
struct brw_bufmgr *bufmgr,
bool has_llc)
{
- intel_batchbuffer_reset(batch, bufmgr, has_llc);
-
if (!has_llc) {
batch->cpu_map = malloc(BATCH_SZ);
batch->map = batch->cpu_map;
@@ -85,6 +109,12 @@ intel_batchbuffer_init(struct intel_batchbuffer *batch,
batch->state_batch_sizes =
_mesa_hash_table_create(NULL, uint_key_hash, uint_key_compare);
}
+
+ struct brw_context *brw = container_of(batch, brw, batch);
+ __DRIscreen *dri_screen = brw->screen->driScrnPriv;
+ batch->has_batch_first = test_has_batch_first(dri_screen->fd);
+
+ intel_batchbuffer_reset(batch, bufmgr, has_llc);
}

#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
@@ -117,21 +147,12 @@ add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
batch->exec_array_size * sizeof(batch->exec_objects[0]));
}

- struct drm_i915_gem_exec_object2 *validation_entry =
- &batch->exec_objects[batch->exec_count];
- validation_entry->handle = bo->gem_handle;
- if (bo == batch->bo) {
- validation_entry->relocation_count = batch->reloc_count;
- validation_entry->relocs_ptr = (uintptr_t) batch->relocs;
- } else {
- validation_entry->relocation_count = 0;
- validation_entry->relocs_ptr = 0;
- }
- validation_entry->alignment = bo->align;
- validation_entry->offset = bo->offset64;
- validation_entry->flags = bo->kflags;
- validation_entry->rsvd1 = 0;
- validation_entry->rsvd2 = 0;
+ struct drm_i915_gem_exec_object2 *exec =
+ memset(&batch->exec_objects[batch->exec_count], 0, sizeof(*exec));
+ exec->handle = bo->gem_handle;
+ exec->alignment = bo->align;
+ exec->offset = bo->offset64;
+ exec->flags = bo->kflags;

bo->index = batch->exec_count;
batch->exec_bos[batch->exec_count] = bo;
@@ -157,6 +178,11 @@ intel_batchbuffer_reset(struct intel_batchbuffer *batch,
}
batch->map_next = batch->map;

+ if (batch->has_batch_first) {
+ add_exec_bo(batch, batch->bo);
+ assert(batch->bo->index == 0);
+ }
+
batch->reserved_space = BATCH_RESERVED;
batch->state_batch_offset = batch->bo->size;
batch->needs_sol_reset = false;
@@ -663,15 +689,25 @@ do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
} else {
flags |= I915_EXEC_RENDER;
}
+
if (batch->needs_sol_reset)
flags |= I915_EXEC_GEN7_SOL_RESET;

+ unsigned int index;
+ if (batch->has_batch_first) {
+ flags |= I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT;
+ index = 0;
+ } else {
+ index = add_exec_bo(batch, batch->bo);
+ }
+
+ struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];
+ exec->relocation_count = batch->reloc_count;
+ exec->relocs_ptr = (uintptr_t) batch->relocs;
+
if (ret == 0) {
uint32_t hw_ctx = batch->ring == RENDER_RING ? brw->hw_ctx : 0;

- /* Add the batch itself to the end of the validation list */
- add_exec_bo(batch, batch->bo);
-
ret = execbuffer(dri_screen->fd, batch, hw_ctx,
4 * USED_BATCH(*batch),
in_fence_fd, out_fence_fd, flags);
@@ -793,8 +829,9 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
assert(batch_offset <= BATCH_SZ - sizeof(uint32_t));
assert(_mesa_bitcount(write_domain) <= 1);

+ unsigned int index;
if (target != batch->bo) {
- unsigned int index = add_exec_bo(batch, target);
+ index = add_exec_bo(batch, target);
struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];

if (write_domain) {
@@ -811,6 +848,7 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
offset64 = exec->offset;
} else {
offset64 = target->offset64;
+ index = target->index;
}

struct drm_i915_gem_relocation_entry *reloc =
@@ -820,7 +858,7 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,

reloc->offset = batch_offset;
reloc->delta = target_offset;
- reloc->target_handle = target->gem_handle;
+ reloc->target_handle = batch->has_batch_first ? index : target->gem_handle;
reloc->read_domains = read_domains;
reloc->write_domain = write_domain;
reloc->presumed_offset = offset64;
--
2.11.0
Kenneth Graunke
2017-06-19 19:28:31 UTC
Post by Chris Wilson
If we correctly fill the batch with the right relocation value, and that
matches the expected location of the object, we can then tell the kernel
it can forgo checking each individual relocation by only checking
whether the object moved.
v2: Rebase to apply ahead of I915_EXEC_HANDLE_LUT
---
src/mesa/drivers/dri/i965/intel_batchbuffer.c | 53 +++++++++++++++++++++------
1 file changed, 41 insertions(+), 12 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index ca7d6b81b1..7129209c26 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -517,19 +517,19 @@ throttle(struct brw_context *brw)
#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
-static void
+static unsigned int
add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
{
if (bo != batch->bo) {
unsigned int index = READ_ONCE(bo->index);
if (index < batch->exec_count && batch->exec_bos[index] == bo)
- return;
+ return index;
/* May have been shared between multiple active batches */
for (index = 0; index < batch->exec_count; index++) {
if (batch->exec_bos[index] == bo)
- return;
+ return index;
}
brw_bo_reference(bo);
@@ -563,8 +563,9 @@ add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
bo->index = batch->exec_count;
batch->exec_bos[batch->exec_count] = bo;
- batch->exec_count++;
batch->aperture_space += bo->size;
+
+ return batch->exec_count++;
}
static int
@@ -642,12 +643,25 @@ do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
}
if (!brw->screen->no_hw) {
- int flags;
-
+ unsigned int flags;
+
+ /* The requirements for using I915_EXEC_NO_RELOC are:
+ *
+ * The addresses written in the objects must match the corresponding
+ * reloc.presumed_offset, which in turn must match the corresponding
+ * execobject.offset.
+ *
+ * Any render targets written to in the batch must be flagged with
+ * EXEC_OBJECT_WRITE.
+ *
+ * To avoid stalling, execobject.offset should match the current
+ * address of that object within the active context.
+ */
+ flags = I915_EXEC_NO_RELOC;
if (brw->gen >= 6 && batch->ring == BLT_RING) {
- flags = I915_EXEC_BLT;
+ flags |= I915_EXEC_BLT;
} else {
- flags = I915_EXEC_RENDER;
+ flags |= I915_EXEC_RENDER;
}
if (batch->needs_sol_reset)
flags |= I915_EXEC_GEN7_SOL_RESET;
@@ -779,16 +793,31 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
assert(batch_offset <= BATCH_SZ - sizeof(uint32_t));
assert(_mesa_bitcount(write_domain) <= 1);
- if (target != batch->bo)
- add_exec_bo(batch, target);
+ if (target != batch->bo) {
+ unsigned int index = add_exec_bo(batch, target);
+ struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];
+
+ if (write_domain) {
+ exec->flags |= EXEC_OBJECT_WRITE;
+
+ /* PIPECONTROL needs a w/a on gen6 */
+ if (write_domain == I915_GEM_DOMAIN_INSTRUCTION) {
+ struct brw_context *brw = container_of(batch, brw, batch);
+ if (brw->gen == 6)
+ exec->flags |= EXEC_OBJECT_NEEDS_GTT;
+ }
+ }
+
+ offset64 = exec->offset;
I don't think this works. brw_bo still has a single offset64 value that's
shared across all contexts. So, in a multi-context/multi-threaded case, we
may put a bogus offset in the validation list.

Pulling the value out of the validation list is nice, but it's still
potentially bogus.

I think we still need per-context-bo fields. I have a patch series in
progress to do that, but I moved it down the priority list a ways...
need to get back to that...
Post by Chris Wilson
+ } else {
+ offset64 = target->offset64;
+ }
struct drm_i915_gem_relocation_entry *reloc =
&batch->relocs[batch->reloc_count];
batch->reloc_count++;
- /* ensure gcc doesn't reload */
- offset64 = *((volatile uint64_t *)&target->offset64);
reloc->offset = batch_offset;
reloc->delta = target_offset;
reloc->target_handle = target->gem_handle;
Chris Wilson
2017-06-19 19:53:33 UTC
Quoting Kenneth Graunke (2017-06-19 20:28:31)
Post by Kenneth Graunke
Post by Chris Wilson
- if (target != batch->bo)
- add_exec_bo(batch, target);
+ if (target != batch->bo) {
+ unsigned int index = add_exec_bo(batch, target);
+ struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];
+
+ if (write_domain) {
+ exec->flags |= EXEC_OBJECT_WRITE;
+
+ /* PIPECONTROL needs a w/a on gen6 */
+ if (write_domain == I915_GEM_DOMAIN_INSTRUCTION) {
+ struct brw_context *brw = container_of(batch, brw, batch);
+ if (brw->gen == 6)
+ exec->flags |= EXEC_OBJECT_NEEDS_GTT;
+ }
+ }
+
+ offset64 = exec->offset;
I don't think this works. brw_bo still has a single offset64 value that's
shared across all contexts. So, in a multi-context/multi-threaded case, we
may put a bogus offset in the validation list.
But the value that we put there is the *only* value that we then use for
all of the relocation entries. We are consistently wrong, which is what
is mandated by the interface, i.e. we don't prevent a relocation stall
(unless the kernel can move the buffers for us), but more importantly
relocation.presumed_offset == execobject.offset == batch[reloc.offset]
for all relocations.
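
In code, that consistency requirement is just the following (a debug-only
sketch on my part, not part of the series; it assumes HANDLE_LUT so that
target_handle indexes the execobject array):

#include <assert.h>
#include <stdint.h>
#include <drm/i915_drm.h>

static void
check_no_reloc_invariant(const uint32_t *batch_map,
                         const struct drm_i915_gem_relocation_entry *relocs,
                         unsigned int reloc_count,
                         const struct drm_i915_gem_exec_object2 *exec_objects)
{
   for (unsigned int i = 0; i < reloc_count; i++) {
      const struct drm_i915_gem_relocation_entry *r = &relocs[i];
      const struct drm_i915_gem_exec_object2 *t =
         &exec_objects[r->target_handle];

      /* presumed_offset must match the execobject.offset we pass in... */
      assert(r->presumed_offset == t->offset);
      /* ...and the dword already emitted into the batch must match both. */
      assert(batch_map[r->offset / 4] == (uint32_t)(t->offset + r->delta));
   }
}

If any of those would fire, the kernel simply falls back to processing the
relocation; nothing breaks, we just lose the fast path.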
Post by Kenneth Graunke
Pulling the value out of the validation list is nice, but it's still
potentially bogus.
I think we still need per-context-bo fields. I have a patch series in
progress to do that, but I moved it down the priority list a ways...
need to get back to that...
Per-context tracking will allow us to avoid stalls between contexts,
which will be an improvement above and beyond this patch. This patch
will work even with conflicting contexts, just no better than today. For
the single context (or no conflict cases), we do realise the improvement
from avoiding always having to inspect reloc[].

Hope that helps,
-Chris
Jason Ekstrand
2017-06-19 21:00:45 UTC
Post by Chris Wilson
Quoting Kenneth Graunke (2017-06-19 20:28:31)
Post by Kenneth Graunke
Post by Chris Wilson
- if (target != batch->bo)
- add_exec_bo(batch, target);
+ if (target != batch->bo) {
+ unsigned int index = add_exec_bo(batch, target);
+ struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];
+
+ if (write_domain) {
+ exec->flags |= EXEC_OBJECT_WRITE;
+
+ /* PIPECONTROL needs a w/a on gen6 */
+ if (write_domain == I915_GEM_DOMAIN_INSTRUCTION) {
+ struct brw_context *brw = container_of(batch, brw, batch);
+ if (brw->gen == 6)
+ exec->flags |= EXEC_OBJECT_NEEDS_GTT;
+ }
+ }
+
+ offset64 = exec->offset;
I don't think this works. brw_bo still has a single offset64 value that's
shared across all contexts. So, in a multi-context/multi-threaded case, we
may put a bogus offset in the validation list.
But the value that we put there is the *only* value that we then use for
all of the relocation entries. We are consistently wrong, which is what
is mandated by the interface, i.e. we don't prevent a relocation stall
(unless the kernel can move the buffers for us), but more importantly
relocation.presumed_offset == execobject.offset == batch[reloc.offset]
for all relocations.
Is the BO address in the kernel per-context or per-drm-fd? If it's the
latter then we may be safe, though it sounds scary. If it's the former, then
we definitely have a problem.
Post by Chris Wilson
Post by Kenneth Graunke
Pulling the value out of the validation list is nice, but it's still
potentially bogus.
I think we still need per-context-bo fields. I have a patch series in
progress to do that, but I moved it down the priority list a ways...
need to get back to that...
Per-context tracking will allow us to avoid stalls between contexts,
which will be an improvement above and beyond this patch. This patch
will work even with conflicting contexts, just no better than today. For
the single context (or no conflict cases), we do realise the improvement
from avoiding always having to inspect reloc[].
Hope that helps,
-Chris
Chris Wilson
2017-06-19 22:32:21 UTC
Quoting Jason Ekstrand (2017-06-19 22:00:45)
Post by Chris Wilson
Quoting Kenneth Graunke (2017-06-19 20:28:31)
-   if (target != batch->bo)
-      add_exec_bo(batch, target);
+   if (target != batch->bo) {
+      unsigned int index = add_exec_bo(batch, target);
+      struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];
+
+      if (write_domain) {
+         exec->flags |= EXEC_OBJECT_WRITE;
+
+         /* PIPECONTROL needs a w/a on gen6 */
+         if (write_domain == I915_GEM_DOMAIN_INSTRUCTION) {
+            struct brw_context *brw = container_of(batch, brw, batch);
+            if (brw->gen == 6)
+               exec->flags |= EXEC_OBJECT_NEEDS_GTT;
+         }
+      }
+
+      offset64 = exec->offset;
I don't think this works.  brw_bo still has a single offset64 value that's
shared across all contexts.  So, in a multi-context/multi-threaded case, we
may put a bogus offset in the validation list.
But the value that we put there is the *only* value that we then use for
all of the relocation entries. We are consistently wrong, which is what
is mandated by the interface, i.e. we don't prevent a relocation stall
(unless the kernel can move the buffers for us), but more importantly
relocation.presumed_offset == execobject.offset == batch[reloc.offset]
for all relocations.
Is the BO address in the kernel per-context or per-drm-fd?  If it's the latter
then we may be safe, though it sounds scary.  If it's the former, then we
definitely have a problem.
The binding of a buffer into its ppGTT is local to that context. The
kernel allocates an address for a buffer separately for each context.
(At the start, the motivating factor was to keep the shared GGTT with
its mappable/non-mappable restrictions entirely separate from ppGTT).
One trick the kernel finally just learnt is to use the supplied offset
for new buffers if it is valid and available. That reduces the impact of
using an offset from context1 in context2, so long as that address was
not already allocated to something else. (So hit and miss in long-running
clients, but any hit is better than always missing.)

However, with softpin I suggest mesa allocate an address at the fd
level and assign it to all contexts using the bo. It does
dramatically reduce the working space, but it should still be many times
physical memory, except for the complications with 32b restrictions. It was
definitely a very simple proof-of-concept patch (device level monotonic
allocator, falling back to kernel on exhaustion).
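
Something along these lines (my own very rough sketch of that idea, not the
actual proof-of-concept; assumes power-of-two alignments):

#include <stdbool.h>
#include <stdint.h>

/* Per-device monotonic address allocator: the returned address is used as
 * the softpinned execobject.offset (EXEC_OBJECT_PINNED) by every context
 * sharing the bo; on exhaustion we fall back to kernel placement. */
struct device_vma_allocator {
   uint64_t next;   /* next unassigned ppGTT address */
   uint64_t limit;  /* top of the usable range */
};

static bool
device_vma_alloc(struct device_vma_allocator *vma,
                 uint64_t size, uint64_t alignment, uint64_t *out_addr)
{
   if (alignment == 0)
      alignment = 4096;

   uint64_t addr = (vma->next + alignment - 1) & ~(alignment - 1);

   if (addr > vma->limit || size > vma->limit - addr)
      return false;   /* exhausted: let the kernel pick an address */

   vma->next = addr + size;
   *out_addr = addr;  /* same offset seen by every context using this bo */
   return true;
}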
-Chris
