Discussion:
[PATCH 1/1] gallium, clover: Wait for requested operation if blocking flag is set
Jan Vesely
2017-07-29 03:36:34 UTC
Signed-off-by: Jan Vesely <***@rutgers.edu>
---
Found by inspection. Fixes clRetain/clRelease event piglit on Turks, no
regressions.

src/gallium/state_trackers/clover/api/transfer.cpp | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)

diff --git a/src/gallium/state_trackers/clover/api/transfer.cpp b/src/gallium/state_trackers/clover/api/transfer.cpp
index f7046253be..2af28b1e94 100644
--- a/src/gallium/state_trackers/clover/api/transfer.cpp
+++ b/src/gallium/state_trackers/clover/api/transfer.cpp
@@ -295,6 +295,9 @@ clEnqueueReadBuffer(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
&mem, obj_origin, obj_pitch,
region));

+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;

@@ -325,6 +328,9 @@ clEnqueueWriteBuffer(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
ptr, {}, obj_pitch,
region));

+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;

@@ -362,6 +368,9 @@ clEnqueueReadBufferRect(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
&mem, obj_origin, obj_pitch,
region));

+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;

@@ -399,6 +408,9 @@ clEnqueueWriteBufferRect(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
ptr, host_origin, host_pitch,
region));

+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;

@@ -504,6 +516,9 @@ clEnqueueReadImage(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
&img, src_origin, src_pitch,
region));

+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;

@@ -538,6 +553,9 @@ clEnqueueWriteImage(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
ptr, {}, src_pitch,
region));

+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
--
2.13.3
Aaron Watry
2017-07-31 02:09:51 UTC
What's here is:
Reviewed-By: Aaron Watry <***@gmail.com>

That being said, are clEnqueueMapBuffer/clEnqueueMapImage still
affected, or are those always done as blocking operations? (Or is my
C++ just too weak to tell?)

--Aaron
Jan Vesely
2017-08-02 21:04:24 UTC
v2: wait in map_buffer and map_image as well

Signed-off-by: Jan Vesely <***@rutgers.edu>
---
Hi Aaron,

yes, I think you're right, we should wait in Map* as well.
If nothing else, it's consistent, even if passing the flag to add_map might make it unnecessary (I haven't actually checked).

thanks,
Jan

src/gallium/state_trackers/clover/api/transfer.cpp | 30 ++++++++++++++++++++--
1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/src/gallium/state_trackers/clover/api/transfer.cpp b/src/gallium/state_trackers/clover/api/transfer.cpp
index f7046253be..729a34590e 100644
--- a/src/gallium/state_trackers/clover/api/transfer.cpp
+++ b/src/gallium/state_trackers/clover/api/transfer.cpp
@@ -295,6 +295,9 @@ clEnqueueReadBuffer(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
&mem, obj_origin, obj_pitch,
region));

+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;

@@ -325,6 +328,9 @@ clEnqueueWriteBuffer(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
ptr, {}, obj_pitch,
region));

+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;

@@ -362,6 +368,9 @@ clEnqueueReadBufferRect(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
&mem, obj_origin, obj_pitch,
region));

+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;

@@ -399,6 +408,9 @@ clEnqueueWriteBufferRect(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
ptr, host_origin, host_pitch,
region));

+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;

@@ -504,6 +516,9 @@ clEnqueueReadImage(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
&img, src_origin, src_pitch,
region));

+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;

@@ -538,6 +553,9 @@ clEnqueueWriteImage(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
ptr, {}, src_pitch,
region));

+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;

@@ -667,7 +685,11 @@ clEnqueueMapBuffer(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,

void *map = mem.resource(q).add_map(q, flags, blocking, obj_origin, region);

- ret_object(rd_ev, create<hard_event>(q, CL_COMMAND_MAP_BUFFER, deps));
+ auto hev = create<hard_event>(q, CL_COMMAND_MAP_BUFFER, deps);
+ if (blocking)
+ hev().wait();
+
+ ret_object(rd_ev, hev);
ret_error(r_errcode, CL_SUCCESS);
return map;

@@ -695,7 +717,11 @@ clEnqueueMapImage(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,

void *map = img.resource(q).add_map(q, flags, blocking, origin, region);

- ret_object(rd_ev, create<hard_event>(q, CL_COMMAND_MAP_IMAGE, deps));
+ auto hev = create<hard_event>(q, CL_COMMAND_MAP_IMAGE, deps);
+ if (blocking)
+ hev().wait();
+
+ ret_object(rd_ev, hev);
ret_error(r_errcode, CL_SUCCESS);
return map;
--
2.13.3
Francisco Jerez
2017-08-02 22:05:33 UTC
These changes are somewhat redundant and potentially
performance-impacting. The reason is that in the OpenCL API,
clEnqueueWrite* commands are specified to block only until the memory
provided by the application as origin can be reused safely (i.e. until
soft_copy_op()() runs), not necessarily until the transfer to graphics
memory has completed (which is what hard_event::wait() will wait for).
OTOH, reads and maps as implemented by soft_copy_op and friends are
essentially blocking, so the wait() call is redundant in most cases.

The only reason why it might be useful to behave differently on blocking
transfers is that the application may have specified a user event in the
event dependency list, which may cause the soft_copy_op() call to be
delayed until the application signals the user event. In order to fix
that it should really only be necessary to wait for the event action to
be executed, not necessarily its associated GPU work.
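For illustration, a minimal sketch of that scenario (hypothetical helper
and handles, not clover or application code from this thread):

   #include <CL/cl.h>

   /* A blocking write whose wait list contains a user event.  Until some
    * other thread calls clSetUserEventStatus(user_ev, CL_COMPLETE), the
    * copy cannot run, so the blocking call must not return early. */
   static cl_int blocking_write_after_user_event(cl_command_queue q,
                                                 cl_mem buf,
                                                 const void *host_ptr,
                                                 size_t size,
                                                 cl_event user_ev)
   {
      cl_event deps[] = { user_ev };
      return clEnqueueWriteBuffer(q, buf, CL_TRUE /* blocking */, 0, size,
                                  host_ptr, 1, deps, NULL);
   }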

Last time this issue came up (yeah, it's not the first time) I proposed
the patches below to add a mechanism to wait for the event action only;
feel free to include them as PATCH 0.1 and 0.2 of this series (it's been
a while, so they may no longer apply cleanly).
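As a rough idea of what such a mechanism might look like (a hypothetical
sketch with invented names, not the actual patches or clover classes):

   #include <condition_variable>
   #include <mutex>

   // Hypothetical event-like object: wait_signalled() blocks only until
   // the CPU-side action has run, while a full wait() would additionally
   // block on the associated GPU fence.
   class event_sketch {
   public:
      void action_executed() {
         std::lock_guard<std::mutex> lock(mutex_);
         signalled_ = true;
         cond_.notify_all();
      }

      void wait_signalled() {
         std::unique_lock<std::mutex> lock(mutex_);
         cond_.wait(lock, [this] { return signalled_; });
      }

   private:
      std::mutex mutex_;
      std::condition_variable cond_;
      bool signalled_ = false;
   };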

Thank you.
Jan Vesely
2017-08-04 04:30:48 UTC
Hi,

thanks for the detailed explanation. I indeed missed the writeBuffer part
in the specs.
Post by Francisco Jerez
These changes are somewhat redundant and potentially
performance-impacting, the reason is that in the OpenCL API,
clEnqueueWrite* commands are specified to block until the memory
provided by the application as origin can be reused safely (i.e. until
soft_copy_op()() runs), not necessarily until the transfer to graphics
memory has completed (which is what hard_event::wait() will wait for).
OTOH reads and maps as implemented by soft_copy_op and friends are
essentially blocking so the wait() call is redundant in most cases.
That explains a noticeable slowdown running piglit with these changes.
I'm not sure about the read part though. I expected it to be basically
a noop, but it changes behaviour.
Adding clGetEventInfo(CL_EVENT_COMMAND_EXECUTION_STATUS) after a
blocking read in one of the piglit tests surprisingly returns
CL_QUEUED.

The specs don't mention the use of events with blocking reads, but they do
say that "When the read command has completed, the contents of the
buffer that ptr points to can be used by the application." in the
non-blocking section. I'd say the expectation is for the event to be
CL_COMPLETE after a blocking read (at least beignet/pocl/intel-cpu-sdk
follow this).
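The check in question looks roughly like this (hypothetical handles, not
the actual piglit source):

   #include <CL/cl.h>

   /* Query the execution status immediately after a blocking read.  Other
    * implementations report CL_COMPLETE here; clover currently still
    * reports CL_QUEUED. */
   static cl_int status_after_blocking_read(cl_command_queue q, cl_mem buf,
                                            void *dst, size_t size)
   {
      cl_event ev;
      cl_int status = CL_QUEUED;

      clEnqueueReadBuffer(q, buf, CL_TRUE /* blocking */, 0, size, dst,
                          0, NULL, &ev);
      clGetEventInfo(ev, CL_EVENT_COMMAND_EXECUTION_STATUS,
                     sizeof(status), &status, NULL);
      clReleaseEvent(ev);
      return status;
   }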
Post by Francisco Jerez
The only reason why it might be useful to behave differently on blocking
transfers is that the application may have specified a user event in the
event dependency list, which may cause the soft_copy_op() call to be
delayed until the application signals the user event. In order to fix
that it should really only be necessary to wait for the event action to
be executed, not necessarily its associated GPU work.
I think that another use is that non-blocking writes do not create an
extra copy of the buffer. Thus
clEnqueueWriteBuffer(...,cl_false, ev, ...)
clWaitForEvents(ev)
is more memory efficient.
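Spelled out as a sketch (hypothetical handles, just to illustrate the
pattern):

   #include <CL/cl.h>

   /* Non-blocking write followed by an explicit wait: the runtime need not
    * keep a private copy of host_ptr, and the application regains ownership
    * of host_ptr once clWaitForEvents() returns. */
   static void write_then_wait(cl_command_queue q, cl_mem buf,
                               const void *host_ptr, size_t size)
   {
      cl_event ev;

      clEnqueueWriteBuffer(q, buf, CL_FALSE /* non-blocking */, 0, size,
                           host_ptr, 0, NULL, &ev);
      /* ... enqueue other work that does not touch host_ptr ... */
      clWaitForEvents(1, &ev);
      clReleaseEvent(ev);
   }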
Post by Francisco Jerez
Last time this issue came up (yeah it's not the first time) I proposed
the patches below to add a mechanism to wait for the event action only,
feel free to include it as PATCH 0.1 and 0.2 of this series (it's been a
while so they may no longer apply cleanly).
I think we can just add comments explaining why the blocking argument
is ignored, until someone chooses to fix this problem and/or to
implement proper non-blocking variants (would std::async work for
trivial cases like ReadBuffer?)

thanks,
Jan
--
Jan Vesely <***@rutgers.edu>
Francisco Jerez
2017-08-05 19:26:32 UTC
Post by Jan Vesely
That explains a noticeable slowdown running piglit with these changes.
I'm not sure about the read part though. I expected it to be basically
a noop, but it changes behaviour.
I think this change would have slowed you down the most whenever the
mapping operation performed by soft_copy_op() is able to proceed
immediately, either because the buffer is idle (so the driver doesn't
stall on transfer_map()) *or* because the driver is trying to be smart
and creates a bounce buffer where data can be copied into immediately
without stalling, then inserts a pipelined GPU copy from the bounce
buffer into the real buffer. With this patch you will stall on the GPU
copy regardless (and whatever other work was already on the command
stream which might be substantial), even though it wouldn't have been
necessary in any of these cases.
Post by Jan Vesely
Adding clGetEventInfo(CL_EVENT_COMMAND_EXECUTION_STATUS) after a
blocking read in one of the piglit tests surprisingly returns
CL_QUEUED.
Hmm, yeah, that seems kind of debatable behaviour, although it's
definitely legit for writes, not quite sure for reads... I believe the
reason why that happens is because the CPU copy proceeds very quickly
(due to the reasons mentioned in the last paragraph), but the hard_event
is still associated with a pipe_fence synchronous with the GPU's command
stream, so it won't get signalled until the GPU catches up.
Post by Jan Vesely
I think we can just add comments explaining why the blocking argument
is ignored, until someone chooses to fix this problem
I think the problem is definitely worth fixing, and it shouldn't really
take more effort than adding comments explaining the current behaviour
;). Basically, just add a bunch of 'if (blocking)
hev().wait_signalled();' where the spec requires it, roughly as you had
been doing in this patch, but wait_signalled() should only stall on the
CPU work associated with the event, which should give you the same
performance as the current approach.
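In the context of this patch that would amount to something like the
following sketch (wait_signalled() being the entry point proposed in
those earlier patches; it is not in the tree here):

      // Sketch only: block on the CPU-side event action, not the GPU fence.
      if (blocking)
         hev().wait_signalled();

      ret_object(rd_ev, hev);
      return CL_SUCCESS;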
Francisco Jerez
2017-08-05 19:34:55 UTC
Post by Jan Vesely
and/or to
implement proper non-blocking variants (would std::async work for
trivial cases like ReadBuffer?)
Hm, and to answer this question -- Yeah, std::async would probably work,
but I'm not certain whether it would actually perform better than the
current approach, because on the one hand the actual DMA-ing of the
buffer is likely to happen quasi-asynchronously already assuming the
driver is competent, and OTOH because spawning a new thread for the copy
would introduce additional overhead that might defeat your purpose
unless the copy is very large -- Only experimentation will tell whether
it pays off.
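As a purely speculative sketch of that idea (invented helper, not clover
code):

   #include <cstring>
   #include <future>

   // Offload the host-side copy to a worker thread; the returned future
   // could back a CL event that signals once the memcpy finishes.  Whether
   // this beats the current synchronous path would need measurement, for
   // the reasons given above.
   static std::future<void> async_host_copy(void *dst, const void *src,
                                            size_t size)
   {
      return std::async(std::launch::async, [=] {
         std::memcpy(dst, src, size);
      });
   }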
Jan Vesely
2017-08-12 20:34:47 UTC
Post by Francisco Jerez
Hmm, yeah, that seems kind of debatable behaviour, although it's
definitely legit for writes, not quite sure for reads... I believe the
reason why that happens is because the CPU copy proceeds very quickly
(due to the reasons mentioned in the last paragraph), but the hard_event
is still associated with a pipe_fence synchronous with the GPU's command
stream, so it won't get signalled until the GPU catches up.
Hi,
sorry for the delay, last week was submission week...

The part that I'm still missing is what kind of GPU work needs to be
done after clEnqueueRead*(). I assume all necessary work is completed
before I can access the data.
Also, the CL_QUEUED status was surprising; since we performed at least
some of the work (we got the data), I'd expect at least CL_RUNNING or
CL_SUBMITTED.
Post by Francisco Jerez
I think the problem is definitely worth fixing, and it shouldn't really
take more effort than adding comments explaining the current behaviour
;), basically just add a bunch of 'if (blocking)
hev().wait_signalled();' where the spec requires it, roughly as you had
been doing in this patch, but wait_signalled() should only stall on the
CPU work associated with the event, which should give you the same
performance as the current approach.
I can send a patch that replaces wait() -> wait_signalled()
Post by Francisco Jerez
Post by Francisco Jerez
Post by Jan Vesely
and/or to
implement proper non-blocking variants (would std::async work for
trivial cases like ReadBuffer?)
Hm, and to answer this question -- Yeah, std::async would probably work,
but I'm not certain whether it would actually perform better than the
current approach, because on the one hand the actual DMA-ing of the
buffer is likely to happen quasi-asynchronously already assuming the
driver is competent, and OTOH because spawning a new thread for the copy
would introduce additional overhead that might defeat your purpose
unless the copy is very large -- Only experimentation will tell whether
it pays off.
It was just speculation. It looks like Vedran is interested in
implementing non-blocking reads/writes [0], so I'll leave it to him. r600
has bigger problems elsewhere atm.

thanks,
Jan

[0]https://bugs.freedesktop.org/show_bug.cgi?id=100199
--
Jan Vesely <***@rutgers.edu>
Francisco Jerez
2017-08-13 03:14:50 UTC
Post by Jan Vesely
Hi,
sorry for the delay, last week was submission week...
No worries.
Post by Jan Vesely
The part that I'm still missing is what kind of GPU work needs to be
done after clEnqueueRead*(). I assume all necessary work is completed
before I can access the data.
Also CL_QUEUED status was surprising. since we performed at least some
of the work (we got the data), I'd expect CL_RUNNING or CL_SUBMITTED at
least.
The lag is not due to GPU work that needs to be performed after the
clEnqueueRead call, but due to GPU work that may precede it in the
command stream: because clover doesn't know when the buffer was last
referenced by GPU work, the event is associated with a fence synchronous
with the current batch (which obviously won't signal before the GPU work
that actually referenced the buffer completes). However, the pipe driver
has a more accurate idea of when the buffer was last used, so the
transfer_map() operation is likely to complete sooner than the CL event
status changes to CL_COMPLETE.
The reason you're seeing CL_QUEUED rather than CL_SUBMITTED is that
the read operation didn't even need to flush the current batch, so
there's no fence associated with the CL event object yet.
Post by Jan Vesely
I can send a patch that replaces wait() -> wait_signalled()
Thanks :)
Post by Jan Vesely
it was just a speculation. it looks like Vedran is interested in
implementing non-blocking reads/writes[0] so I'll leave it to him. r600
has bigger problems elsewhere atm.
Yeah, I'm aware of his work, I suspect the above are the reasons why he
got rather mixed performance results from his changes.
Aaron Watry
2017-08-14 15:13:52 UTC
Post by Francisco Jerez
The lag is not due to GPU work that needs to be performed after the
clEnqueueRead call, but due to GPU work that may precede it in the
command stream: Because clover doesn't know when the last time was that
the buffer was referenced by GPU work, the event is associated with a
fence synchronous with the current batch (which obviously won't signal
before any of the GPU work that actually referenced the buffer
completes). However the pipe driver has a more accurate idea of when
the buffer was used last, so the transfer_map() operation is likely to
complete more quickly than the CL event status changes to CL_COMPLETE.
The reason you're seeing CL_QUEUED rather than CL_SUBMITTED is because
the read operation didn't even need to flush the current batch, so
there's no fence associated with the CL event object yet.
Speaking of event status issues, I've been sitting on the attached
patch (and some others) until my current series dealing with language
versions is resolved.

Basically, our clSetEventCallback implementation is ignoring several
event statuses that *should* be triggering the callbacks, and is
instead generating errors which cause CTS failures.
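For context, the spec allows callbacks to be registered for any of
CL_SUBMITTED, CL_RUNNING and CL_COMPLETE; a hypothetical snippet (not
the attached patch):

   #include <CL/cl.h>
   #include <stdio.h>

   /* Callback that should fire when the event reaches (or has passed) the
    * registered status. */
   static void CL_CALLBACK notify(cl_event ev, cl_int status, void *user_data)
   {
      printf("event %p reached status %d\n", (void *)ev, status);
   }

   /* All three registrations are valid; none of them should make
    * clSetEventCallback return an error. */
   static void register_callbacks(cl_event ev)
   {
      clSetEventCallback(ev, CL_SUBMITTED, notify, NULL);
      clSetEventCallback(ev, CL_RUNNING,   notify, NULL);
      clSetEventCallback(ev, CL_COMPLETE,  notify, NULL);
   }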

--Aaron
Francisco Jerez
2017-08-14 18:53:59 UTC
Reply
Permalink
Raw Message
Post by Aaron Watry
Post by Francisco Jerez
Post by Jan Vesely
Post by Francisco Jerez
Post by Francisco Jerez
Post by Jan Vesely
Hi,
thanks for detailed explanation. I indeed missed the writeBuffer part
in specs.
Post by Francisco Jerez
These changes are somewhat redundant and potentially
performance-impacting, the reason is that in the OpenCL API,
clEnqueueWrite* commands are specified to block until the memory
provided by the application as origin can be reused safely (i.e. until
soft_copy_op()() runs), not necessarily until the transfer to graphics
memory has completed (which is what hard_event::wait() will wait for).
OTOH reads and maps as implemented by soft_copy_op and friends are
essentially blocking so the wait() call is redundant in most cases.
That explains a noticeable slowdown running piglit with these changes.
I'm not sure about the read part though. I expected it to be basically
a noop, but it changes behaviour.
I think this change would have slowed you down the most whenever the
mapping operation performed by soft_copy_op() is able to proceed
immediately, either because the buffer is idle (so the driver doesn't
stall on transfer_map()) *or* because the driver is trying to be smart
and creates a bounce buffer where data can be copied into immediately
without stalling, then inserts a pipelined GPU copy from the bounce
buffer into the real buffer. With this patch you will stall on the GPU
copy regardless (and whatever other work was already on the command
stream which might be substantial), even though it wouldn't have been
necessary in any of these cases.
Post by Jan Vesely
Adding clGetEventInfo(CL_EVENT_COMMAND_EXECUTION_STATUS) after a
blocking read in one of the piglit tests surprisingly returns
CL_QUEUED.
Hmm, yeah, that seems kind of debatable behaviour, although it's
definitely legit for writes, not quite sure for reads... I believe the
reason why that happens is because the CPU copy proceeds very quickly
(due to the reasons mentioned in the last paragraph), but the hard_event
is still associated with a pipe_fence synchronous with the GPU's command
stream, so it won't get signalled until the GPU catches up.
Hi,
sorry for the delay, last week was submission week...
No worries.
Post by Jan Vesely
The part that I'm still missing is what kind of GPU work needs to be
done after clEnqueueRead*(). I assume all necessary work is completed
before I can access the data.
Also CL_QUEUED status was surprising. since we performed at least some
of the work (we got the data), I'd expect CL_RUNNING or CL_SUBMITTED at
least.
The lag is not due to GPU work that needs to be performed after the
clEnqueueRead call, but due to GPU work that may precede it in the
command stream: Because clover doesn't know when the last time was that
the buffer was referenced by GPU work, the event is associated with a
fence synchronous with the current batch (which obviously won't signal
before any of the GPU work that actually referenced the buffer
completes). However the pipe driver has a more accurate idea of when
the buffer was used last, so the transfer_map() operation is likely to
complete more quickly than the CL event status changes to CL_COMPLETE.
The reason you're seeing CL_QUEUED rather than CL_SUBMITTED is because
the read operation didn't even need to flush the current batch, so
there's no fence associated with the CL event object yet.
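That is roughly the situation the piglit experiment above runs into
(hypothetical queue/buffer names):

    cl_event ev;
    cl_int status;

    /* Blocking read: the host copy has finished by the time this returns. */
    clEnqueueReadBuffer(queue, buf, CL_TRUE, 0, size, host_ptr, 0, NULL, &ev);

    /* The event may nevertheless still report CL_QUEUED, because the batch
     * was never flushed and no fence has been associated with it yet. */
    clGetEventInfo(ev, CL_EVENT_COMMAND_EXECUTION_STATUS,
                   sizeof(status), &status, NULL);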
Speaking of event status issues, I've been sitting on the attached
patch (and some others) until my current series dealing with language
versions is dealt with.
Basically, our clSetEventCallback implementation is ignoring several
event statuses that *should* be triggering the callbacks, and is
instead generating errors which cause CTS failures.
--Aaron
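For reference, the kind of application usage the CTS exercises looks roughly
like this (hypothetical callback and event names); only the CL_COMPLETE
registration is currently accepted by clover:

    void CL_CALLBACK on_status(cl_event e, cl_int status, void *user_data) {
       /* record the status transition */
    }

    /* The 1.2 spec allows registering for these three statuses: */
    clSetEventCallback(ev, CL_SUBMITTED, on_status, NULL);
    clSetEventCallback(ev, CL_RUNNING, on_status, NULL);
    clSetEventCallback(ev, CL_COMPLETE, on_status, NULL);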
Post by Francisco Jerez
Post by Jan Vesely
Post by Francisco Jerez
Post by Francisco Jerez
Post by Jan Vesely
The specs don't mention use of events with blocking read, but it does
say that "When the read command has completed, the contents of the
buffer that ptr points to can be used by the application." in the non-
blocking section. I'd say that the expectation is for the event to be
CL_COMPLETE after blocking read (at least beignet/pocl/intel-cpu-sdk
follow this).
Post by Francisco Jerez
The only reason why it might be useful to behave differently on blocking
transfers is that the application may have specified a user event in the
event dependency list, which may cause the soft_copy_op() call to be
delayed until the application signals the user event. In order to fix
that it should really only be necessary to wait for the event action to
be executed, not necessarily its associated GPU work.
I think that another use is that non-blocking writes do not create an
extra copy of the buffer. Thus
clEnqueueWriteBuffer(...,cl_false, ev, ...)
clWaitForEvents(ev)
is more memory efficient.
Post by Francisco Jerez
Last time this issue came up (yeah it's not the first time) I proposed
the patches below to add a mechanism to wait for the event action only,
feel free to include it as PATCH 0.1 and 0.2 of this series (it's been a
while so they may no longer apply cleanly).
I think we can just add comments explaining why the blocking argument
is ignored, until someone chooses to fix this problem
I think the problem is definitely worth fixing, and it shouldn't really
take more effort than adding comments explaining the current behaviour
;), basically just add a bunch of 'if (blocking)
hev().wait_signalled();' where the spec requires it, roughly as you had
been doing in this patch, but wait_signalled() should only stall on the
CPU work associated with the event, which should give you the same
performance as the current approach.
I can send a patch that replaces wait() -> wait_signalled()
Thanks :)
Post by Jan Vesely
Post by Francisco Jerez
Post by Francisco Jerez
Post by Jan Vesely
and/or to
implement proper non-blocking variants (would std::async work for
trivial cases like ReadBuffer?)
Hm, and to answer this question -- Yeah, std::async would probably work,
but I'm not certain whether it would actually perform better than the
current approach, because on the one hand the actual DMA-ing of the
buffer is likely to happen quasi-asynchronously already assuming the
driver is competent, and OTOH because spawning a new thread for the copy
would introduce additional overhead that might defeat your purpose
unless the copy is very large -- Only experimentation will tell whether
it pays off.
it was just a speculation. it looks like Vedran is interested in
implementing non-blocking reads/writes[0] so I'll leave it to him. r600
has bigger problems elsewhere atm.
Yeah, I'm aware of his work, I suspect the above are the reasons why he
got rather mixed performance results from his changes.
Post by Jan Vesely
thanks,
Jan
[0]https://bugs.freedesktop.org/show_bug.cgi?id=100199
Post by Francisco Jerez
Post by Francisco Jerez
Post by Jan Vesely
thanks,
Jan
Post by Francisco Jerez
Thank you.
Post by Jan Vesely
v2: wait in map_buffer and map_image as well
---
Hi Aaron,
yes, I think you're right, we should wait in Map* as well.
If nothing else it's consistent, even if passing the flag to
add_map might make it unnecessary (haven't actually checked).
thanks,
Jan
src/gallium/state_trackers/clover/api/transfer.cpp | 30 ++++++++++++++++++++--
1 file changed, 28 insertions(+), 2 deletions(-)
diff --git a/src/gallium/state_trackers/clover/api/transfer.cpp b/src/gallium/state_trackers/clover/api/transfer.cpp
index f7046253be..729a34590e 100644
--- a/src/gallium/state_trackers/clover/api/transfer.cpp
+++ b/src/gallium/state_trackers/clover/api/transfer.cpp
@@ -295,6 +295,9 @@ clEnqueueReadBuffer(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
&mem, obj_origin, obj_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -325,6 +328,9 @@ clEnqueueWriteBuffer(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
ptr, {}, obj_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -362,6 +368,9 @@ clEnqueueReadBufferRect(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
&mem, obj_origin, obj_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -399,6 +408,9 @@ clEnqueueWriteBufferRect(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
ptr, host_origin, host_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -504,6 +516,9 @@ clEnqueueReadImage(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
&img, src_origin, src_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -538,6 +553,9 @@ clEnqueueWriteImage(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
ptr, {}, src_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -667,7 +685,11 @@ clEnqueueMapBuffer(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
void *map = mem.resource(q).add_map(q, flags, blocking, obj_origin, region);
- ret_object(rd_ev, create<hard_event>(q, CL_COMMAND_MAP_BUFFER, deps));
+ auto hev = create<hard_event>(q, CL_COMMAND_MAP_BUFFER, deps);
+ if (blocking)
+ hev().wait();
+
+ ret_object(rd_ev, hev);
ret_error(r_errcode, CL_SUCCESS);
return map;
@@ -695,7 +717,11 @@ clEnqueueMapImage(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
void *map = img.resource(q).add_map(q, flags, blocking, origin, region);
- ret_object(rd_ev, create<hard_event>(q, CL_COMMAND_MAP_IMAGE, deps));
+ auto hev = create<hard_event>(q, CL_COMMAND_MAP_IMAGE, deps);
+ if (blocking)
+ hev().wait();
+
+ ret_object(rd_ev, hev);
ret_error(r_errcode, CL_SUCCESS);
return map;
--
2.13.3
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
--
--
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
From ef827d9b06c2061d9eb198f202399d90ea261208 Mon Sep 17 00:00:00 2001
Date: Thu, 3 Aug 2017 20:55:18 -0500
Subject: [PATCH] clover/event: Include additional event statuses for
clSetEventCallback
The registered callback function will be called when the execution
status of command associated with event changes to an execution
status equal to or past the status specified by command_exec_status.
CL_COMPLETE is equal to or past any of: submitted/queued/running.
That quotation doesn't really imply that other event status codes should
be accepted. In fact the same section of the same CL spec claims:

"clSetEventCallback returns CL_SUCCESS if the function is executed
successfully. Otherwise, it returns one of the following errors: [..]
CL_INVALID_VALUE if [..] command_exec_callback_type is not CL_COMPLETE."

Is the spec contradicting itself?
Post by Aaron Watry
Fixes: OpenCL CTS test_conformance/events/test_events callbacks
---
src/gallium/state_trackers/clover/api/event.cpp | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/src/gallium/state_trackers/clover/api/event.cpp b/src/gallium/state_trackers/clover/api/event.cpp
index 5d1a0e52c5..bb7f6ed9f0 100644
--- a/src/gallium/state_trackers/clover/api/event.cpp
+++ b/src/gallium/state_trackers/clover/api/event.cpp
@@ -126,7 +126,10 @@ clSetEventCallback(cl_event d_ev, cl_int type,
void *user_data) try {
auto &ev = obj(d_ev);
- if (!pfn_notify || type != CL_COMPLETE)
+ if (!pfn_notify ||
+ (type != CL_COMPLETE && type != CL_SUBMITTED &&
+ type != CL_QUEUED && type != CL_RUNNING
Redundant line break. Also I don't think CL_QUEUED should be accepted.
Post by Aaron Watry
+ ))
throw error(CL_INVALID_VALUE);
// Create a temporary soft event that depends on ev, with
--
2.11.0
Aaron Watry
2017-08-14 21:29:42 UTC
Reply
Permalink
Raw Message
Post by Francisco Jerez
Post by Aaron Watry
Post by Francisco Jerez
Post by Jan Vesely
Post by Francisco Jerez
Post by Francisco Jerez
Post by Jan Vesely
Hi,
thanks for detailed explanation. I indeed missed the writeBuffer part
in specs.
Post by Francisco Jerez
These changes are somewhat redundant and potentially
performance-impacting, the reason is that in the OpenCL API,
clEnqueueWrite* commands are specified to block until the memory
provided by the application as origin can be reused safely (i.e. until
soft_copy_op()() runs), not necessarily until the transfer to graphics
memory has completed (which is what hard_event::wait() will wait for).
OTOH reads and maps as implemented by soft_copy_op and friends are
essentially blocking so the wait() call is redundant in most cases.
That explains a noticeable slowdown running piglit with these changes.
I'm not sure about the read part though. I expected it to be basically
a noop, but it changes behaviour.
I think this change would have slowed you down the most whenever the
mapping operation performed by soft_copy_op() is able to proceed
immediately, either because the buffer is idle (so the driver doesn't
stall on transfer_map()) *or* because the driver is trying to be smart
and creates a bounce buffer where data can be copied into immediately
without stalling, then inserts a pipelined GPU copy from the bounce
buffer into the real buffer. With this patch you will stall on the GPU
copy regardless (and whatever other work was already on the command
stream which might be substantial), even though it wouldn't have been
necessary in any of these cases.
Post by Jan Vesely
Adding clGetEventInfo(CL_EVENT_COMMAND_EXECUTION_STATUS) after a
blocking read in one of the piglit tests surprisingly returns
CL_QUEUED.
Hmm, yeah, that seems kind of debatable behaviour, although it's
definitely legit for writes, not quite sure for reads... I believe the
reason why that happens is because the CPU copy proceeds very quickly
(due to the reasons mentioned in the last paragraph), but the hard_event
is still associated with a pipe_fence synchronous with the GPU's command
stream, so it won't get signalled until the GPU catches up.
Hi,
sorry for the delay, last week was submission week...
No worries.
Post by Jan Vesely
The part that I'm still missing is what kind of GPU work needs to be
done after clEnqueueRead*(). I assume all necessary work is completed
before I can access the data.
Also CL_QUEUED status was surprising. since we performed at least some
of the work (we got the data), I'd expect CL_RUNNING or CL_SUBMITTED at
least.
The lag is not due to GPU work that needs to be performed after the
clEnqueueRead call, but due to GPU work that may precede it in the
command stream: Because clover doesn't know when the last time was that
the buffer was referenced by GPU work, the event is associated with a
fence synchronous with the current batch (which obviously won't signal
before any of the GPU work that actually referenced the buffer
completes). However the pipe driver has a more accurate idea of when
the buffer was used last, so the transfer_map() operation is likely to
complete more quickly than the CL event status changes to CL_COMPLETE.
The reason you're seeing CL_QUEUED rather than CL_SUBMITTED is because
the read operation didn't even need to flush the current batch, so
there's no fence associated with the CL event object yet.
Speaking of event status issues, I've been sitting on the attached
patch (and some others) until my current series dealing with language
versions is dealt with.
Basically, our clSetEventCallback implementation is ignoring several
event statuses that *should* be triggering the callbacks, and is
instead generating errors which cause CTS failures.
--Aaron
Post by Francisco Jerez
Post by Jan Vesely
Post by Francisco Jerez
Post by Francisco Jerez
Post by Jan Vesely
The specs don't mention use of events with blocking read, but it does
say that "When the read command has completed, the contents of the
buffer that ptr points to can be used by the application." in the non-
blocking section. I'd say that the expectation is for the event to be
CL_COMPLETE after blocking read (at least beignet/pocl/intel-cpu-sdk
follow this).
Post by Francisco Jerez
The only reason why it might be useful to behave differently on blocking
transfers is that the application may have specified a user event in the
event dependency list, which may cause the soft_copy_op() call to be
delayed until the application signals the user event. In order to fix
that it should really only be necessary to wait for the event action to
be executed, not necessarily its associated GPU work.
I think that another use is that non-blocking writes do not create an
extra copy of the buffer. Thus
clEnqueueWriteBuffer(...,cl_false, ev, ...)
clWaitForEvents(ev)
is more memory efficient.
Post by Francisco Jerez
Last time this issue came up (yeah it's not the first time) I proposed
the patches below to add a mechanism to wait for the event action only,
feel free to include it as PATCH 0.1 and 0.2 of this series (it's been a
while so they may no longer apply cleanly).
I think we can just add comments explaining why the blocking argument
is ignored, until someone chooses to fix this problem
I think the problem is definitely worth fixing, and it shouldn't really
take more effort than adding comments explaining the current behaviour
;), basically just add a bunch of 'if (blocking)
hev().wait_signalled();' where the spec requires it, roughly as you had
been doing in this patch, but wait_signalled() should only stall on the
CPU work associated with the event, which should give you the same
performance as the current approach.
I can send a patch that replaces wait() -> wait_signalled()
Thanks :)
Post by Jan Vesely
Post by Francisco Jerez
Post by Francisco Jerez
Post by Jan Vesely
and/or to
implement proper non-blocking variants (would std::async work for
trivial cases like ReadBuffer?)
Hm, and to answer this question -- Yeah, std::async would probably work,
but I'm not certain whether it would actually perform better than the
current approach, because on the one hand the actual DMA-ing of the
buffer is likely to happen quasi-asynchronously already assuming the
driver is competent, and OTOH because spawning a new thread for the copy
would introduce additional overhead that might defeat your purpose
unless the copy is very large -- Only experimentation will tell whether
it pays off.
it was just a speculation. it looks like Vedran is interested in
implementing non-blocking reads/writes[0] so I'll leave it to him. r600
has bigger problems elsewhere atm.
Yeah, I'm aware of his work, I suspect the above are the reasons why he
got rather mixed performance results from his changes.
Post by Jan Vesely
thanks,
Jan
[0]https://bugs.freedesktop.org/show_bug.cgi?id=100199
Post by Francisco Jerez
Post by Francisco Jerez
Post by Jan Vesely
thanks,
Jan
Post by Francisco Jerez
Thank you.
Post by Jan Vesely
v2: wait in map_buffer and map_image as well
---
Hi Aaron,
yes, I think you're right, we should wait in Map* as well.
If nothing else it's consistent, even if passing the flag to
add_map might make it unnecessary (haven't actually checked).
thanks,
Jan
src/gallium/state_trackers/clover/api/transfer.cpp | 30 ++++++++++++++++++++--
1 file changed, 28 insertions(+), 2 deletions(-)
diff --git a/src/gallium/state_trackers/clover/api/transfer.cpp b/src/gallium/state_trackers/clover/api/transfer.cpp
index f7046253be..729a34590e 100644
--- a/src/gallium/state_trackers/clover/api/transfer.cpp
+++ b/src/gallium/state_trackers/clover/api/transfer.cpp
@@ -295,6 +295,9 @@ clEnqueueReadBuffer(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
&mem, obj_origin, obj_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -325,6 +328,9 @@ clEnqueueWriteBuffer(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
ptr, {}, obj_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -362,6 +368,9 @@ clEnqueueReadBufferRect(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
&mem, obj_origin, obj_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -399,6 +408,9 @@ clEnqueueWriteBufferRect(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
ptr, host_origin, host_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -504,6 +516,9 @@ clEnqueueReadImage(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
&img, src_origin, src_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -538,6 +553,9 @@ clEnqueueWriteImage(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
ptr, {}, src_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -667,7 +685,11 @@ clEnqueueMapBuffer(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
void *map = mem.resource(q).add_map(q, flags, blocking, obj_origin, region);
- ret_object(rd_ev, create<hard_event>(q, CL_COMMAND_MAP_BUFFER, deps));
+ auto hev = create<hard_event>(q, CL_COMMAND_MAP_BUFFER, deps);
+ if (blocking)
+ hev().wait();
+
+ ret_object(rd_ev, hev);
ret_error(r_errcode, CL_SUCCESS);
return map;
@@ -695,7 +717,11 @@ clEnqueueMapImage(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
void *map = img.resource(q).add_map(q, flags, blocking, origin, region);
- ret_object(rd_ev, create<hard_event>(q, CL_COMMAND_MAP_IMAGE, deps));
+ auto hev = create<hard_event>(q, CL_COMMAND_MAP_IMAGE, deps);
+ if (blocking)
+ hev().wait();
+
+ ret_object(rd_ev, hev);
ret_error(r_errcode, CL_SUCCESS);
return map;
--
2.13.3
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
--
--
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
From ef827d9b06c2061d9eb198f202399d90ea261208 Mon Sep 17 00:00:00 2001
Date: Thu, 3 Aug 2017 20:55:18 -0500
Subject: [PATCH] clover/event: Include additional event statuses for
clSetEventCallback
The registered callback function will be called when the execution
status of command associated with event changes to an execution
status equal to or past the status specified by command_exec_status.
CL_COMPLETE is equal to or past any of: submitted/queued/running.
That quotation doesn't really imply that other event status codes should
be accepted. In fact the same section of the same CL spec claims:
"clSetEventCallback returns CL_SUCCESS if the function is executed
successfully. Otherwise, it returns one of the following errors: [..]
CL_INVALID_VALUE if [..] command_exec_callback_type is not CL_COMPLETE."
Is the spec contradicting itself?
I think it might be. The quote that you have from above (page 184 of
the 1.2 spec) indicates that CL_COMPLETE is the only valid status in
this case, but if you check out the previous page (183):

command_exec_callback_type is described as:

specifies the command execution status for which the callback is
registered. The command execution callback values for which a callback
can be registered are: CL_SUBMITTED, CL_RUNNING or CL_COMPLETE[20].
There is no guarantee that the callback functions registered for
various execution status values for an event will be called in the
exact order that the execution status of a command changes.
Furthermore, it should be noted that receiving a call back for an
event with a status other than CL_COMPLETE, in no way implies that the
memory model or execution model as defined by the OpenCL specification
has changed. For example, it is not valid to assume that a
corresponding memory transfer has completed unless the event is in a
state CL_COMPLETE.

Footnote 20 is:

The callback function registered for a command_exec_callback_type
value of CL_COMPLETE will be called when the command has completed
successfully or is abnormally terminated.
Post by Francisco Jerez
Post by Aaron Watry
Fixes: OpenCL CTS test_conformance/events/test_events callbacks
---
src/gallium/state_trackers/clover/api/event.cpp | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/src/gallium/state_trackers/clover/api/event.cpp b/src/gallium/state_trackers/clover/api/event.cpp
index 5d1a0e52c5..bb7f6ed9f0 100644
--- a/src/gallium/state_trackers/clover/api/event.cpp
+++ b/src/gallium/state_trackers/clover/api/event.cpp
@@ -126,7 +126,10 @@ clSetEventCallback(cl_event d_ev, cl_int type,
void *user_data) try {
auto &ev = obj(d_ev);
- if (!pfn_notify || type != CL_COMPLETE)
+ if (!pfn_notify ||
+ (type != CL_COMPLETE && type != CL_SUBMITTED &&
+ type != CL_QUEUED && type != CL_RUNNING
Redundant line break. Also I don't think CL_QUEUED should be accepted.
Yeah, you're right about CL_QUEUED. I'll remove that before submitting
to the ML. Just to note: The CTS for 1.2 does specifically test for
CL_SUBMITTED/CL_RUNNING/CL_COMPLETE.
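So the revised check would presumably end up along these lines (a sketch,
not the submitted patch):

    if (!pfn_notify ||
        (type != CL_COMPLETE && type != CL_SUBMITTED && type != CL_RUNNING))
       throw error(CL_INVALID_VALUE);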

Regarding the line break, I can remove it. I just like to line up my
opening/closing parentheses that way, which also happens to be
consistent with the programmatically-enforced coding standards for
what I do in my day job. Some habits are hard to break.

--Aaron
Post by Francisco Jerez
Post by Aaron Watry
+ ))
throw error(CL_INVALID_VALUE);
// Create a temporary soft event that depends on ev, with
--
2.11.0
Aaron Watry
2017-08-15 00:38:59 UTC
Reply
Permalink
Raw Message
Post by Aaron Watry
Post by Francisco Jerez
Post by Aaron Watry
Post by Francisco Jerez
Post by Jan Vesely
Post by Francisco Jerez
Post by Francisco Jerez
Post by Jan Vesely
Hi,
thanks for detailed explanation. I indeed missed the writeBuffer part
in specs.
Post by Francisco Jerez
These changes are somewhat redundant and potentially
performance-impacting, the reason is that in the OpenCL API,
clEnqueueWrite* commands are specified to block until the memory
provided by the application as origin can be reused safely (i.e. until
soft_copy_op()() runs), not necessarily until the transfer to graphics
memory has completed (which is what hard_event::wait() will wait for).
OTOH reads and maps as implemented by soft_copy_op and friends are
essentially blocking so the wait() call is redundant in most cases.
That explains a noticeable slowdown running piglit with these changes.
I'm not sure about the read part though. I expected it to be basically
a noop, but it changes behaviour.
I think this change would have slowed you down the most whenever the
mapping operation performed by soft_copy_op() is able to proceed
immediately, either because the buffer is idle (so the driver doesn't
stall on transfer_map()) *or* because the driver is trying to be smart
and creates a bounce buffer where data can be copied into immediately
without stalling, then inserts a pipelined GPU copy from the bounce
buffer into the real buffer. With this patch you will stall on the GPU
copy regardless (and whatever other work was already on the command
stream which might be substantial), even though it wouldn't have been
necessary in any of these cases.
Post by Jan Vesely
Adding clGetEventInfo(CL_EVENT_COMMAND_EXECUTION_STATUS) after a
blocking read in one of the piglit tests surprisingly returns
CL_QUEUED.
Hmm, yeah, that seems kind of debatable behaviour, although it's
definitely legit for writes, not quite sure for reads... I believe the
reason why that happens is because the CPU copy proceeds very quickly
(due to the reasons mentioned in the last paragraph), but the hard_event
is still associated with a pipe_fence synchronous with the GPU's command
stream, so it won't get signalled until the GPU catches up.
Hi,
sorry for the delay, last week was submission week...
No worries.
Post by Jan Vesely
The part that I'm still missing is what kind of GPU work needs to be
done after clEnqueueRead*(). I assume all necessary work is completed
before I can access the data.
Also CL_QUEUED status was surprising. since we performed at least some
of the work (we got the data), I'd expect CL_RUNNING or CL_SUBMITTED at
least.
The lag is not due to GPU work that needs to be performed after the
clEnqueueRead call, but due to GPU work that may precede it in the
command stream: Because clover doesn't know when the last time was that
the buffer was referenced by GPU work, the event is associated with a
fence synchronous with the current batch (which obviously won't signal
before any of the GPU work that actually referenced the buffer
completes). However the pipe driver has a more accurate idea of when
the buffer was used last, so the transfer_map() operation is likely to
complete more quickly than the CL event status changes to CL_COMPLETE.
The reason you're seeing CL_QUEUED rather than CL_SUBMITTED is because
the read operation didn't even need to flush the current batch, so
there's no fence associated with the CL event object yet.
Speaking of event status issues, I've been sitting on the attached
patch (and some others) until my current series dealing with language
versions is dealt with.
Basically, our clSetEventCallback implementation is ignoring several
event statuses that *should* be triggering the callbacks, and is
instead generating errors which cause CTS failures.
--Aaron
Post by Francisco Jerez
Post by Jan Vesely
Post by Francisco Jerez
Post by Francisco Jerez
Post by Jan Vesely
The specs don't mention use of events with blocking read, but it does
say that "When the read command has completed, the contents of the
buffer that ptr points to can be used by the application." in the non-
blocking section. I'd say that the expectation is for the event to be
CL_COMPLETE after blocking read (at least beignet/pocl/intel-cpu-sdk
follow this).
Post by Francisco Jerez
The only reason why it might be useful to behave differently on blocking
transfers is that the application may have specified a user event in the
event dependency list, which may cause the soft_copy_op() call to be
delayed until the application signals the user event. In order to fix
that it should really only be necessary to wait for the event action to
be executed, not necessarily its associated GPU work.
I think that another use is that non-blocking writes do not create an
extra copy of the buffer. Thus
clEnqueueWriteBuffer(...,cl_false, ev, ...)
clWaitForEvents(ev)
is more memory efficient.
Post by Francisco Jerez
Last time this issue came up (yeah it's not the first time) I proposed
the patches below to add a mechanism to wait for the event action only,
feel free to include it as PATCH 0.1 and 0.2 of this series (it's been a
while so they may no longer apply cleanly).
I think we can just add comments explaining why the blocking argument
is ignored, until someone chooses to fix this problem
I think the problem is definitely worth fixing, and it shouldn't really
take more effort than adding comments explaining the current behaviour
;), basically just add a bunch of 'if (blocking)
hev().wait_signalled();' where the spec requires it, roughly as you had
been doing in this patch, but wait_signalled() should only stall on the
CPU work associated with the event, which should give you the same
performance as the current approach.
I can send a patch that replaces wait() -> wait_signalled()
Thanks :)
Post by Jan Vesely
Post by Francisco Jerez
Post by Francisco Jerez
Post by Jan Vesely
and/or to
implement proper non-blocking variants (would std::async work for
trivial cases like ReadBuffer?)
Hm, and to answer this question -- Yeah, std::async would probably work,
but I'm not certain whether it would actually perform better than the
current approach, because on the one hand the actual DMA-ing of the
buffer is likely to happen quasi-asynchronously already assuming the
driver is competent, and OTOH because spawning a new thread for the copy
would introduce additional overhead that might defeat your purpose
unless the copy is very large -- Only experimentation will tell whether
it pays off.
it was just a speculation. it looks like Vedran is interested in
implementing non-blocking reads/writes[0] so I'll leave it to him. r600
has bigger problems elsewhere atm.
Yeah, I'm aware of his work, I suspect the above are the reasons why he
got rather mixed performance results from his changes.
Post by Jan Vesely
thanks,
Jan
[0]https://bugs.freedesktop.org/show_bug.cgi?id=100199
Post by Francisco Jerez
Post by Francisco Jerez
Post by Jan Vesely
thanks,
Jan
Post by Francisco Jerez
Thank you.
Post by Jan Vesely
v2: wait in map_buffer and map_image as well
---
Hi Aaron,
yes, I think you're right, we should wait in Map* as well.
If nothing else it's consistent, even if passing the flag to
add_map might make it unnecessary (haven't actually checked).
thanks,
Jan
src/gallium/state_trackers/clover/api/transfer.cpp | 30 ++++++++++++++++++++--
1 file changed, 28 insertions(+), 2 deletions(-)
diff --git a/src/gallium/state_trackers/clover/api/transfer.cpp b/src/gallium/state_trackers/clover/api/transfer.cpp
index f7046253be..729a34590e 100644
--- a/src/gallium/state_trackers/clover/api/transfer.cpp
+++ b/src/gallium/state_trackers/clover/api/transfer.cpp
@@ -295,6 +295,9 @@ clEnqueueReadBuffer(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
&mem, obj_origin, obj_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -325,6 +328,9 @@ clEnqueueWriteBuffer(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
ptr, {}, obj_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -362,6 +368,9 @@ clEnqueueReadBufferRect(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
&mem, obj_origin, obj_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -399,6 +408,9 @@ clEnqueueWriteBufferRect(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
ptr, host_origin, host_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -504,6 +516,9 @@ clEnqueueReadImage(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
&img, src_origin, src_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -538,6 +553,9 @@ clEnqueueWriteImage(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
ptr, {}, src_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -667,7 +685,11 @@ clEnqueueMapBuffer(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
void *map = mem.resource(q).add_map(q, flags, blocking, obj_origin, region);
- ret_object(rd_ev, create<hard_event>(q, CL_COMMAND_MAP_BUFFER, deps));
+ auto hev = create<hard_event>(q, CL_COMMAND_MAP_BUFFER, deps);
+ if (blocking)
+ hev().wait();
+
+ ret_object(rd_ev, hev);
ret_error(r_errcode, CL_SUCCESS);
return map;
@@ -695,7 +717,11 @@ clEnqueueMapImage(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
void *map = img.resource(q).add_map(q, flags, blocking, origin, region);
- ret_object(rd_ev, create<hard_event>(q, CL_COMMAND_MAP_IMAGE, deps));
+ auto hev = create<hard_event>(q, CL_COMMAND_MAP_IMAGE, deps);
+ if (blocking)
+ hev().wait();
+
+ ret_object(rd_ev, hev);
ret_error(r_errcode, CL_SUCCESS);
return map;
--
2.13.3
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
--
--
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
From ef827d9b06c2061d9eb198f202399d90ea261208 Mon Sep 17 00:00:00 2001
Date: Thu, 3 Aug 2017 20:55:18 -0500
Subject: [PATCH] clover/event: Include additional event statuses for
clSetEventCallback
The registered callback function will be called when the execution
status of command associated with event changes to an execution
status equal to or past the status specified by command_exec_status.
CL_COMPLETE is equal to or past any of: submitted/queued/running.
That quotation doesn't really imply that other event status codes should
be accepted. In fact the same section of the same CL spec claims:
"clSetEventCallback returns CL_SUCCESS if the function is executed
successfully. Otherwise, it returns one of the following errors: [..]
CL_INVALID_VALUE if [..] command_exec_callback_type is not CL_COMPLETE."
Is the spec contradicting itself?
I think it might be. The quote that you have from above (page 184 of
the 1.2 spec) indicates that CL_COMPLETE is the only valid status in
Looks like they clarified this in the CL 2.0 spec. CL_COMPLETE,
CL_SUBMITTED, CL_RUNNING are all valid values. CL_QUEUED is NOT in
that list.

--Aaron
Post by Aaron Watry
specifies the command execution status for which the callback is
registered. The command execution callback values for which a callback
can be registered are: CL_SUBMITTED, CL_RUNNING or CL_COMPLETE[20].
There is no guarantee that the callback functions registered for
various execution status values for an event will be called in the
exact order that the execution status of a command changes.
Furthermore, it should be noted that receiving a call back for an
event with a status other than CL_COMPLETE, in no way implies that the
memory model or execution model as defined by the OpenCL specification
has changed. For example, it is not valid to assume that a
corresponding memory transfer has completed unless the event is in a
state CL_COMPLETE.
The callback function registered for a command_exec_callback_type
value of CL_COMPLETE (footnote 20) will be called when the command has
completed successfully or is abnormally terminated.
Post by Francisco Jerez
Post by Aaron Watry
Fixes: OpenCL CTS test_conformance/events/test_events callbacks
---
src/gallium/state_trackers/clover/api/event.cpp | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/src/gallium/state_trackers/clover/api/event.cpp b/src/gallium/state_trackers/clover/api/event.cpp
index 5d1a0e52c5..bb7f6ed9f0 100644
--- a/src/gallium/state_trackers/clover/api/event.cpp
+++ b/src/gallium/state_trackers/clover/api/event.cpp
@@ -126,7 +126,10 @@ clSetEventCallback(cl_event d_ev, cl_int type,
void *user_data) try {
auto &ev = obj(d_ev);
- if (!pfn_notify || type != CL_COMPLETE)
+ if (!pfn_notify ||
+ (type != CL_COMPLETE && type != CL_SUBMITTED &&
+ type != CL_QUEUED && type != CL_RUNNING
Redundant line break. Also I don't think CL_QUEUED should be accepted.
Yeah, you're right about CL_QUEUED. I'll remove that before submitting
to the ML. Just to note: The CTS for 1.2 does specifically test for
CL_SUBMITTED/CL_RUNNING/CL_COMPLETE.
Regarding the line break, I can remove it. I just like to line up my
opening/closing parentheses that way, which also happens to be
consistent with the programmatically-enforced coding standards for
what I do in my day job. Some habits are hard to break.
--Aaron
Post by Francisco Jerez
Post by Aaron Watry
+ ))
throw error(CL_INVALID_VALUE);
// Create a temporary soft event that depends on ev, with
--
2.11.0
Jan Vesely
2017-08-15 03:01:30 UTC
Reply
Permalink
Raw Message
Post by Francisco Jerez
Post by Jan Vesely
Post by Francisco Jerez
Post by Francisco Jerez
Post by Jan Vesely
Hi,
thanks for detailed explanation. I indeed missed the writeBuffer part
in specs.
Post by Francisco Jerez
These changes are somewhat redundant and potentially
performance-impacting, the reason is that in the OpenCL API,
clEnqueueWrite* commands are specified to block until the memory
provided by the application as origin can be reused safely (i.e. until
soft_copy_op()() runs), not necessarily until the transfer to graphics
memory has completed (which is what hard_event::wait() will wait for).
OTOH reads and maps as implemented by soft_copy_op and friends are
essentially blocking so the wait() call is redundant in most cases.
That explains a noticeable slowdown running piglit with these changes.
I'm not sure about the read part though. I expected it to be basically
a noop, but it changes behaviour.
I think this change would have slowed you down the most whenever the
mapping operation performed by soft_copy_op() is able to proceed
immediately, either because the buffer is idle (so the driver doesn't
stall on transfer_map()) *or* because the driver is trying to be smart
and creates a bounce buffer where data can be copied into immediately
without stalling, then inserts a pipelined GPU copy from the bounce
buffer into the real buffer. With this patch you will stall on the GPU
copy regardless (and whatever other work was already on the command
stream which might be substantial), even though it wouldn't have been
necessary in any of these cases.
Post by Jan Vesely
Adding clGetEventInfo(CL_EVENT_COMMAND_EXECUTION_STATUS) after a
blocking read in one of the piglit tests surprisingly returns
CL_QUEUED.
Hmm, yeah, that seems kind of debatable behaviour, although it's
definitely legit for writes, not quite sure for reads... I believe the
reason why that happens is because the CPU copy proceeds very quickly
(due to the reasons mentioned in the last paragraph), but the hard_event
is still associated with a pipe_fence synchronous with the GPU's command
stream, so it won't get signalled until the GPU catches up.
Hi,
sorry for the delay, last week was submission week...
No worries.
Post by Jan Vesely
The part that I'm still missing is what kind of GPU work needs to be
done after clEnqueueRead*(). I assume all necessary work is completed
before I can access the data.
Also CL_QUEUED status was surprising. since we performed at least some
of the work (we got the data), I'd expect CL_RUNNING or CL_SUBMITTED at
least.
The lag is not due to GPU work that needs to be performed after the
clEnqueueRead call, but due to GPU work that may precede it in the
command stream: Because clover doesn't know when the last time was that
the buffer was referenced by GPU work, the event is associated with a
fence synchronous with the current batch (which obviously won't signal
before any of the GPU work that actually referenced the buffer
completes). However the pipe driver has a more accurate idea of when
the buffer was used last, so the transfer_map() operation is likely to
complete more quickly than the CL event status changes to CL_COMPLETE.
The reason you're seeing CL_QUEUED rather than CL_SUBMITTED is because
the read operation didn't even need to flush the current batch, so
there's no fence associated with the CL event object yet.
Thanks. So the event is waiting for the current batch, even if the
buffer access is done out of order. The question is, why do we use the
fence at all? If I understood correctly mapping the buffer will be
delayed by the pipe driver if needed, so we don't really need the
fence. Am I missing something?

Jan
Post by Francisco Jerez
Post by Jan Vesely
Post by Francisco Jerez
Post by Francisco Jerez
Post by Jan Vesely
The specs don't mention use of events with blocking read, but it does
say that "When the read command has completed, the contents of the
buffer that ptr points to can be used by the application." in the non-
blocking section. I'd say that the expectation is for the event to be
CL_COMPLETE after blocking read (at least beignet/pocl/intel-cpu-sdk
follow this).
Post by Francisco Jerez
The only reason why it might be useful to behave differently on blocking
transfers is that the application may have specified a user event in the
event dependency list, which may cause the soft_copy_op() call to be
delayed until the application signals the user event. In order to fix
that it should really only be necessary to wait for the event action to
be executed, not necessarily its associated GPU work.
I think that another use is that non-blocking writes do not create an
extra copy of the buffer. Thus
clEnqueueWriteBuffer(...,cl_false, ev, ...)
clWaitForEvents(ev)
is more memory efficient.
Post by Francisco Jerez
Last time this issue came up (yeah it's not the first time) I proposed
the patches below to add a mechanism to wait for the event action only,
feel free to include it as PATCH 0.1 and 0.2 of this series (it's been a
while so they may no longer apply cleanly).
I think we can just add comments explaining why the blocking argument
is ignored, until someone chooses to fix this problem
I think the problem is definitely worth fixing, and it shouldn't really
take more effort than adding comments explaining the current behaviour
;), basically just add a bunch of 'if (blocking)
hev().wait_signalled();' where the spec requires it, roughly as you had
been doing in this patch, but wait_signalled() should only stall on the
CPU work associated with the event, which should give you the same
performance as the current approach.
I can send a patch that replaces wait() -> wait_signalled()
Thanks :)
Post by Jan Vesely
Post by Francisco Jerez
Post by Francisco Jerez
Post by Jan Vesely
and/or to
implement proper non-blocking variants (would std::async work for
trivial cases like ReadBuffer?)
Hm, and to answer this question -- Yeah, std::async would probably work,
but I'm not certain whether it would actually perform better than the
current approach, because on the one hand the actual DMA-ing of the
buffer is likely to happen quasi-asynchronously already assuming the
driver is competent, and OTOH because spawning a new thread for the copy
would introduce additional overhead that might defeat your purpose
unless the copy is very large -- Only experimentation will tell whether
it pays off.
it was just a speculation. it looks like Vedran is interested in
implementing non-blocking reads/writes[0] so I'll leave it to him. r600
has bigger problems elsewhere atm.
Yeah, I'm aware of his work, I suspect the above are the reasons why he
got rather mixed performance results from his changes.
Post by Jan Vesely
thanks,
Jan
[0]https://bugs.freedesktop.org/show_bug.cgi?id=100199
Post by Francisco Jerez
Post by Francisco Jerez
Post by Jan Vesely
thanks,
Jan
Post by Francisco Jerez
Thank you.
Post by Jan Vesely
v2: wait in map_buffer and map_image as well
---
Hi Aaron,
yes, I think you're right, we should wait in Map* as well.
If nothing else it's consistent, even if passing the flag to
add_map might make it unnecessary (haven't actually checked).
thanks,
Jan
src/gallium/state_trackers/clover/api/transfer.cpp | 30 ++++++++++++++++++++--
1 file changed, 28 insertions(+), 2 deletions(-)
diff --git a/src/gallium/state_trackers/clover/api/transfer.cpp b/src/gallium/state_trackers/clover/api/transfer.cpp
index f7046253be..729a34590e 100644
--- a/src/gallium/state_trackers/clover/api/transfer.cpp
+++ b/src/gallium/state_trackers/clover/api/transfer.cpp
@@ -295,6 +295,9 @@ clEnqueueReadBuffer(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
&mem, obj_origin, obj_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -325,6 +328,9 @@ clEnqueueWriteBuffer(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
ptr, {}, obj_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -362,6 +368,9 @@ clEnqueueReadBufferRect(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
&mem, obj_origin, obj_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -399,6 +408,9 @@ clEnqueueWriteBufferRect(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
ptr, host_origin, host_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -504,6 +516,9 @@ clEnqueueReadImage(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
&img, src_origin, src_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -538,6 +553,9 @@ clEnqueueWriteImage(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
ptr, {}, src_pitch,
region));
+ if (blocking)
+ hev().wait();
+
ret_object(rd_ev, hev);
return CL_SUCCESS;
@@ -667,7 +685,11 @@ clEnqueueMapBuffer(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
void *map = mem.resource(q).add_map(q, flags, blocking, obj_origin, region);
- ret_object(rd_ev, create<hard_event>(q, CL_COMMAND_MAP_BUFFER, deps));
+ auto hev = create<hard_event>(q, CL_COMMAND_MAP_BUFFER, deps);
+ if (blocking)
+ hev().wait();
+
+ ret_object(rd_ev, hev);
ret_error(r_errcode, CL_SUCCESS);
return map;
@@ -695,7 +717,11 @@ clEnqueueMapImage(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking,
void *map = img.resource(q).add_map(q, flags, blocking, origin, region);
- ret_object(rd_ev, create<hard_event>(q, CL_COMMAND_MAP_IMAGE, deps));
+ auto hev = create<hard_event>(q, CL_COMMAND_MAP_IMAGE, deps);
+ if (blocking)
+ hev().wait();
+
+ ret_object(rd_ev, hev);
ret_error(r_errcode, CL_SUCCESS);
return map;
--
2.13.3
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
--
--
_______________________________________________
mesa-dev mailing list
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
--
Jan Vesely <***@rutgers.edu>
Francisco Jerez
2017-08-15 19:00:48 UTC
Reply
Permalink
Raw Message
Post by Jan Vesely
Post by Francisco Jerez
Post by Jan Vesely
Post by Francisco Jerez
Post by Francisco Jerez
Post by Jan Vesely
Hi,
thanks for detailed explanation. I indeed missed the writeBuffer part
in specs.
Post by Francisco Jerez
These changes are somewhat redundant and potentially
performance-impacting, the reason is that in the OpenCL API,
clEnqueueWrite* commands are specified to block until the memory
provided by the application as origin can be reused safely (i.e. until
soft_copy_op()() runs), not necessarily until the transfer to graphics
memory has completed (which is what hard_event::wait() will wait for).
OTOH reads and maps as implemented by soft_copy_op and friends are
essentially blocking so the wait() call is redundant in most cases.
That explains a noticeable slowdown running piglit with these changes.
I'm not sure about the read part though. I expected it to be basically
a noop, but it changes behaviour.
I think this change would have slowed you down the most whenever the
mapping operation performed by soft_copy_op() is able to proceed
immediately, either because the buffer is idle (so the driver doesn't
stall on transfer_map()) *or* because the driver is trying to be smart
and creates a bounce buffer where data can be copied into immediately
without stalling, then inserts a pipelined GPU copy from the bounce
buffer into the real buffer. With this patch you will stall on the GPU
copy regardless (and whatever other work was already on the command
stream which might be substantial), even though it wouldn't have been
necessary in any of these cases.
Post by Jan Vesely
Adding clGetEventInfo(CL_EVENT_COMMAND_EXECUTION_STATUS) after a
blocking read in one of the piglit tests surprisingly returns
CL_QUEUED.
Hmm, yeah, that seems kind of debatable behaviour, although it's
definitely legit for writes, not quite sure for reads... I believe the
reason why that happens is because the CPU copy proceeds very quickly
(due to the reasons mentioned in the last paragraph), but the hard_event
is still associated with a pipe_fence synchronous with the GPU's command
stream, so it won't get signalled until the GPU catches up.
Hi,
sorry for the delay, last week was submission week...
No worries.
Post by Jan Vesely
The part that I'm still missing is what kind of GPU work needs to be
done after clEnqueueRead*(). I assume all necessary work is completed
before I can access the data.
Also CL_QUEUED status was surprising. since we performed at least some
of the work (we got the data), I'd expect CL_RUNNING or CL_SUBMITTED at
least.
The lag is not due to GPU work that needs to be performed after the
clEnqueueRead call, but due to GPU work that may precede it in the
command stream: Because clover doesn't know when the last time was that
the buffer was referenced by GPU work, the event is associated with a
fence synchronous with the current batch (which obviously won't signal
before any of the GPU work that actually referenced the buffer
completes). However the pipe driver has a more accurate idea of when
the buffer was used last, so the transfer_map() operation is likely to
complete more quickly than the CL event status changes to CL_COMPLETE.
The reason you're seeing CL_QUEUED rather than CL_SUBMITTED is because
the read operation didn't even need to flush the current batch, so
there's no fence associated with the CL event object yet.
thanks. so the event is waiting for the current batch, even if the
buffer access is done out of order. The question is, why do we use the
fence at all? If I understood correctly mapping the buffer will be
delayed by the pipe driver if needed, so we don't really need the
fence. Am I missing something?
The current CL event object implementation needs a gallium fence object
in order to find out what status it's in and in order to implement
waiting. The problem is that it doesn't have any fence object available
that is "close enough" to the actual GPU operation that modified the
buffer for the last time. There are several more or less hairy ways to
improve this, e.g. using a soft_event that's manually signaled when
triggered instead of a hard_event (though I'm not sure whether this
would break other CL APIs that may rely on hard_event functionality),
using a different event subclass that doesn't use fences and is instead
considered immediately signaled as soon as it's triggered, using the
current hard_event implementation with a dummy known-signaled pipe
fence, or adding per-buffer fence tracking so each memory object is able
to remember which fence was the last to reference the buffer in the
command stream. Most of these would involve substantial work relative
to the slight benefit achieved (which is dependent on your definition of
the fence being "close enough" to the relevant GPU work).
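As a rough illustration of the last option only (hypothetical types and
names, not the actual clover or gallium interfaces), per-buffer tracking
could be as simple as remembering which batch last touched each buffer:

    #include <cstdint>
    #include <unordered_map>

    // Hypothetical bookkeeping: map each buffer to an identifier of the
    // last batch that referenced it.  A transfer event on that buffer would
    // then only need a fence for that particular batch instead of whatever
    // batch happens to be current when the transfer is enqueued.
    struct fence_tracker {
       std::unordered_map<const void *, std::uint64_t> last_use;

       void note_use(const void *buffer, std::uint64_t batch_id) {
          last_use[buffer] = batch_id;
       }

       std::uint64_t last_batch(const void *buffer) const {
          auto it = last_use.find(buffer);
          return it == last_use.end() ? 0 : it->second;
       }
    };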
--
Jan Vesely
2017-08-17 19:09:50 UTC
Post by Francisco Jerez
The current CL event object implementation needs a gallium fence object
in order to find out what status it's in and in order to implement
waiting. The problem is that it doesn't have any fence object available
that is close "enough" to the actual GPU operation that modified the
buffer for the last time. There are several more or less hairy ways to
improve this, e.g. using a soft_event that's manually signaled when
triggered instead of a hard_event (though I'm not sure whether this
would break other CL APIs that may rely on hard_event functionality),
using a different event subclass that doesn't use fences and is instead
considered immediately signaled as soon as it's triggered,
Using soft_event was what I had in mind (new event subclass also looks
OK). The rest looks rather hacky.
Can you be more specific about the potential API issues? We return the
event to the user, so the next interaction with clover will be when the
user uses that event as an input. I don't think we expect hard_events
as user input (I haven't really checked).

I'll try to find some time to experiment with this. Do you want me to
send the wait_signalled patch before that?

Jan
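
For reference, a sketch of what that replacement would look like in each
blocking entry point of transfer.cpp; wait_signalled() here is the helper
from the earlier patches referenced in this thread and is not in the tree
yet:

   -   if (blocking)
   -      hev().wait();
   +   if (blocking)
   +      hev().wait_signalled();

i.e. the blocking paths would stall only on the CPU-side event action
(the soft_copy_op() call) rather than on the pipe fence associated with
the GPU batch.
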
Francisco Jerez
2017-08-18 00:09:23 UTC
Post by Jan Vesely
Using soft_event was what I had in mind (new event subclass also looks
OK). The rest looks rather hacky.
Can you be more specific about the potential API issues? we return the
event to user, so the next interaction with clover will be when the
user uses that event as an input. I don't think we expect hard_events
as user input (I haven't really checked).
Some interfaces might be affected, because they rely on
hard_event-specific functionality. clGetEventProfilingInfo() comes to
mind, though it might be legal for you to return an error to the user
because soft_events lack profiling information (the current
implementation won't work because it relies on batch-synchronous
timestamp queries, but you don't want to be batch-synchronous). The
CL_EVENT_COMMAND_TYPE query may misbehave with soft_events too.
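
For illustration, this is the kind of application code that leans on that
hard_event functionality; a sketch only, assuming 'ev' comes from an
enqueue on a queue created with CL_QUEUE_PROFILING_ENABLE, and the
function name is made up:

   #include <CL/cl.h>
   #include <cstdio>

   /* Illustrative only: query profiling timestamps from a transfer event.
    * An event implementation that never acquires a fence/timestamps would
    * have to fail here, e.g. with CL_PROFILING_INFO_NOT_AVAILABLE. */
   static void print_transfer_time(cl_event ev)
   {
      cl_ulong start = 0, end = 0;
      clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START,
                              sizeof(start), &start, nullptr);
      clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END,
                              sizeof(end), &end, nullptr);
      std::printf("transfer took %llu ns\n",
                  (unsigned long long)(end - start));
   }
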

Also, I have a feeling you may break some assumptions of in-order command
execution (which is always enabled in clover). In particular, in-order
command queues guarantee that if command A is issued by the application
before command B in the same queue, and command B is reported to be
complete, command A can be assumed by the application to have finished
execution. It sounds like if command B is a read and you report it as
immediately complete, you're likely to end up breaking this assumption,
so you'd have to ev.wait() in order to make sure previous work has
completed in order, which you could do today anyway without taking the
effort of switching over to soft_events. It would be equally suboptimal
either way.
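
A sketch of the in-order assumption being described, as an application
might rely on it; buffer and function names are made up, and the queue is
assumed to be in-order as clover's always are:

   #include <CL/cl.h>
   #include <vector>

   /* Illustrative only: command A (non-blocking write) is enqueued before
    * command B (non-blocking read). Once B's event reports completion, the
    * application assumes A has finished too and reuses the host memory it
    * passed to A. If B were reported complete "immediately", that
    * assumption would break. */
   static void reuse_after_read(cl_command_queue q, cl_mem buf_a,
                                cl_mem buf_b, size_t n)
   {
      std::vector<char> src(n), dst(n);

      /* Command A: non-blocking write from 'src'. */
      clEnqueueWriteBuffer(q, buf_a, CL_FALSE, 0, n, src.data(),
                           0, nullptr, nullptr);

      /* Command B: non-blocking read into 'dst', with an event. */
      cl_event ev_b = nullptr;
      clEnqueueReadBuffer(q, buf_b, CL_FALSE, 0, n, dst.data(),
                          0, nullptr, &ev_b);

      /* On an in-order queue, waiting on B is enough to know A is done,
       * so 'src' may be reused now. */
      clWaitForEvents(1, &ev_b);
      src.assign(n, 0);

      clReleaseEvent(ev_b);
   }
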
Post by Jan Vesely
I'll try to find some time to experiment with this. do you want me to
send the wait_signalled patch before that?
I guess it wouldn't hurt. :)