[nvptx] Add some support for .local atomics

The ptx insn atom doesn't support local memory.  In case of doing an atomic
operation on local memory, we run into:
...
operation not supported on global/shared address space
...
This is the cuGetErrorString message for CUDA_ERROR_INVALID_ADDRESS_SPACE.

The message is somewhat confusing given that actually the operation is not
supported on local address space.

Fix this by falling back on a non-atomic version when detecting
a frame-related memory operand.

This only solves some cases that are detected at compile-time.  It does
however fix the openacc private-atomic-* test-cases.

Tested on x86_64 with nvptx accelerator.

gcc/ChangeLog:

2022-01-27  Tom de Vries  <tdevries@suse.de>

	* config/nvptx/nvptx.md (define_insn "atomic_compare_and_swap<mode>_1")
	(define_insn "atomic_exchange<mode>")
	(define_insn "atomic_fetch_add<mode>")
	(define_insn "atomic_fetch_addsf")
	(define_insn "atomic_fetch_<logic><mode>"): Output non-atomic version
	if memory operands is frame-relative.

gcc/testsuite/ChangeLog:

2022-01-31  Tom de Vries  <tdevries@suse.de>

	* gcc.target/nvptx/stack-atomics-run.c: New test.

libgomp/ChangeLog:

2022-01-27  Tom de Vries  <tdevries@suse.de>

	* testsuite/libgomp.oacc-c-c++-common/private-atomic-1.c: Remove
	PR83812 workaround.
	* testsuite/libgomp.oacc-fortran/private-atomic-1-vector.f90: Same.
	* testsuite/libgomp.oacc-fortran/private-atomic-1-worker.f90: Same.
This commit is contained in:
Tom de Vries 2022-01-21 21:46:05 +01:00
parent ca902055d0
commit e0451f93d9
5 changed files with 124 additions and 23 deletions

View File

@ -1790,11 +1790,28 @@
(unspec_volatile:SDIM [(const_int 0)] UNSPECV_CAS))]
""
{
struct address_info info;
decompose_mem_address (&info, operands[1]);
if (info.base != NULL && REG_P (*info.base)
&& REGNO_PTR_FRAME_P (REGNO (*info.base)))
{
output_asm_insn ("{", NULL);
output_asm_insn ("\\t" ".reg.pred" "\\t" "%%eq_p;", NULL);
output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands);
output_asm_insn ("\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands);
output_asm_insn ("\\t" "setp.eq%t0" "\\t" "%%eq_p, %%val, %2;",
operands);
output_asm_insn ("@%%eq_p\\t" "st%A1%t0" "\\t" "%1,%3;", operands);
output_asm_insn ("\\t" "mov%t0" "\\t" "%0,%%val;", operands);
output_asm_insn ("}", NULL);
return "";
}
const char *t
= "%.\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;";
= "\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;";
return nvptx_output_atomic_insn (t, operands, 1, 4);
}
[(set_attr "atomic" "true")])
[(set_attr "atomic" "true")
(set_attr "predicable" "false")])
(define_insn "atomic_exchange<mode>"
[(set (match_operand:SDIM 0 "nvptx_register_operand" "=R") ;; output
@ -1806,6 +1823,19 @@
(match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri"))] ;; input
""
{
struct address_info info;
decompose_mem_address (&info, operands[1]);
if (info.base != NULL && REG_P (*info.base)
&& REGNO_PTR_FRAME_P (REGNO (*info.base)))
{
output_asm_insn ("{", NULL);
output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands);
output_asm_insn ("%.\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands);
output_asm_insn ("%.\\t" "st%A1%t0" "\\t" "%1,%2;", operands);
output_asm_insn ("%.\\t" "mov%t0" "\\t" "%0,%%val;", operands);
output_asm_insn ("}", NULL);
return "";
}
const char *t
= "%.\tatom%A1.exch.b%T0\t%0, %1, %2;";
return nvptx_output_atomic_insn (t, operands, 1, 3);
@ -1823,6 +1853,22 @@
(match_dup 1))]
""
{
struct address_info info;
decompose_mem_address (&info, operands[1]);
if (info.base != NULL && REG_P (*info.base)
&& REGNO_PTR_FRAME_P (REGNO (*info.base)))
{
output_asm_insn ("{", NULL);
output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands);
output_asm_insn ("\\t" ".reg%t0" "\\t" "%%update;", operands);
output_asm_insn ("%.\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands);
output_asm_insn ("%.\\t" "add%t0" "\\t" "%%update,%%val,%2;",
operands);
output_asm_insn ("%.\\t" "st%A1%t0" "\\t" "%1,%%update;", operands);
output_asm_insn ("%.\\t" "mov%t0" "\\t" "%0,%%val;", operands);
output_asm_insn ("}", NULL);
return "";
}
const char *t
= "%.\\tatom%A1.add%t0\\t%0, %1, %2;";
return nvptx_output_atomic_insn (t, operands, 1, 3);
@ -1840,6 +1886,22 @@
(match_dup 1))]
""
{
struct address_info info;
decompose_mem_address (&info, operands[1]);
if (info.base != NULL && REG_P (*info.base)
&& REGNO_PTR_FRAME_P (REGNO (*info.base)))
{
output_asm_insn ("{", NULL);
output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands);
output_asm_insn ("\\t" ".reg%t0" "\\t" "%%update;", operands);
output_asm_insn ("%.\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands);
output_asm_insn ("%.\\t" "add%t0" "\\t" "%%update,%%val,%2;",
operands);
output_asm_insn ("%.\\t" "st%A1%t0" "\\t" "%1,%%update;", operands);
output_asm_insn ("%.\\t" "mov%t0" "\\t" "%0,%%val;", operands);
output_asm_insn ("}", NULL);
return "";
}
const char *t
= "%.\\tatom%A1.add%t0\\t%0, %1, %2;";
return nvptx_output_atomic_insn (t, operands, 1, 3);
@ -1860,6 +1922,22 @@
(match_dup 1))]
"<MODE>mode == SImode || TARGET_SM35"
{
struct address_info info;
decompose_mem_address (&info, operands[1]);
if (info.base != NULL && REG_P (*info.base)
&& REGNO_PTR_FRAME_P (REGNO (*info.base)))
{
output_asm_insn ("{", NULL);
output_asm_insn ("\\t" ".reg.b%T0" "\\t" "%%val;", operands);
output_asm_insn ("\\t" ".reg.b%T0" "\\t" "%%update;", operands);
output_asm_insn ("%.\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands);
output_asm_insn ("%.\\t" "<logic>.b%T0" "\\t" "%%update,%%val,%2;",
operands);
output_asm_insn ("%.\\t" "st%A1%t0" "\\t" "%1,%%update;", operands);
output_asm_insn ("%.\\t" "mov%t0" "\\t" "%0,%%val;", operands);
output_asm_insn ("}", NULL);
return "";
}
const char *t
= "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;";
return nvptx_output_atomic_insn (t, operands, 1, 3);

View File

@ -0,0 +1,44 @@
/* { dg-do run } */
enum memmodel {
MEMMODEL_RELAXED = 0
};
int
main (void)
{
int a, b;
a = 1;
__atomic_fetch_add (&a, 1, MEMMODEL_RELAXED);
if (a != 2)
__builtin_abort ();
a = 0;
__atomic_fetch_or (&a, 1, MEMMODEL_RELAXED);
if (a != 1)
__builtin_abort ();
a = 1;
b = -1;
b = __atomic_exchange_n (&a, 0, MEMMODEL_RELAXED);
if (a != 0)
__builtin_abort ();
if (b != 1)
__builtin_abort ();
a = 1;
b = -1;
{
int expected = a;
b = __atomic_compare_exchange_n (&a, &expected, 0, 0, MEMMODEL_RELAXED,
MEMMODEL_RELAXED);
}
if (a != 0)
__builtin_abort ();
if (b != 1)
__builtin_abort ();
return 0;
}

View File

@ -32,13 +32,6 @@ int main (void)
{
#pragma acc atomic update
++v;
/* nvptx offloading: PR83812 "operation not supported on global/shared address space".
{ dg-output "(\n|\r\n|\r)libgomp: cuStreamSynchronize error: operation not supported on global/shared address space(\n|\r\n|\r)$" { target openacc_nvidia_accel_selected } }
Scan for what we expect in the "XFAILed" case (without actually XFAILing).
{ dg-shouldfail "XFAILed" { openacc_nvidia_accel_selected } }
... instead of 'dg-xfail-run-if' so that 'dg-output' is evaluated at all.
{ dg-final { if { [dg-process-target { xfail openacc_nvidia_accel_selected }] == "F" } { xfail "[testname-for-summary] really is XFAILed" } } }
... so that we still get an XFAIL visible in the log. */
}
res += (v == -222 + 121);

View File

@ -25,13 +25,6 @@ program main
do i = 0, 31
!$acc atomic update
w = w + 1
! nvptx offloading: PR83812 "operation not supported on global/shared address space".
! { dg-output "(\n|\r\n|\r)libgomp: cuStreamSynchronize error: operation not supported on global/shared address space(\n|\r\n|\r)$" { target openacc_nvidia_accel_selected } }
! Scan for what we expect in the "XFAILed" case (without actually XFAILing).
! { dg-shouldfail "XFAILed" { openacc_nvidia_accel_selected } }
! ... instead of 'dg-xfail-run-if' so that 'dg-output' is evaluated at all.
! { dg-final { if { [dg-process-target { xfail openacc_nvidia_accel_selected }] == "F" } { xfail "[testname-for-summary] really is XFAILed" } } }
! ... so that we still get an XFAIL visible in the log.
!$acc end atomic
end do
arr(j) = w

View File

@ -25,13 +25,6 @@ program main
do i = 0, 31
!$acc atomic update
w = w + 1
! nvptx offloading: PR83812 "operation not supported on global/shared address space".
! { dg-output "(\n|\r\n|\r)libgomp: cuStreamSynchronize error: operation not supported on global/shared address space(\n|\r\n|\r)$" { target openacc_nvidia_accel_selected } }
! Scan for what we expect in the "XFAILed" case (without actually XFAILing).
! { dg-shouldfail "XFAILed" { openacc_nvidia_accel_selected } }
! ... instead of 'dg-xfail-run-if' so that 'dg-output' is evaluated at all.
! { dg-final { if { [dg-process-target { xfail openacc_nvidia_accel_selected }] == "F" } { xfail "[testname-for-summary] really is XFAILed" } } }
! ... so that we still get an XFAIL visible in the log.
!$acc end atomic
end do
arr(j) = w