[nvptx] Add some support for .local atomics
The ptx insn atom doesn't support local memory. In case of doing an atomic operation on local memory, we run into: ... operation not supported on global/shared address space ... This is the cuGetErrorString message for CUDA_ERROR_INVALID_ADDRESS_SPACE. The message is somewhat confusing given that actually the operation is not supported on local address space. Fix this by falling back on a non-atomic version when detecting a frame-related memory operand. This only solves some cases that are detected at compile-time. It does however fix the openacc private-atomic-* test-cases. Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-01-27 Tom de Vries <tdevries@suse.de> * config/nvptx/nvptx.md (define_insn "atomic_compare_and_swap<mode>_1") (define_insn "atomic_exchange<mode>") (define_insn "atomic_fetch_add<mode>") (define_insn "atomic_fetch_addsf") (define_insn "atomic_fetch_<logic><mode>"): Output non-atomic version if memory operands is frame-relative. gcc/testsuite/ChangeLog: 2022-01-31 Tom de Vries <tdevries@suse.de> * gcc.target/nvptx/stack-atomics-run.c: New test. libgomp/ChangeLog: 2022-01-27 Tom de Vries <tdevries@suse.de> * testsuite/libgomp.oacc-c-c++-common/private-atomic-1.c: Remove PR83812 workaround. * testsuite/libgomp.oacc-fortran/private-atomic-1-vector.f90: Same. * testsuite/libgomp.oacc-fortran/private-atomic-1-worker.f90: Same.
This commit is contained in:
parent
ca902055d0
commit
e0451f93d9
@ -1790,11 +1790,28 @@
|
||||
(unspec_volatile:SDIM [(const_int 0)] UNSPECV_CAS))]
|
||||
""
|
||||
{
|
||||
struct address_info info;
|
||||
decompose_mem_address (&info, operands[1]);
|
||||
if (info.base != NULL && REG_P (*info.base)
|
||||
&& REGNO_PTR_FRAME_P (REGNO (*info.base)))
|
||||
{
|
||||
output_asm_insn ("{", NULL);
|
||||
output_asm_insn ("\\t" ".reg.pred" "\\t" "%%eq_p;", NULL);
|
||||
output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands);
|
||||
output_asm_insn ("\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands);
|
||||
output_asm_insn ("\\t" "setp.eq%t0" "\\t" "%%eq_p, %%val, %2;",
|
||||
operands);
|
||||
output_asm_insn ("@%%eq_p\\t" "st%A1%t0" "\\t" "%1,%3;", operands);
|
||||
output_asm_insn ("\\t" "mov%t0" "\\t" "%0,%%val;", operands);
|
||||
output_asm_insn ("}", NULL);
|
||||
return "";
|
||||
}
|
||||
const char *t
|
||||
= "%.\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;";
|
||||
= "\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;";
|
||||
return nvptx_output_atomic_insn (t, operands, 1, 4);
|
||||
}
|
||||
[(set_attr "atomic" "true")])
|
||||
[(set_attr "atomic" "true")
|
||||
(set_attr "predicable" "false")])
|
||||
|
||||
(define_insn "atomic_exchange<mode>"
|
||||
[(set (match_operand:SDIM 0 "nvptx_register_operand" "=R") ;; output
|
||||
@ -1806,6 +1823,19 @@
|
||||
(match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri"))] ;; input
|
||||
""
|
||||
{
|
||||
struct address_info info;
|
||||
decompose_mem_address (&info, operands[1]);
|
||||
if (info.base != NULL && REG_P (*info.base)
|
||||
&& REGNO_PTR_FRAME_P (REGNO (*info.base)))
|
||||
{
|
||||
output_asm_insn ("{", NULL);
|
||||
output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands);
|
||||
output_asm_insn ("%.\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands);
|
||||
output_asm_insn ("%.\\t" "st%A1%t0" "\\t" "%1,%2;", operands);
|
||||
output_asm_insn ("%.\\t" "mov%t0" "\\t" "%0,%%val;", operands);
|
||||
output_asm_insn ("}", NULL);
|
||||
return "";
|
||||
}
|
||||
const char *t
|
||||
= "%.\tatom%A1.exch.b%T0\t%0, %1, %2;";
|
||||
return nvptx_output_atomic_insn (t, operands, 1, 3);
|
||||
@ -1823,6 +1853,22 @@
|
||||
(match_dup 1))]
|
||||
""
|
||||
{
|
||||
struct address_info info;
|
||||
decompose_mem_address (&info, operands[1]);
|
||||
if (info.base != NULL && REG_P (*info.base)
|
||||
&& REGNO_PTR_FRAME_P (REGNO (*info.base)))
|
||||
{
|
||||
output_asm_insn ("{", NULL);
|
||||
output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands);
|
||||
output_asm_insn ("\\t" ".reg%t0" "\\t" "%%update;", operands);
|
||||
output_asm_insn ("%.\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands);
|
||||
output_asm_insn ("%.\\t" "add%t0" "\\t" "%%update,%%val,%2;",
|
||||
operands);
|
||||
output_asm_insn ("%.\\t" "st%A1%t0" "\\t" "%1,%%update;", operands);
|
||||
output_asm_insn ("%.\\t" "mov%t0" "\\t" "%0,%%val;", operands);
|
||||
output_asm_insn ("}", NULL);
|
||||
return "";
|
||||
}
|
||||
const char *t
|
||||
= "%.\\tatom%A1.add%t0\\t%0, %1, %2;";
|
||||
return nvptx_output_atomic_insn (t, operands, 1, 3);
|
||||
@ -1840,6 +1886,22 @@
|
||||
(match_dup 1))]
|
||||
""
|
||||
{
|
||||
struct address_info info;
|
||||
decompose_mem_address (&info, operands[1]);
|
||||
if (info.base != NULL && REG_P (*info.base)
|
||||
&& REGNO_PTR_FRAME_P (REGNO (*info.base)))
|
||||
{
|
||||
output_asm_insn ("{", NULL);
|
||||
output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands);
|
||||
output_asm_insn ("\\t" ".reg%t0" "\\t" "%%update;", operands);
|
||||
output_asm_insn ("%.\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands);
|
||||
output_asm_insn ("%.\\t" "add%t0" "\\t" "%%update,%%val,%2;",
|
||||
operands);
|
||||
output_asm_insn ("%.\\t" "st%A1%t0" "\\t" "%1,%%update;", operands);
|
||||
output_asm_insn ("%.\\t" "mov%t0" "\\t" "%0,%%val;", operands);
|
||||
output_asm_insn ("}", NULL);
|
||||
return "";
|
||||
}
|
||||
const char *t
|
||||
= "%.\\tatom%A1.add%t0\\t%0, %1, %2;";
|
||||
return nvptx_output_atomic_insn (t, operands, 1, 3);
|
||||
@ -1860,6 +1922,22 @@
|
||||
(match_dup 1))]
|
||||
"<MODE>mode == SImode || TARGET_SM35"
|
||||
{
|
||||
struct address_info info;
|
||||
decompose_mem_address (&info, operands[1]);
|
||||
if (info.base != NULL && REG_P (*info.base)
|
||||
&& REGNO_PTR_FRAME_P (REGNO (*info.base)))
|
||||
{
|
||||
output_asm_insn ("{", NULL);
|
||||
output_asm_insn ("\\t" ".reg.b%T0" "\\t" "%%val;", operands);
|
||||
output_asm_insn ("\\t" ".reg.b%T0" "\\t" "%%update;", operands);
|
||||
output_asm_insn ("%.\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands);
|
||||
output_asm_insn ("%.\\t" "<logic>.b%T0" "\\t" "%%update,%%val,%2;",
|
||||
operands);
|
||||
output_asm_insn ("%.\\t" "st%A1%t0" "\\t" "%1,%%update;", operands);
|
||||
output_asm_insn ("%.\\t" "mov%t0" "\\t" "%0,%%val;", operands);
|
||||
output_asm_insn ("}", NULL);
|
||||
return "";
|
||||
}
|
||||
const char *t
|
||||
= "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;";
|
||||
return nvptx_output_atomic_insn (t, operands, 1, 3);
|
||||
|
44
gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c
Normal file
44
gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c
Normal file
@ -0,0 +1,44 @@
|
||||
/* { dg-do run } */
|
||||
|
||||
enum memmodel {
|
||||
MEMMODEL_RELAXED = 0
|
||||
};
|
||||
|
||||
int
|
||||
main (void)
|
||||
{
|
||||
int a, b;
|
||||
|
||||
a = 1;
|
||||
__atomic_fetch_add (&a, 1, MEMMODEL_RELAXED);
|
||||
if (a != 2)
|
||||
__builtin_abort ();
|
||||
|
||||
a = 0;
|
||||
__atomic_fetch_or (&a, 1, MEMMODEL_RELAXED);
|
||||
if (a != 1)
|
||||
__builtin_abort ();
|
||||
|
||||
a = 1;
|
||||
b = -1;
|
||||
b = __atomic_exchange_n (&a, 0, MEMMODEL_RELAXED);
|
||||
if (a != 0)
|
||||
__builtin_abort ();
|
||||
if (b != 1)
|
||||
__builtin_abort ();
|
||||
|
||||
a = 1;
|
||||
b = -1;
|
||||
{
|
||||
int expected = a;
|
||||
b = __atomic_compare_exchange_n (&a, &expected, 0, 0, MEMMODEL_RELAXED,
|
||||
MEMMODEL_RELAXED);
|
||||
}
|
||||
if (a != 0)
|
||||
__builtin_abort ();
|
||||
if (b != 1)
|
||||
__builtin_abort ();
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
@ -32,13 +32,6 @@ int main (void)
|
||||
{
|
||||
#pragma acc atomic update
|
||||
++v;
|
||||
/* nvptx offloading: PR83812 "operation not supported on global/shared address space".
|
||||
{ dg-output "(\n|\r\n|\r)libgomp: cuStreamSynchronize error: operation not supported on global/shared address space(\n|\r\n|\r)$" { target openacc_nvidia_accel_selected } }
|
||||
Scan for what we expect in the "XFAILed" case (without actually XFAILing).
|
||||
{ dg-shouldfail "XFAILed" { openacc_nvidia_accel_selected } }
|
||||
... instead of 'dg-xfail-run-if' so that 'dg-output' is evaluated at all.
|
||||
{ dg-final { if { [dg-process-target { xfail openacc_nvidia_accel_selected }] == "F" } { xfail "[testname-for-summary] really is XFAILed" } } }
|
||||
... so that we still get an XFAIL visible in the log. */
|
||||
}
|
||||
|
||||
res += (v == -222 + 121);
|
||||
|
@ -25,13 +25,6 @@ program main
|
||||
do i = 0, 31
|
||||
!$acc atomic update
|
||||
w = w + 1
|
||||
! nvptx offloading: PR83812 "operation not supported on global/shared address space".
|
||||
! { dg-output "(\n|\r\n|\r)libgomp: cuStreamSynchronize error: operation not supported on global/shared address space(\n|\r\n|\r)$" { target openacc_nvidia_accel_selected } }
|
||||
! Scan for what we expect in the "XFAILed" case (without actually XFAILing).
|
||||
! { dg-shouldfail "XFAILed" { openacc_nvidia_accel_selected } }
|
||||
! ... instead of 'dg-xfail-run-if' so that 'dg-output' is evaluated at all.
|
||||
! { dg-final { if { [dg-process-target { xfail openacc_nvidia_accel_selected }] == "F" } { xfail "[testname-for-summary] really is XFAILed" } } }
|
||||
! ... so that we still get an XFAIL visible in the log.
|
||||
!$acc end atomic
|
||||
end do
|
||||
arr(j) = w
|
||||
|
@ -25,13 +25,6 @@ program main
|
||||
do i = 0, 31
|
||||
!$acc atomic update
|
||||
w = w + 1
|
||||
! nvptx offloading: PR83812 "operation not supported on global/shared address space".
|
||||
! { dg-output "(\n|\r\n|\r)libgomp: cuStreamSynchronize error: operation not supported on global/shared address space(\n|\r\n|\r)$" { target openacc_nvidia_accel_selected } }
|
||||
! Scan for what we expect in the "XFAILed" case (without actually XFAILing).
|
||||
! { dg-shouldfail "XFAILed" { openacc_nvidia_accel_selected } }
|
||||
! ... instead of 'dg-xfail-run-if' so that 'dg-output' is evaluated at all.
|
||||
! { dg-final { if { [dg-process-target { xfail openacc_nvidia_accel_selected }] == "F" } { xfail "[testname-for-summary] really is XFAILed" } } }
|
||||
! ... so that we still get an XFAIL visible in the log.
|
||||
!$acc end atomic
|
||||
end do
|
||||
arr(j) = w
|
||||
|
Loading…
Reference in New Issue
Block a user