[nvptx] Fix reduction lock

When I run the libgomp test-case reduction-cplx-dbl.c on an nvptx accelerator
(T400, driver version 470.86), I run into:
...
FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/reduction-cplx-dbl.c \
  -DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none  -O0  \
  execution test
FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/reduction-cplx-dbl.c \
  -DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none  -O2  \
  execution test
...

The problem is in this code generated for a gang reduction:
...
$L39:
		atom.global.cas.b32     %r59, [__reduction_lock], 0, 1;
		setp.ne.u32     %r116, %r59, 0;
	@%r116  bra     $L39;
		ld.f64  %r60, [%r44];
		ld.f64  %r61, [%r44+8];
		ld.f64  %r64, [%r44];
		ld.f64  %r65, [%r44+8];
		add.f64 %r117, %r64, %r22;
		add.f64 %r118, %r65, %r41;
		st.f64  [%r44], %r117;
		st.f64  [%r44+8], %r118;
		atom.global.cas.b32     %r119, [__reduction_lock], 1, 0;
...
which takes and releases a lock, but is missing the appropriate memory
barriers to protect the loads and stores inside the locked region.

Fix this by adding membar.gl barriers.

Likewise, add membar.cta barriers if we protect shared memory loads and
stores (even though the worker-partitioning part of the test-case is not
failing).

Tested on x86_64 with nvptx accelerator.

gcc/ChangeLog:

2022-01-27  Tom de Vries  <tdevries@suse.de>

	* config/nvptx/nvptx.cc (enum nvptx_builtins): Add
	NVPTX_BUILTIN_MEMBAR_GL and NVPTX_BUILTIN_MEMBAR_CTA.
	(VOID): New macro.
	(nvptx_init_builtins): Add MEMBAR_GL and MEMBAR_CTA.
	(nvptx_expand_builtin): Handle NVPTX_BUILTIN_MEMBAR_GL and
	NVPTX_BUILTIN_MEMBAR_CTA.
	(nvptx_lockfull_update): Add level parameter.  Emit barriers.
	(nvptx_reduction_update): Add level parameter.  Update call to
	nvptx_lockfull_update.
	(nvptx_goacc_reduction_fini): Update call to nvptx_reduction_update.
	* config/nvptx/nvptx.md (define_c_enum "unspecv"): Add
	UNSPECV_MEMBAR_GL.
	(define_expand "nvptx_membar_gl"): New expand.
	(define_insn "*nvptx_membar_gl"): New insn.
This commit is contained in:
Tom de Vries 2022-01-21 10:57:43 +01:00
parent 07a971b28c
commit ca902055d0
2 changed files with 49 additions and 5 deletions

View File

@ -5622,6 +5622,8 @@ enum nvptx_builtins
NVPTX_BUILTIN_VECTOR_ADDR,
NVPTX_BUILTIN_CMP_SWAP,
NVPTX_BUILTIN_CMP_SWAPLL,
NVPTX_BUILTIN_MEMBAR_GL,
NVPTX_BUILTIN_MEMBAR_CTA,
NVPTX_BUILTIN_MAX
};
@ -5652,6 +5654,7 @@ nvptx_init_builtins (void)
#define UINT unsigned_type_node
#define LLUINT long_long_unsigned_type_node
#define PTRVOID ptr_type_node
#define VOID void_type_node
DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
@ -5661,6 +5664,8 @@ nvptx_init_builtins (void)
(PTRVOID, ST, UINT, UINT, NULL_TREE));
DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
DEF (MEMBAR_GL, "membar_gl", (VOID, VOID, NULL_TREE));
DEF (MEMBAR_CTA, "membar_cta", (VOID, VOID, NULL_TREE));
#undef DEF
#undef ST
@ -5696,6 +5701,14 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
case NVPTX_BUILTIN_CMP_SWAPLL:
return nvptx_expand_cmp_swap (exp, target, mode, ignore);
case NVPTX_BUILTIN_MEMBAR_GL:
emit_insn (gen_nvptx_membar_gl ());
return NULL_RTX;
case NVPTX_BUILTIN_MEMBAR_CTA:
emit_insn (gen_nvptx_membar_cta ());
return NULL_RTX;
default: gcc_unreachable ();
}
}
@ -6243,7 +6256,7 @@ nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
static tree
nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
tree ptr, tree var, tree_code op)
tree ptr, tree var, tree_code op, int level)
{
tree var_type = TREE_TYPE (var);
tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
@ -6295,8 +6308,17 @@ nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
lock_loop->any_estimate = true;
add_loop (lock_loop, entry_bb->loop_father);
/* Build and insert the reduction calculation. */
/* Build the pre-barrier. */
gimple_seq red_seq = NULL;
enum nvptx_builtins barrier_builtin
= (level == GOMP_DIM_GANG
? NVPTX_BUILTIN_MEMBAR_GL
: NVPTX_BUILTIN_MEMBAR_CTA);
tree barrier_fn = nvptx_builtin_decl (barrier_builtin, true);
tree barrier_expr = build_call_expr_loc (loc, barrier_fn, 0);
gimplify_stmt (&barrier_expr, &red_seq);
/* Build the reduction calculation. */
tree acc_in = make_ssa_name (var_type);
tree ref_in = build_simple_mem_ref (ptr);
TREE_THIS_VOLATILE (ref_in) = 1;
@ -6310,6 +6332,11 @@ nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
TREE_THIS_VOLATILE (ref_out) = 1;
gimplify_assign (ref_out, acc_out, &red_seq);
/* Build the post-barrier. */
barrier_expr = build_call_expr_loc (loc, barrier_fn, 0);
gimplify_stmt (&barrier_expr, &red_seq);
/* Insert the reduction calculation. */
gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
/* Build & insert the unlock sequence. */
@ -6330,7 +6357,7 @@ nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
static tree
nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
tree ptr, tree var, tree_code op)
tree ptr, tree var, tree_code op, int level)
{
tree type = TREE_TYPE (var);
tree size = TYPE_SIZE (type);
@ -6339,7 +6366,7 @@ nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
|| size == TYPE_SIZE (long_long_unsigned_type_node))
return nvptx_lockless_update (loc, gsi, ptr, var, op);
else
return nvptx_lockfull_update (loc, gsi, ptr, var, op);
return nvptx_lockfull_update (loc, gsi, ptr, var, op, level);
}
/* NVPTX implementation of GOACC_REDUCTION_SETUP. */
@ -6531,7 +6558,7 @@ nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
seq = NULL;
r = nvptx_reduction_update (gimple_location (call), &gsi,
accum, var, op);
accum, var, op, level);
}
}

View File

@ -58,6 +58,7 @@
UNSPECV_BARSYNC
UNSPECV_MEMBAR
UNSPECV_MEMBAR_CTA
UNSPECV_MEMBAR_GL
UNSPECV_DIM_POS
UNSPECV_FORK
@ -1932,6 +1933,22 @@
"\\tmembar.cta;"
[(set_attr "predicable" "false")])
;; Expander for the membar.gl barrier.  The pattern takes no operands from
;; the caller: operand 0 is only referenced through match_dup and is created
;; by the preparation code below as a volatile BLKmode MEM whose address is a
;; SCRATCH.  The generated RTL sets that MEM from an unspec_volatile of
;; itself, tagged UNSPECV_MEMBAR_GL.
;; NOTE(review): presumably the wildcard volatile MEM is what stops other
;; memory accesses from being moved across the barrier -- confirm.
(define_expand "nvptx_membar_gl"
[(set (match_dup 0)
(unspec_volatile:BLK [(match_dup 0)] UNSPECV_MEMBAR_GL))]
""
{
operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
MEM_VOLATILE_P (operands[0]) = 1;
})
;; Insn with the same shape as the RTL built by the nvptx_membar_gl
;; expander: a set of a BLKmode operand from an unspec_volatile of that same
;; operand, tagged UNSPECV_MEMBAR_GL.  It has no condition, outputs the PTX
;; instruction "membar.gl;" and is marked as not predicable.
(define_insn "*nvptx_membar_gl"
[(set (match_operand:BLK 0 "" "")
(unspec_volatile:BLK [(match_dup 0)] UNSPECV_MEMBAR_GL))]
""
"\\tmembar.gl;"
[(set_attr "predicable" "false")])
(define_insn "nvptx_nounroll"
[(unspec_volatile [(const_int 0)] UNSPECV_NOUNROLL)]
""