[BRIGFE] phsa-specific optimizations

Add flag -fassume-phsa that is on by default. If -fno-assume-phsa
is given, these optimizations are disabled.

With this flag, gccbrig can generate GENERIC that assumes we are
targeting a phsa-runtime based implementation. This allows us to
expose the work-item context accesses used to retrieve WI IDs etc.,
which helps the optimizers.

The first optimization that takes advantage of this gets rid of
the setworkitemid calls whenever we have non-inlined calls that
use IDs internally.

Other optimizations added in this commit:

- expand absoluteid to a similar level of simplicity as workitemid.
At the moment absoluteid is the best indexing ID to end up with
WG vectorization.
- propagate ID variables closer to their uses. This is mainly
to avoid known useless casts, which confuse at least scalar
evolution analysis.
- use signed long long for storing IDs. Unsigned integers have
defined wraparound semantics, which confuse at least scalar
evolution analysis, leading to unvectorizable WI loops.
- also refactor some BRIG function generation helpers to brig_function.
- no point in having the wi-loop as a for-loop. It's really
a do...while and SCEV can analyze it just fine still.
- add consts to ptrs etc. in BRIG builtin defs.
Improves optimization opportunities.
- add qualifiers to generated function parameters.
Const and restrict on the hidden local/private pointers,
the arg buffer and the context pointer help some optimizations.

From-SVN: r259957
This commit is contained in:
Pekka Jääskeläinen 2018-05-04 19:43:57 +00:00
parent 1e25c5a9bb
commit 080dc24383
25 changed files with 1257 additions and 814 deletions

View File

@ -1,3 +1,9 @@
2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com>
* brig-builtins.def: Add consts to ptrs etc. in BRIG builtin defs
to improve optimization opportunities.
* builtin-types.def: Add the new builtin types needed for the above.
2018-05-04 Richard Biener <rguenther@suse.de>
* bb-reorder.c (sanitize_hot_paths): Release hot_bbs_to_check.

View File

@ -45,25 +45,25 @@ DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_GRIDSIZE, BRIG_OPCODE_GRIDSIZE,
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMFLATABSID_U32,
BRIG_OPCODE_WORKITEMFLATABSID, BRIG_TYPE_U32,
"__hsail_workitemflatabsid_u32", BT_FN_UINT_PTR,
ATTR_NOTHROW_LEAF_LIST)
"__hsail_workitemflatabsid_u32", BT_FN_UINT_CONST_PTR,
ATTR_PURE_NOTHROW_LEAF_LIST)
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMFLATABSID_U64,
BRIG_OPCODE_WORKITEMFLATABSID, BRIG_TYPE_U64,
"__hsail_workitemflatabsid_u64", BT_FN_ULONG_PTR,
ATTR_NOTHROW_LEAF_LIST)
"__hsail_workitemflatabsid_u64", BT_FN_ULONG_CONST_PTR,
ATTR_PURE_NOTHROW_LEAF_LIST)
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMFLATID, BRIG_OPCODE_WORKITEMFLATID,
BRIG_TYPE_U32, "__hsail_workitemflatid", BT_FN_UINT_PTR,
ATTR_NOTHROW_LEAF_LIST)
BRIG_TYPE_U32, "__hsail_workitemflatid", BT_FN_UINT_CONST_PTR,
ATTR_PURE_NOTHROW_LEAF_LIST)
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMID, BRIG_OPCODE_WORKITEMID,
BRIG_TYPE_U32, "__hsail_workitemid", BT_FN_UINT_UINT_PTR,
ATTR_NOTHROW_LEAF_LIST)
BRIG_TYPE_U32, "__hsail_workitemid",
BT_FN_UINT_UINT_CONST_PTR, ATTR_PURE_NOTHROW_LEAF_LIST)
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKGROUPID, BRIG_OPCODE_WORKGROUPID,
BRIG_TYPE_U32, "__hsail_workgroupid", BT_FN_UINT_UINT_PTR,
ATTR_PURE_NOTHROW_LEAF_LIST)
BRIG_TYPE_U32, "__hsail_workgroupid",
BT_FN_UINT_UINT_CONST_PTR, ATTR_PURE_NOTHROW_LEAF_LIST)
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_CURRENTWORKITEMFLATID,
BRIG_OPCODE_CURRENTWORKITEMFLATID,
@ -90,11 +90,12 @@ DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_PACKETCOMPLETIONSIG_SIG32,
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_CURRENTWORKGROUPSIZE,
BRIG_OPCODE_CURRENTWORKGROUPSIZE, BRIG_TYPE_U32,
"__hsail_currentworkgroupsize", BT_FN_UINT_UINT_PTR,
"__hsail_currentworkgroupsize", BT_FN_UINT_UINT_CONST_PTR,
ATTR_PURE_NOTHROW_LEAF_LIST)
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKGROUPSIZE, BRIG_OPCODE_WORKGROUPSIZE,
BRIG_TYPE_U32, "__hsail_workgroupsize", BT_FN_UINT_UINT_PTR,
BRIG_TYPE_U32, "__hsail_workgroupsize",
BT_FN_UINT_UINT_CONST_PTR,
ATTR_PURE_NOTHROW_LEAF_LIST)
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_DIM, BRIG_OPCODE_DIM,
@ -565,7 +566,7 @@ DEF_HSAIL_INTR_BUILTIN (BUILT_IN_HSAIL_SETWORKITEMID, "__hsail_setworkitemid",
DEF_HSAIL_INTR_BUILTIN (BUILT_IN_HSAIL_LAUNCH_WG_FUNC,
"__hsail_launch_wg_function",
BT_FN_VOID_PTR_PTR_PTR, ATTR_NOTHROW_LEAF_LIST)
BT_FN_VOID_PTR_PTR_UINT32, ATTR_NOTHROW_LEAF_LIST)
DEF_HSAIL_INTR_BUILTIN (BUILT_IN_HSAIL_LAUNCH_KERNEL,
"__hsail_launch_kernel",

View File

@ -1,3 +1,46 @@
2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com>
Add flag -fassume-phsa that is on by default. If -fno-assume-phsa
is given, these optimizations are disabled. With this flag, gccbrig
can generate GENERIC that assumes we are targeting a phsa-runtime
based implementation. This allows us to expose the work-item context
accesses used to retrieve WI IDs etc., which helps the optimizers.
The first optimization that takes advantage of this gets rid of
the setworkitemid calls whenever we have non-inlined calls that
use IDs internally. Other optimizations added in this commit:
- expand absoluteid to a similar level of simplicity as workitemid.
At the moment absoluteid is the best indexing ID to end up with
WG vectorization.
- propagate ID variables closer to their uses. This is mainly
to avoid known useless casts, which confuse at least scalar
evolution analysis.
- use signed long long for storing IDs. Unsigned integers have
defined wraparound semantics, which confuse at least scalar
evolution analysis, leading to unvectorizable WI loops.
- also refactor some BRIG function generation helpers to brig_function.
- no point in having the wi-loop as a for-loop. It's really
a do...while and SCEV can analyze it just fine still.
- add consts to ptrs etc. in BRIG builtin defs.
Improves optimization opportunities.
- add qualifiers to generated function parameters.
Const and restrict on the hidden local/private pointers,
the arg buffer and the context pointer help some optimizations.
* brig/brigfrontend/brig-basic-inst-handler.cc: See above.
* brig/brigfrontend/brig-branch-inst-handler.cc: See above.
* brig/brigfrontend/brig-cmp-inst-handler.cc: See above.
* brig/brigfrontend/brig-code-entry-handler.cc: See above.
* brig/brigfrontend/brig-code-entry-handler.h: See above.
* brig/brigfrontend/brig-control-handler.cc: See above.
* brig/brigfrontend/brig-cvt-inst-handler.cc: See above.
* brig/brigfrontend/brig-function-handler.cc: See above.
* brig/brigfrontend/brig-function.cc: See above.
* brig/brigfrontend/brig-function.h: See above.
* brig/brigfrontend/brig-label-handler.cc: See above.
* brig/brigfrontend/brig-lane-inst-handler.cc: See above.
* brig/brigfrontend/brig-mem-inst-handler.cc: See above.
* brig/brigfrontend/phsa.h: See above.
* brig/lang.opt: See above.
2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com>
* brig/brigfrontend/brig-function-handler.cc: Skip multiple forward

View File

@ -105,7 +105,8 @@ brig_basic_inst_handler::build_shuffle (tree arith_type,
/* Unpack the tightly packed mask elements to BIT_FIELD_REFs
from which to construct the mask vector as understood by
VEC_PERM_EXPR. */
tree mask_operand = add_temp_var ("shuffle_mask", operands[2]);
tree mask_operand
= m_parent.m_cf->add_temp_var ("shuffle_mask", operands[2]);
tree mask_element_type
= build_nonstandard_integer_type (input_mask_element_size, true);
@ -219,10 +220,11 @@ brig_basic_inst_handler::build_pack (tree_stl_vec &operands)
tree wide_type = build_nonstandard_integer_type (vecsize, 1);
tree src_vect = build_resize_convert_view (wide_type, operands[0]);
src_vect = add_temp_var ("src_vect", src_vect);
src_vect = m_parent.m_cf->add_temp_var ("src_vect", src_vect);
tree scalar = operands[1];
scalar = add_temp_var ("scalar", convert_to_integer (wide_type, scalar));
scalar = m_parent.m_cf->add_temp_var ("scalar",
convert_to_integer (wide_type, scalar));
tree pos = operands[2];
@ -230,21 +232,22 @@ brig_basic_inst_handler::build_pack (tree_stl_vec &operands)
Zero them for well-defined semantics. */
tree t = build2 (BIT_AND_EXPR, TREE_TYPE (pos), operands[2],
build_int_cstu (TREE_TYPE (pos), ecount - 1));
pos = add_temp_var ("pos", convert (wide_type, t));
pos = m_parent.m_cf->add_temp_var ("pos", convert (wide_type, t));
tree element_type = TREE_TYPE (TREE_TYPE (operands[0]));
size_t element_width = int_size_in_bytes (element_type) * BITS_PER_UNIT;
tree ewidth = build_int_cstu (wide_type, element_width);
tree bitoffset = build2 (MULT_EXPR, wide_type, ewidth, pos);
bitoffset = add_temp_var ("offset", bitoffset);
bitoffset = m_parent.m_cf->add_temp_var ("offset", bitoffset);
uint64_t mask_int
= element_width == 64 ? (uint64_t) -1 : ((uint64_t) 1 << element_width) - 1;
tree mask = build_int_cstu (wide_type, mask_int);
mask = add_temp_var ("mask", convert_to_integer (wide_type, mask));
mask = m_parent.m_cf->add_temp_var ("mask",
convert_to_integer (wide_type, mask));
tree clearing_mask
= build1 (BIT_NOT_EXPR, wide_type,
@ -311,7 +314,8 @@ brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode,
tree arith_type,
tree_stl_vec &operands)
{
tree_code opcode = get_tree_code_for_hsa_opcode (brig_opcode, brig_type);
tree_code opcode
= brig_function::get_tree_code_for_hsa_opcode (brig_opcode, brig_type);
BrigType16_t inner_type = brig_type & BRIG_TYPE_BASE_MASK;
@ -388,8 +392,8 @@ brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode,
on which cannot be used in general to remain HSAIL compliant.
Perhaps a builtin call would be better option here. */
return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
expand_or_call_builtin (BRIG_OPCODE_SQRT, brig_type,
arith_type, operands));
m_parent.m_cf->expand_or_call_builtin
(BRIG_OPCODE_SQRT, brig_type, arith_type, operands));
}
else if (brig_opcode == BRIG_OPCODE_NRCP)
{
@ -410,8 +414,8 @@ brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode,
gcc_unreachable ();
}
else if (opcode == CALL_EXPR)
return expand_or_call_builtin (brig_opcode, brig_type, arith_type,
operands);
return m_parent.m_cf->expand_or_call_builtin (brig_opcode, brig_type,
arith_type, operands);
else if (output_count == 1)
{
if (input_count == 1)
@ -520,7 +524,8 @@ brig_basic_inst_handler::operator () (const BrigBase *base)
in_operands[0] = build_lower_element_broadcast (in_operands[0]);
tree_code opcode
= get_tree_code_for_hsa_opcode (brig_inst->opcode, brig_inst_type);
= brig_function::get_tree_code_for_hsa_opcode (brig_inst->opcode,
brig_inst_type);
if (p >= BRIG_PACK_PPSAT && p <= BRIG_PACK_PSAT)
{
@ -566,11 +571,11 @@ brig_basic_inst_handler::operator () (const BrigBase *base)
*/
tree_stl_vec operand0_elements;
if (input_count > 0)
unpack (in_operands[0], operand0_elements);
m_parent.m_cf->unpack (in_operands[0], operand0_elements);
tree_stl_vec operand1_elements;
if (input_count > 1)
unpack (in_operands[1], operand1_elements);
m_parent.m_cf->unpack (in_operands[1], operand1_elements);
tree_stl_vec result_elements;
@ -617,7 +622,7 @@ brig_basic_inst_handler::operator () (const BrigBase *base)
result_elements.push_back (convert (scalar_type, scalar_expr));
}
instr_expr = pack (result_elements);
instr_expr = m_parent.m_cf->pack (result_elements);
}
else
{
@ -728,140 +733,3 @@ brig_basic_inst_handler::build_lower_element_broadcast (tree vec_operand)
vec_operand, mask);
}
/* Returns the tree code that should be used to implement the given
HSA instruction opcode (BRIG_OPCODE) for the given type of instruction
(BRIG_TYPE). In case the opcode cannot be mapped to a TREE node directly,
returns TREE_LIST (if it can be emulated with a simple chain of tree
nodes) or CALL_EXPR if the opcode should be implemented using a builtin
call. */
tree_code
brig_basic_inst_handler::get_tree_code_for_hsa_opcode
(BrigOpcode16_t brig_opcode, BrigType16_t brig_type) const
{
BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK;
switch (brig_opcode)
{
case BRIG_OPCODE_NOP:
return NOP_EXPR;
case BRIG_OPCODE_ADD:
return PLUS_EXPR;
case BRIG_OPCODE_CMOV:
if (brig_inner_type == brig_type)
return COND_EXPR;
else
return VEC_COND_EXPR;
case BRIG_OPCODE_SUB:
return MINUS_EXPR;
case BRIG_OPCODE_MUL:
case BRIG_OPCODE_MUL24:
return MULT_EXPR;
case BRIG_OPCODE_MULHI:
case BRIG_OPCODE_MUL24HI:
return MULT_HIGHPART_EXPR;
case BRIG_OPCODE_DIV:
if (gccbrig_is_float_type (brig_inner_type))
return RDIV_EXPR;
else
return TRUNC_DIV_EXPR;
case BRIG_OPCODE_NEG:
return NEGATE_EXPR;
case BRIG_OPCODE_MIN:
if (gccbrig_is_float_type (brig_inner_type))
return CALL_EXPR;
else
return MIN_EXPR;
case BRIG_OPCODE_MAX:
if (gccbrig_is_float_type (brig_inner_type))
return CALL_EXPR;
else
return MAX_EXPR;
case BRIG_OPCODE_FMA:
return FMA_EXPR;
case BRIG_OPCODE_ABS:
return ABS_EXPR;
case BRIG_OPCODE_SHL:
return LSHIFT_EXPR;
case BRIG_OPCODE_SHR:
return RSHIFT_EXPR;
case BRIG_OPCODE_OR:
return BIT_IOR_EXPR;
case BRIG_OPCODE_XOR:
return BIT_XOR_EXPR;
case BRIG_OPCODE_AND:
return BIT_AND_EXPR;
case BRIG_OPCODE_NOT:
return BIT_NOT_EXPR;
case BRIG_OPCODE_RET:
return RETURN_EXPR;
case BRIG_OPCODE_MOV:
case BRIG_OPCODE_LDF:
return MODIFY_EXPR;
case BRIG_OPCODE_LD:
case BRIG_OPCODE_ST:
return MEM_REF;
case BRIG_OPCODE_BR:
return GOTO_EXPR;
case BRIG_OPCODE_REM:
if (brig_type == BRIG_TYPE_U64 || brig_type == BRIG_TYPE_U32)
return TRUNC_MOD_EXPR;
else
return CALL_EXPR;
case BRIG_OPCODE_NRCP:
case BRIG_OPCODE_NRSQRT:
/* Implement as 1/f (x). gcc should pattern detect that and
use a native instruction, if available, for it. */
return TREE_LIST;
case BRIG_OPCODE_FLOOR:
case BRIG_OPCODE_CEIL:
case BRIG_OPCODE_SQRT:
case BRIG_OPCODE_NSQRT:
case BRIG_OPCODE_RINT:
case BRIG_OPCODE_TRUNC:
case BRIG_OPCODE_POPCOUNT:
case BRIG_OPCODE_COPYSIGN:
case BRIG_OPCODE_NCOS:
case BRIG_OPCODE_NSIN:
case BRIG_OPCODE_NLOG2:
case BRIG_OPCODE_NEXP2:
case BRIG_OPCODE_NFMA:
/* Class has type B1 regardless of the float type, thus
the below builtin map search cannot find it. */
case BRIG_OPCODE_CLASS:
case BRIG_OPCODE_WORKITEMABSID:
return CALL_EXPR;
default:
/* Some BRIG opcodes can use the same builtins for unsigned and
signed types. Force these cases to unsigned types.
*/
if (brig_opcode == BRIG_OPCODE_BORROW
|| brig_opcode == BRIG_OPCODE_CARRY
|| brig_opcode == BRIG_OPCODE_LASTBIT
|| brig_opcode == BRIG_OPCODE_BITINSERT)
{
if (brig_type == BRIG_TYPE_S32)
brig_type = BRIG_TYPE_U32;
else if (brig_type == BRIG_TYPE_S64)
brig_type = BRIG_TYPE_U64;
}
builtin_map::const_iterator i
= s_custom_builtins.find (std::make_pair (brig_opcode, brig_type));
if (i != s_custom_builtins.end ())
return CALL_EXPR;
else if (s_custom_builtins.find
(std::make_pair (brig_opcode, brig_inner_type))
!= s_custom_builtins.end ())
return CALL_EXPR;
if (brig_inner_type == BRIG_TYPE_F16
&& s_custom_builtins.find
(std::make_pair (brig_opcode, BRIG_TYPE_F32))
!= s_custom_builtins.end ())
return CALL_EXPR;
break;
}
return TREE_LIST; /* Emulate using a chain of nodes. */
}

View File

@ -119,10 +119,11 @@ brig_branch_inst_handler::operator () (const BrigBase *base)
memory. */
tree group_local_offset
= add_temp_var ("group_local_offset",
build_int_cst
(uint32_type_node,
m_parent.m_cf->m_local_group_variables.size()));
= m_parent.m_cf->add_temp_var ("group_local_offset",
build_int_cst
(uint32_type_node,
m_parent.m_cf->
m_local_group_variables.size()));
/* TODO: ensure the callee's frame is aligned! */
@ -152,6 +153,7 @@ brig_branch_inst_handler::operator () (const BrigBase *base)
m_parent.m_cf->m_called_functions.push_back (func_ref);
if (DECL_EXTERNAL (func_ref))
m_parent.add_decl_call (call);
m_parent.m_cf->start_new_bb ();
return base->byteCount;
}
@ -216,18 +218,21 @@ brig_branch_inst_handler::operator () (const BrigBase *base)
ensure the barrier won't be duplicated or moved out of loops etc.
Like the 'noduplicate' of LLVM. Same goes for fbarriers. */
m_parent.m_cf->append_statement
(expand_or_call_builtin (brig_inst->opcode, BRIG_TYPE_NONE, NULL_TREE,
call_operands));
(m_parent.m_cf->expand_or_call_builtin (brig_inst->opcode,
BRIG_TYPE_NONE, NULL_TREE,
call_operands));
}
else if (brig_inst->opcode >= BRIG_OPCODE_ARRIVEFBAR
&& brig_inst->opcode <= BRIG_OPCODE_WAITFBAR)
{
m_parent.m_cf->m_has_barriers = true;
m_parent.m_cf->append_statement
(expand_or_call_builtin (brig_inst->opcode, BRIG_TYPE_NONE,
uint32_type_node, operands));
(m_parent.m_cf->expand_or_call_builtin (brig_inst->opcode,
BRIG_TYPE_NONE,
uint32_type_node, operands));
}
else
gcc_unreachable ();
m_parent.m_cf->start_new_bb ();
return base->byteCount;
}

View File

@ -180,17 +180,17 @@ brig_cmp_inst_handler::operator () (const BrigBase *base)
results, we must now truncate the result vector to S16s so it
fits to the destination register. We can build the target vector
type from the f16 storage type (unsigned ints). */
expr = add_temp_var ("wide_cmp_result", expr);
expr = m_parent.m_cf->add_temp_var ("wide_cmp_result", expr);
tree_stl_vec wide_elements;
tree_stl_vec shrunk_elements;
unpack (expr, wide_elements);
m_parent.m_cf->unpack (expr, wide_elements);
for (size_t i = 0; i < wide_elements.size (); ++i)
{
tree wide = wide_elements.at (i);
shrunk_elements.push_back
(convert_to_integer (short_integer_type_node, wide));
}
expr = pack (shrunk_elements);
expr = m_parent.m_cf->pack (shrunk_elements);
}
build_output_assignment (*inst_base, operands[0], expr);

View File

@ -41,24 +41,9 @@
#include "brig-builtins.h"
#include "fold-const.h"
brig_code_entry_handler::builtin_map brig_code_entry_handler::s_custom_builtins;
/* Constructs a code entry handler attached to PARENT.  The first
   construction also fills the shared s_custom_builtins index; later
   constructions find the index non-empty and return right away.  */
brig_code_entry_handler::brig_code_entry_handler (brig_to_generic &parent)
: brig_entry_handler (parent)
{
if (s_custom_builtins.size () > 0) return;
/* Populate the builtin index. */
/* Drop any earlier definitions of the DEF_HSAIL_* macros so that
   DEF_HSAIL_BUILTIN can be redefined below before brig-builtins.def is
   included.  NOTE(review): presumably the .def file supplies defaults for
   the variants left undefined here -- confirm.  */
#undef DEF_HSAIL_ATOMIC_BUILTIN
#undef DEF_HSAIL_CVT_ZEROI_SAT_BUILTIN
#undef DEF_HSAIL_INTR_BUILTIN
#undef DEF_HSAIL_SAT_BUILTIN
#undef DEF_HSAIL_BUILTIN
/* Each DEF_HSAIL_BUILTIN entry stores the builtin decl of ENUM under the
   key (HSAIL_OPCODE, HSAIL_TYPE).  */
#define DEF_HSAIL_BUILTIN(ENUM, HSAIL_OPCODE, HSAIL_TYPE, NAME, TYPE, ATTRS) \
s_custom_builtins[std::make_pair (HSAIL_OPCODE, HSAIL_TYPE)] \
= builtin_decl_explicit (ENUM);
#include "brig-builtins.def"
}
/* Build a tree operand which is a reference to a piece of code. REF is the
@ -301,18 +286,18 @@ brig_code_entry_handler::build_address_operand
tree local_size
= build2 (MULT_EXPR, uint32_type_node,
expand_or_call_builtin (BRIG_OPCODE_WORKGROUPSIZE,
BRIG_TYPE_U32,
uint32_type_node, uint32_0),
expand_or_call_builtin (BRIG_OPCODE_WORKGROUPSIZE,
BRIG_TYPE_U32,
uint32_type_node, uint32_1));
m_parent.m_cf->expand_or_call_builtin
(BRIG_OPCODE_WORKGROUPSIZE, BRIG_TYPE_U32,
uint32_type_node, uint32_0),
m_parent.m_cf->expand_or_call_builtin
(BRIG_OPCODE_WORKGROUPSIZE, BRIG_TYPE_U32,
uint32_type_node, uint32_1));
local_size
= build2 (MULT_EXPR, uint32_type_node,
expand_or_call_builtin (BRIG_OPCODE_WORKGROUPSIZE,
BRIG_TYPE_U32,
uint32_type_node, uint32_2),
m_parent.m_cf->expand_or_call_builtin
(BRIG_OPCODE_WORKGROUPSIZE, BRIG_TYPE_U32,
uint32_type_node, uint32_2),
local_size);
tree var_region
@ -324,9 +309,9 @@ brig_code_entry_handler::build_address_operand
= build2 (MULT_EXPR, uint32_type_node,
build_int_cst (uint32_type_node,
m_parent.private_variable_size (var_name)),
expand_or_call_builtin (BRIG_OPCODE_WORKITEMFLATID,
BRIG_TYPE_U32,
uint32_type_node, operands));
m_parent.m_cf->expand_or_call_builtin
(BRIG_OPCODE_WORKITEMFLATID, BRIG_TYPE_U32,
uint32_type_node, operands));
tree var_offset
= build2 (PLUS_EXPR, uint32_type_node, var_region, pos);
@ -336,8 +321,9 @@ brig_code_entry_handler::build_address_operand
offset to a flat address by adding it as an offset to a (private
or group) base pointer later on. Same applies to group_var_offset. */
symbol_base
= add_temp_var ("priv_var_offset",
convert (size_type_node, var_offset));
= m_parent.m_cf->add_temp_var ("priv_var_offset",
convert (size_type_node,
var_offset));
}
else if (segment == BRIG_SEGMENT_ARG)
{
@ -699,138 +685,6 @@ brig_code_entry_handler::get_tree_expr_type_for_hsa_type
return gccbrig_tree_type_for_hsa_type (brig_type);
}
/* Finds the builtin function that implements the HSA instruction
   BRIG_OPCODE of type BRIG_TYPE when the instruction has to be generated
   as a builtin call.  TYPE is the tree type of the instruction.  For a
   vector TYPE the scalar builtin is located first and a vectorized version
   of it is then requested from the target; the scalar one is returned if
   the target has none.  */

tree
brig_code_entry_handler::get_builtin_for_hsa_opcode
  (tree type, BrigOpcode16_t brig_opcode, BrigType16_t brig_type) const
{
  /* The math builtins are searched with the element type of a vector.  */
  tree elem_type = type;
  if (type != NULL_TREE && VECTOR_TYPE_P (type))
    elem_type = TREE_TYPE (type);

  const BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK;

  /* Some BRIG opcodes share one builtin between the signed and unsigned
     types.  Look those up with the unsigned type.  */
  const bool sign_agnostic = (brig_opcode == BRIG_OPCODE_BORROW
			      || brig_opcode == BRIG_OPCODE_CARRY
			      || brig_opcode == BRIG_OPCODE_LASTBIT
			      || brig_opcode == BRIG_OPCODE_BITINSERT);
  if (sign_agnostic)
    {
      if (brig_type == BRIG_TYPE_S32)
	brig_type = BRIG_TYPE_U32;
      else if (brig_type == BRIG_TYPE_S64)
	brig_type = BRIG_TYPE_U64;
    }

  tree scalar_fn = NULL_TREE;

  switch (brig_opcode)
    {
    /* Opcodes served by the math builtins.  */
    case BRIG_OPCODE_FLOOR:
      scalar_fn = mathfn_built_in (elem_type, BUILT_IN_FLOOR);
      break;
    case BRIG_OPCODE_CEIL:
      scalar_fn = mathfn_built_in (elem_type, BUILT_IN_CEIL);
      break;
    case BRIG_OPCODE_SQRT:
    case BRIG_OPCODE_NSQRT:
      scalar_fn = mathfn_built_in (elem_type, BUILT_IN_SQRT);
      break;
    case BRIG_OPCODE_RINT:
      scalar_fn = mathfn_built_in (elem_type, BUILT_IN_RINT);
      break;
    case BRIG_OPCODE_TRUNC:
      scalar_fn = mathfn_built_in (elem_type, BUILT_IN_TRUNC);
      break;
    case BRIG_OPCODE_COPYSIGN:
      scalar_fn = mathfn_built_in (elem_type, BUILT_IN_COPYSIGN);
      break;
    case BRIG_OPCODE_NSIN:
      scalar_fn = mathfn_built_in (elem_type, BUILT_IN_SIN);
      break;
    case BRIG_OPCODE_NLOG2:
      scalar_fn = mathfn_built_in (elem_type, BUILT_IN_LOG2);
      break;
    case BRIG_OPCODE_NEXP2:
      scalar_fn = mathfn_built_in (elem_type, BUILT_IN_EXP2);
      break;
    case BRIG_OPCODE_NFMA:
      scalar_fn = mathfn_built_in (elem_type, BUILT_IN_FMA);
      break;
    case BRIG_OPCODE_NCOS:
      scalar_fn = mathfn_built_in (elem_type, BUILT_IN_COS);
      break;

    case BRIG_OPCODE_POPCOUNT:
      /* Popcount should be typed by its argument type (the return value
	 is always u32).  For now the b64 version is used for b32 too.  */
      return builtin_decl_explicit (BUILT_IN_POPCOUNTL);

    case BRIG_OPCODE_BORROW:
      {
	/* Borrow uses the same builtin for unsigned and signed types.  */
	const bool is_32bit
	  = brig_type == BRIG_TYPE_S32 || brig_type == BRIG_TYPE_U32;
	return builtin_decl_explicit (is_32bit
				      ? BUILT_IN_HSAIL_BORROW_U32
				      : BUILT_IN_HSAIL_BORROW_U64);
      }

    case BRIG_OPCODE_CARRY:
      {
	/* Carry also uses the same builtin for unsigned and signed types.  */
	const bool is_32bit
	  = brig_type == BRIG_TYPE_S32 || brig_type == BRIG_TYPE_U32;
	return builtin_decl_explicit (is_32bit
				      ? BUILT_IN_HSAIL_CARRY_U32
				      : BUILT_IN_HSAIL_CARRY_U64);
      }

    default:
      {
	/* The builtin index covers most of the builtins; only the special
	   cases need the separate 'case' labels above.  Try the exact
	   type first.  */
	builtin_map::const_iterator hit
	  = s_custom_builtins.find (std::make_pair (brig_opcode, brig_type));
	if (hit != s_custom_builtins.end ())
	  return (*hit).second;

	if (brig_inner_type != brig_type)
	  {
	    /* Try to find a scalar built-in we could use.  */
	    hit = s_custom_builtins.find
	      (std::make_pair (brig_opcode, brig_inner_type));
	    if (hit != s_custom_builtins.end ())
	      return (*hit).second;
	  }

	/* In case this is an fp16 operation that is promoted to fp32,
	   try to find a fp32 scalar built-in.  */
	if (brig_inner_type == BRIG_TYPE_F16)
	  {
	    hit = s_custom_builtins.find
	      (std::make_pair (brig_opcode, BRIG_TYPE_F32));
	    if (hit != s_custom_builtins.end ())
	      return (*hit).second;
	  }
	gcc_unreachable ();
      }
    }

  if (VECTOR_TYPE_P (type) && scalar_fn != NULL_TREE)
    {
      /* Try to find a vectorized version of the built-in.
	 TODO: properly assert that the builtin is a mathfn builtin?  */
      tree vector_fn
	= targetm.vectorize.builtin_vectorized_function
	    (builtin_mathfn_code (scalar_fn), type, type);
      return vector_fn != NULL_TREE ? vector_fn : scalar_fn;
    }

  if (scalar_fn == NULL_TREE)
    gcc_unreachable ();

  return scalar_fn;
}
/* Return the correct GENERIC type for storing comparison results
of operand with the type given in SOURCE_TYPE. */
@ -848,264 +702,6 @@ brig_code_entry_handler::get_comparison_result_type (tree source_type)
return gccbrig_tree_type_for_hsa_type (BRIG_TYPE_B1);
}
/* Tells whether the builtin implementing BRIG_OPCODE reads work-item
   context data.  Such builtins receive a pointer to a work-item context
   object as the last argument of the builtin call.  */

bool
brig_code_entry_handler::needs_workitem_context_data
  (BrigOpcode16_t brig_opcode) const
{
  bool uses_context = false;

  switch (brig_opcode)
    {
    /* Work-item and work-group identification.  */
    case BRIG_OPCODE_WORKITEMABSID:
    case BRIG_OPCODE_WORKITEMFLATABSID:
    case BRIG_OPCODE_WORKITEMFLATID:
    case BRIG_OPCODE_CURRENTWORKITEMFLATID:
    case BRIG_OPCODE_WORKITEMID:
    case BRIG_OPCODE_WORKGROUPID:
    /* Work-group and grid geometry.  */
    case BRIG_OPCODE_WORKGROUPSIZE:
    case BRIG_OPCODE_CURRENTWORKGROUPSIZE:
    case BRIG_OPCODE_GRIDGROUPS:
    case BRIG_OPCODE_GRIDSIZE:
    case BRIG_OPCODE_DIM:
    /* Dispatch packet queries.  */
    case BRIG_OPCODE_PACKETID:
    case BRIG_OPCODE_PACKETCOMPLETIONSIG:
    /* Barriers and fbarriers.  */
    case BRIG_OPCODE_BARRIER:
    case BRIG_OPCODE_WAVEBARRIER:
    case BRIG_OPCODE_ARRIVEFBAR:
    case BRIG_OPCODE_INITFBAR:
    case BRIG_OPCODE_JOINFBAR:
    case BRIG_OPCODE_LEAVEFBAR:
    case BRIG_OPCODE_RELEASEFBAR:
    case BRIG_OPCODE_WAITFBAR:
    /* The remaining context users.  */
    case BRIG_OPCODE_CUID:
    case BRIG_OPCODE_MAXCUID:
    case BRIG_OPCODE_DEBUGTRAP:
    case BRIG_OPCODE_GROUPBASEPTR:
    case BRIG_OPCODE_KERNARGBASEPTR:
    case BRIG_OPCODE_ALLOCA:
      uses_context = true;
      break;
    default:
      break;
    }

  return uses_context;
}
/* Tells whether BRIG_OPCODE, which would normally be generated as a
   builtin call, can be expanded to tree nodes instead.  */

bool
brig_code_entry_handler::can_expand_builtin (BrigOpcode16_t brig_opcode) const
{
  /* TODO: expand more builtins.  */
  return (brig_opcode == BRIG_OPCODE_WORKITEMFLATABSID
	  || brig_opcode == BRIG_OPCODE_WORKITEMFLATID
	  || brig_opcode == BRIG_OPCODE_WORKITEMABSID
	  || brig_opcode == BRIG_OPCODE_WORKGROUPSIZE
	  || brig_opcode == BRIG_OPCODE_CURRENTWORKGROUPSIZE);
}
/* Try to expand the given builtin call to reuse a previously generated
variable, if possible. If not, just call the given builtin.
BRIG_OPCODE and BRIG_TYPE identify the builtin's BRIG opcode/type,
ARITH_TYPE its GENERIC type, and OPERANDS contains the builtin's
input operands.

Returns the expanded expression when the current function is a kernel
and the opcode is expandable, a packed vector of per-element results
when a scalar builtin has to serve a vector ARITH_TYPE, and otherwise
a call expression to the builtin. */
tree
brig_code_entry_handler::expand_or_call_builtin (BrigOpcode16_t brig_opcode,
BrigType16_t brig_type,
tree arith_type,
tree_stl_vec &operands)
{
/* The expansion is only attempted when generating a kernel. */
if (m_parent.m_cf->m_is_kernel && can_expand_builtin (brig_opcode))
return expand_builtin (brig_opcode, operands);
tree built_in
= get_builtin_for_hsa_opcode (arith_type, brig_opcode, brig_type);
/* The builtin returns a non-vector but the instruction is a vector one.
LERP, PACKCVT, SAD and SADHI are excluded from the per-element
treatment. */
if (!VECTOR_TYPE_P (TREE_TYPE (TREE_TYPE (built_in)))
&& arith_type != NULL_TREE && VECTOR_TYPE_P (arith_type)
&& brig_opcode != BRIG_OPCODE_LERP
&& brig_opcode != BRIG_OPCODE_PACKCVT
&& brig_opcode != BRIG_OPCODE_SAD
&& brig_opcode != BRIG_OPCODE_SADHI)
{
/* Call the scalar built-in for all elements in the vector. */
/* Only the first two operands are unpacked and passed on. */
tree_stl_vec operand0_elements;
if (operands.size () > 0)
unpack (operands[0], operand0_elements);
tree_stl_vec operand1_elements;
if (operands.size () > 1)
unpack (operands[1], operand1_elements);
tree_stl_vec result_elements;
size_t element_count = gccbrig_type_vector_subparts (arith_type);
for (size_t i = 0; i < element_count; ++i)
{
tree_stl_vec call_operands;
if (operand0_elements.size () > 0)
call_operands.push_back (operand0_elements.at (i));
if (operand1_elements.size () > 0)
call_operands.push_back (operand1_elements.at (i));
/* Recurse with the element type of ARITH_TYPE for element I. */
result_elements.push_back
(expand_or_call_builtin (brig_opcode, brig_type,
TREE_TYPE (arith_type),
call_operands));
}
return pack (result_elements);
}
tree_stl_vec call_operands;
tree_stl_vec operand_types;
/* Convert each operand to the type of the matching builtin parameter,
walking the builtin's argument type chain in step with OPERANDS. */
tree arg_type_chain = TYPE_ARG_TYPES (TREE_TYPE (built_in));
for (size_t i = 0; i < operands.size (); ++i)
{
tree operand_type = TREE_VALUE (arg_type_chain);
call_operands.push_back (convert (operand_type, operands[i]));
operand_types.push_back (operand_type);
arg_type_chain = TREE_CHAIN (arg_type_chain);
}
/* Builtins that read work-item context data get the context pointer as
their last argument, and the function is flagged as containing such
an unexpanded builtin call. */
if (needs_workitem_context_data (brig_opcode))
{
call_operands.push_back (m_parent.m_cf->m_context_arg);
operand_types.push_back (ptr_type_node);
m_parent.m_cf->m_has_unexpanded_dp_builtins = true;
}
size_t operand_count = call_operands.size ();
/* NOTE(review): both vectors are resized to exactly four entries, so this
appears to assume at most four call operands -- confirm. */
call_operands.resize (4, NULL_TREE);
operand_types.resize (4, NULL_TREE);
for (size_t i = 0; i < operand_count; ++i)
call_operands.at (i) = build_resize_convert_view (operand_types.at (i),
call_operands.at (i));
/* Build the call through the address of the builtin, passing only the
OPERAND_COUNT real operands. */
tree fnptr = build_fold_addr_expr (built_in);
return build_call_array (TREE_TYPE (TREE_TYPE (built_in)), fnptr,
operand_count, &call_operands[0]);
}
/* Instead of calling a built-in, reuse a previously returned value known to
be still valid. This is beneficial especially for the work-item
identification related builtins as not having them as calls can lead to
more easily vectorizable parallel loops for multi work-item work-groups.
BRIG_OPCODE identifies the builtin and OPERANDS store the operands.

Handles WORKITEMFLATABSID, WORKITEMABSID, WORKITEMFLATID, WORKGROUPSIZE
and CURRENTWORKGROUPSIZE; any other opcode is treated as unreachable.
For the opcodes that take a dimension, OPERANDS[0] is read as an integer
constant and used as an index to the per-dimension variables of the
current function. */
tree
brig_code_entry_handler::expand_builtin (BrigOpcode16_t brig_opcode,
tree_stl_vec &operands)
{
/* Single-element operand vectors holding the dimension constants 0, 1
and 2, used for the recursive WORKITEMABSID expansions below. */
tree_stl_vec uint32_0 = tree_stl_vec (1, build_int_cst (uint32_type_node, 0));
tree_stl_vec uint32_1 = tree_stl_vec (1, build_int_cst (uint32_type_node, 1));
tree_stl_vec uint32_2 = tree_stl_vec (1, build_int_cst (uint32_type_node, 2));
if (brig_opcode == BRIG_OPCODE_WORKITEMFLATABSID)
{
/* id0 + id1 * max0 + id2 * max0 * max1, computed in 64 bits, where idN
is the absolute ID of dimension N and max0/max1 are the grid size
variables of dimensions 0 and 1. */
tree id0 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_0);
id0 = convert (uint64_type_node, id0);
tree id1 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_1);
id1 = convert (uint64_type_node, id1);
tree id2 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_2);
id2 = convert (uint64_type_node, id2);
tree max0 = convert (uint64_type_node,
m_parent.m_cf->m_grid_size_vars[0]);
tree max1 = convert (uint64_type_node,
m_parent.m_cf->m_grid_size_vars[1]);
tree id2_x_max0_x_max1 = build2 (MULT_EXPR, uint64_type_node, id2, max0);
id2_x_max0_x_max1
= build2 (MULT_EXPR, uint64_type_node, id2_x_max0_x_max1, max1);
tree id1_x_max0 = build2 (MULT_EXPR, uint64_type_node, id1, max0);
tree sum = build2 (PLUS_EXPR, uint64_type_node, id0, id1_x_max0);
sum = build2 (PLUS_EXPR, uint64_type_node, sum, id2_x_max0_x_max1);
return add_temp_var ("workitemflatabsid", sum);
}
else if (brig_opcode == BRIG_OPCODE_WORKITEMABSID)
{
/* wg_id * wg_size + local_id for the requested dimension, in 32 bits. */
HOST_WIDE_INT dim = int_constant_value (operands[0]);
tree local_id_var = m_parent.m_cf->m_local_id_vars[dim];
tree wg_id_var = m_parent.m_cf->m_wg_id_vars[dim];
tree wg_size_var = m_parent.m_cf->m_wg_size_vars[dim];
tree wg_id_x_wg_size = build2 (MULT_EXPR, uint32_type_node,
convert (uint32_type_node, wg_id_var),
convert (uint32_type_node, wg_size_var));
tree sum
= build2 (PLUS_EXPR, uint32_type_node, wg_id_x_wg_size, local_id_var);
/* The temp is named workitemabsid_x, _y or _z for DIM 0, 1 or 2. */
return add_temp_var (std::string ("workitemabsid_")
+ (char) ((int) 'x' + dim), sum);
}
else if (brig_opcode == BRIG_OPCODE_WORKITEMFLATID)
{
/* x + y * wgs_x + z * wgs_x * wgs_y, built from the local ID and
work-group size variables, in 32 bits. */
tree z_x_wgsx_wgsy
= build2 (MULT_EXPR, uint32_type_node,
m_parent.m_cf->m_local_id_vars[2],
m_parent.m_cf->m_wg_size_vars[0]);
z_x_wgsx_wgsy = build2 (MULT_EXPR, uint32_type_node, z_x_wgsx_wgsy,
m_parent.m_cf->m_wg_size_vars[1]);
tree y_x_wgsx
= build2 (MULT_EXPR, uint32_type_node,
m_parent.m_cf->m_local_id_vars[1],
m_parent.m_cf->m_wg_size_vars[0]);
tree sum = build2 (PLUS_EXPR, uint32_type_node, y_x_wgsx, z_x_wgsx_wgsy);
sum = build2 (PLUS_EXPR, uint32_type_node,
m_parent.m_cf->m_local_id_vars[0],
sum);
return add_temp_var ("workitemflatid", sum);
}
else if (brig_opcode == BRIG_OPCODE_WORKGROUPSIZE)
{
/* The size variable is returned directly; no new temp is added. */
HOST_WIDE_INT dim = int_constant_value (operands[0]);
return m_parent.m_cf->m_wg_size_vars[dim];
}
else if (brig_opcode == BRIG_OPCODE_CURRENTWORKGROUPSIZE)
{
/* Likewise for the current work-group size variable. */
HOST_WIDE_INT dim = int_constant_value (operands[0]);
return m_parent.m_cf->m_cur_wg_size_vars[dim];
}
else
gcc_unreachable ();
return NULL_TREE;
}
/* Creates a new temp variable called NAME with the type of EXPR, appends
   a statement that assigns EXPR to it to the current function, and returns
   the variable.  */

tree
brig_code_entry_handler::add_temp_var (std::string name, tree expr)
{
  tree var = create_tmp_var (TREE_TYPE (expr), name.c_str ());

  /* The assignment is typed by the variable that was created.  */
  m_parent.m_cf->append_statement
    (build2 (MODIFY_EXPR, TREE_TYPE (var), var, expr));

  return var;
}
/* Creates a FP32 to FP16 conversion call, assuming the source and destination
are FP32 type variables. */
@ -1387,7 +983,6 @@ brig_code_entry_handler::build_output_assignment (const BrigInstBase &brig_inst,
variable type (can be any type; see get_m_var_declfor_reg @
brig-function.cc). */
tree output_type = TREE_TYPE (output);
tree input_type = TREE_TYPE (inst_expr);
bool is_fp16 = (brig_inst.type & BRIG_TYPE_BASE_MASK) == BRIG_TYPE_F16
&& brig_inst.base.kind != BRIG_KIND_INST_MEM
&& !gccbrig_is_bit_operation (brig_inst.opcode);
@ -1396,6 +991,13 @@ brig_code_entry_handler::build_output_assignment (const BrigInstBase &brig_inst,
bool ftz = false;
const BrigBase *base = &brig_inst.base;
if (m_parent.m_cf->is_id_val (inst_expr))
inst_expr = m_parent.m_cf->id_val (inst_expr);
tree input_type = TREE_TYPE (inst_expr);
m_parent.m_cf->add_reg_var_update (output, inst_expr);
if (base->kind == BRIG_KIND_INST_MOD)
{
const BrigInstMod *mod = (const BrigInstMod *) base;
@ -1418,13 +1020,13 @@ brig_code_entry_handler::build_output_assignment (const BrigInstBase &brig_inst,
{
/* Ensure we don't duplicate the arithmetics to the arguments of the bit
field reference operators. */
inst_expr = add_temp_var ("before_ftz", inst_expr);
inst_expr = m_parent.m_cf->add_temp_var ("before_ftz", inst_expr);
inst_expr = flush_to_zero (is_fp16) (*this, inst_expr);
}
if (is_fp16)
{
inst_expr = add_temp_var ("before_f2h", inst_expr);
inst_expr = m_parent.m_cf->add_temp_var ("before_f2h", inst_expr);
tree f2h_output = build_f2h_conversion (inst_expr);
tree conv = build_resize_convert_view (output_type, f2h_output);
tree assign = build2 (MODIFY_EXPR, output_type, output, conv);
@ -1486,62 +1088,6 @@ brig_code_entry_handler::append_statement (tree stmt)
m_parent.m_cf->append_statement (stmt);
}
/* Splits the vector VALUE into scalars.  VALUE is first stored to a
   temporary; each element is then read from that temporary with a
   BIT_FIELD_REF, stored into a temporary of its own and appended to
   ELEMENTS in order of increasing bit offset.  */

void
brig_code_entry_handler::unpack (tree value, tree_stl_vec &elements)
{
  tree scalar_type = TREE_TYPE (TREE_TYPE (value));

  size_t total_bytes = int_size_in_bytes (TREE_TYPE (value));
  size_t bits_per_element = int_size_in_bytes (scalar_type) * BITS_PER_UNIT;
  size_t lanes = total_bytes * BITS_PER_UNIT / bits_per_element;

  tree vec_tmp = add_temp_var ("unpack_input", value);

  /* BIT_POS tracks the bit offset of the element being extracted.  */
  size_t bit_pos = 0;
  for (size_t lane = 0; lane != lanes; ++lane, bit_pos += bits_per_element)
    {
      tree ref = build3 (BIT_FIELD_REF, scalar_type, vec_tmp,
			 TYPE_SIZE (scalar_type), bitsize_int (bit_pos));
      elements.push_back (add_temp_var ("scalar", ref));
    }
}
/* Builds a vector from the scalars in ELEMENTS (more than one is
   required) and returns a temporary variable holding it.  The vector's
   element type is taken from the first scalar.  */

tree
brig_code_entry_handler::pack (tree_stl_vec &elements)
{
  const size_t lanes = elements.size ();
  gcc_assert (lanes > 1);

  tree scalar_type = TREE_TYPE (elements.at (0));

  vec<constructor_elt, va_gc> *ctor_elts = NULL;
  for (size_t lane = 0; lane != lanes; ++lane)
    CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elements.at (lane));

  tree vec_type = build_vector_type (scalar_type, lanes);

  /* A constructor is built rather than a vector_cst, because the latter
     requires compile time constant elements.  */
  tree ctor = build_constructor (vec_type, ctor_elts);

  /* Store the result to a named temporary for readability.  */
  tree result = create_tmp_var (vec_type, "vec_out");
  m_parent.m_cf->append_statement (build2 (MODIFY_EXPR, TREE_TYPE (result),
					   result, ctor));
  return result;
}
/* Visits the element(s) in the OPERAND, calling HANDLER to each of them. */
tree
@ -1757,4 +1303,3 @@ brig_code_entry_handler::int_constant_value (tree node)
n = TREE_OPERAND (n, 0);
return int_cst_value (n);
}

View File

@ -35,8 +35,6 @@ class tree_element_unary_visitor;
class brig_code_entry_handler : public brig_entry_handler
{
public:
typedef std::map<std::pair<BrigOpcode16_t, BrigType16_t>, tree> builtin_map;
brig_code_entry_handler (brig_to_generic &parent);
/* Handles the brig_code data at the given pointer and adds it to the
@ -51,8 +49,6 @@ protected:
tree get_tree_expr_type_for_hsa_type (BrigType16_t brig_type) const;
tree get_tree_cst_for_hsa_operand (const BrigOperandConstantBytes *brigConst,
tree type) const;
tree get_builtin_for_hsa_opcode (tree type, BrigOpcode16_t brig_opcode,
BrigType16_t brig_type) const;
tree get_comparison_result_type (tree source_type);
tree build_code_ref (const BrigBase &ref);
@ -73,16 +69,6 @@ protected:
bool needs_workitem_context_data (BrigOpcode16_t brig_opcode) const;
void unpack (tree value, tree_stl_vec &elements);
tree pack (tree_stl_vec &elements);
bool can_expand_builtin (BrigOpcode16_t brig_opcode) const;
tree expand_builtin (BrigOpcode16_t brig_opcode, tree_stl_vec &operands);
tree expand_or_call_builtin (BrigOpcode16_t brig_opcode,
BrigType16_t brig_type, tree arith_type,
tree_stl_vec &operands);
tree add_temp_var (std::string name, tree expr);
tree build_f2h_conversion (tree source);
@ -100,10 +86,6 @@ protected:
tree extend_int (tree input, tree dest_type, tree src_type);
/* HSAIL-specific builtin functions not yet integrated to gcc. */
static builtin_map s_custom_builtins;
private:
tree_stl_vec build_or_analyze_operands (const BrigInstBase &brig_inst,
@ -299,9 +281,6 @@ private:
tree build_unpack_lo_or_hi (BrigOpcode16_t brig_opcode, tree arith_type,
tree_stl_vec &operands);
tree_code get_tree_code_for_hsa_opcode (BrigOpcode16_t brig_opcode,
BrigType16_t brig_type) const;
};
class brig_cvt_inst_handler : public brig_inst_mod_handler

View File

@ -53,45 +53,45 @@ brig_directive_control_handler::operator () (const BrigBase *base)
case BRIG_CONTROL_MAXDYNAMICGROUPSIZE:
{
m_parent.m_cf->m_descriptor.max_dynamic_group_size
= int_constant_value (operands.at (0));
= brig_function::int_constant_value (operands.at (0));
break;
}
case BRIG_CONTROL_MAXFLATGRIDSIZE:
{
m_parent.m_cf->m_descriptor.max_flat_grid_size
= int_constant_value (operands.at (0));
= brig_function::int_constant_value (operands.at (0));
break;
}
case BRIG_CONTROL_MAXFLATWORKGROUPSIZE:
{
m_parent.m_cf->m_descriptor.max_flat_workgroup_size
= int_constant_value (operands.at (0));
= brig_function::int_constant_value (operands.at (0));
break;
}
case BRIG_CONTROL_REQUIREDDIM:
{
m_parent.m_cf->m_descriptor.required_dim
= int_constant_value (operands.at (0));
= brig_function::int_constant_value (operands.at (0));
break;
}
case BRIG_CONTROL_REQUIREDGRIDSIZE:
{
m_parent.m_cf->m_descriptor.required_grid_size[0]
= int_constant_value (operands.at (0));
= brig_function::int_constant_value (operands.at (0));
m_parent.m_cf->m_descriptor.required_grid_size[1]
= int_constant_value (operands.at (1));
= brig_function::int_constant_value (operands.at (1));
m_parent.m_cf->m_descriptor.required_grid_size[2]
= int_constant_value (operands.at (2));
= brig_function::int_constant_value (operands.at (2));
break;
}
case BRIG_CONTROL_REQUIREDWORKGROUPSIZE:
{
m_parent.m_cf->m_descriptor.required_workgroup_size[0]
= int_constant_value (operands.at (0));
= brig_function::int_constant_value (operands.at (0));
m_parent.m_cf->m_descriptor.required_workgroup_size[1]
= int_constant_value (operands.at (1));
= brig_function::int_constant_value (operands.at (1));
m_parent.m_cf->m_descriptor.required_workgroup_size[2]
= int_constant_value (operands.at (2));
= brig_function::int_constant_value (operands.at (2));
break;
}
case BRIG_CONTROL_REQUIRENOPARTIALWORKGROUPS:

View File

@ -83,6 +83,12 @@ brig_cvt_inst_handler::generate (const BrigBase *base)
tree &input = operands.at (1);
tree &output = operands.at (0);
if (m_parent.m_cf->is_id_val (input))
{
input = m_parent.m_cf->id_val (input);
src_type = TREE_TYPE (input);
}
size_t conv_src_size = int_size_in_bytes (src_type);
size_t conv_dst_size = int_size_in_bytes (dest_type);
size_t src_reg_size = int_size_in_bytes (TREE_TYPE (input));

View File

@ -93,6 +93,25 @@ brig_directive_function_handler::operator () (const BrigBase *base)
represent HSAIL registers. */
tree bind_expr = build3 (BIND_EXPR, void_type_node, NULL, stmt_list, NULL);
tree restrict_char_ptr
= build_qualified_type (build_pointer_type (char_type_node),
TYPE_QUAL_RESTRICT);
tree restrict_void_ptr
= build_qualified_type (build_pointer_type (void_type_node),
TYPE_QUAL_RESTRICT);
tree restrict_const_char_ptr
= build_qualified_type (build_pointer_type
(build_qualified_type (char_type_node,
TYPE_QUAL_CONST)),
TYPE_QUAL_RESTRICT);
tree restrict_const_void_ptr
= build_qualified_type (build_pointer_type
(build_qualified_type (void_type_node,
TYPE_QUAL_CONST)),
TYPE_QUAL_RESTRICT);
if (is_kernel)
{
tree name_identifier
@ -107,12 +126,11 @@ brig_directive_function_handler::operator () (const BrigBase *base)
3) a void* parameter that contains the first flat address of the group
region allocated to the current work-group. */
tree char_ptr_type_node = build_pointer_type (char_type_node);
fndecl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL, name_identifier,
build_function_type_list (void_type_node,
char_ptr_type_node,
ptr_type_node,
ptr_type_node, NULL_TREE));
restrict_const_char_ptr,
restrict_void_ptr,
restrict_char_ptr, NULL_TREE));
SET_DECL_ASSEMBLER_NAME (fndecl, name_identifier);
@ -125,9 +143,10 @@ brig_directive_function_handler::operator () (const BrigBase *base)
= gccbrig_get_target_addr_space_id (BRIG_SEGMENT_KERNARG);
tree arg_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL,
get_identifier ("__args"), char_ptr_type_node);
get_identifier ("__args"),
restrict_const_char_ptr);
DECL_ARGUMENTS (fndecl) = arg_arg;
DECL_ARG_TYPE (arg_arg) = char_ptr_type_node;
DECL_ARG_TYPE (arg_arg) = restrict_const_char_ptr;
DECL_CONTEXT (arg_arg) = fndecl;
DECL_ARTIFICIAL (arg_arg) = 1;
TREE_READONLY (arg_arg) = 1;
@ -189,7 +208,7 @@ brig_directive_function_handler::operator () (const BrigBase *base)
if (arg_decls == NULL_TREE)
arg_decls = arg_var;
else
chainon (arg_decls, arg_var);
arg_decls = chainon (arg_decls, arg_var);
m_parent.m_cf->add_arg_variable (brigVar, arg_var);
@ -230,18 +249,13 @@ brig_directive_function_handler::operator () (const BrigBase *base)
vec_safe_push (args, TREE_TYPE (arg_var));
m_parent.m_cf->add_arg_variable (brigVar, arg_var);
if (arg_decls == NULL_TREE)
arg_decls = arg_var;
else
chainon (arg_decls, arg_var);
arg_decls = chainon (arg_decls, arg_var);
}
}
vec_safe_push (args, ptr_type_node);
vec_safe_push (args, ptr_type_node);
vec_safe_push (args, ptr_type_node);
vec_safe_push (args, ptr_type_node);
vec_safe_push (args, restrict_void_ptr);
vec_safe_push (args, restrict_char_ptr);
vec_safe_push (args, uint32_type_node);
vec_safe_push (args, restrict_char_ptr);
fndecl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL, name_identifier,
build_function_type_vec (ret_type, args));
@ -254,26 +268,30 @@ brig_directive_function_handler::operator () (const BrigBase *base)
/* All functions need the hidden __context argument passed on
because they might call WI-specific functions which need
the context info. */
the context info. Only kernels can write it, if they need
to update the local ids in the work-item loop. */
tree context_arg_type
= true ? restrict_void_ptr : restrict_const_void_ptr;
tree context_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL,
get_identifier ("__context"), ptr_type_node);
if (DECL_ARGUMENTS (fndecl) == NULL_TREE)
DECL_ARGUMENTS (fndecl) = context_arg;
else
chainon (DECL_ARGUMENTS (fndecl), context_arg);
get_identifier ("__context"),
context_arg_type);
DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), context_arg);
DECL_CONTEXT (context_arg) = fndecl;
DECL_ARG_TYPE (context_arg) = ptr_type_node;
DECL_ARG_TYPE (context_arg) = context_arg_type;
DECL_ARTIFICIAL (context_arg) = 1;
TREE_READONLY (context_arg) = 1;
TREE_USED (context_arg) = 1;
m_parent.m_cf->m_context_arg = context_arg;
/* They can also access group memory, so we need to pass the
group pointer along too. */
tree group_base_arg
= build_decl (UNKNOWN_LOCATION, PARM_DECL,
get_identifier ("__group_base_addr"), ptr_type_node);
chainon (DECL_ARGUMENTS (fndecl), group_base_arg);
DECL_ARG_TYPE (group_base_arg) = ptr_type_node;
get_identifier ("__group_base_addr"),
restrict_char_ptr);
DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), group_base_arg);
DECL_ARG_TYPE (group_base_arg) = restrict_char_ptr;
DECL_CONTEXT (group_base_arg) = fndecl;
DECL_ARTIFICIAL (group_base_arg) = 1;
TREE_READONLY (group_base_arg) = 1;
@ -288,7 +306,7 @@ brig_directive_function_handler::operator () (const BrigBase *base)
tree group_local_offset_arg
= build_decl (UNKNOWN_LOCATION, PARM_DECL,
get_identifier ("__group_local_offset"), uint32_type_node);
chainon (DECL_ARGUMENTS (fndecl), group_local_offset_arg);
DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), group_local_offset_arg);
DECL_ARG_TYPE (group_local_offset_arg) = uint32_type_node;
DECL_CONTEXT (group_local_offset_arg) = fndecl;
DECL_ARTIFICIAL (group_local_offset_arg) = 1;
@ -299,24 +317,25 @@ brig_directive_function_handler::operator () (const BrigBase *base)
/* Same for private. */
tree private_base_arg
= build_decl (UNKNOWN_LOCATION, PARM_DECL,
get_identifier ("__private_base_addr"), ptr_type_node);
chainon (DECL_ARGUMENTS (fndecl), private_base_arg);
DECL_ARG_TYPE (private_base_arg) = ptr_type_node;
get_identifier ("__private_base_addr"), restrict_char_ptr);
DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), private_base_arg);
DECL_ARG_TYPE (private_base_arg) = restrict_char_ptr;
DECL_CONTEXT (private_base_arg) = fndecl;
DECL_ARTIFICIAL (private_base_arg) = 1;
TREE_READONLY (private_base_arg) = 1;
TREE_USED (private_base_arg) = 1;
m_parent.m_cf->m_private_base_arg = private_base_arg;
DECL_SAVED_TREE (fndecl) = bind_expr;
set_externally_visible (fndecl);
if (base->kind == BRIG_KIND_DIRECTIVE_FUNCTION)
{
TREE_STATIC (fndecl) = 0;
TREE_PUBLIC (fndecl) = 1;
DECL_EXTERNAL (fndecl) = 0;
DECL_DECLARED_INLINE_P (fndecl) = 1;
set_inline (fndecl);
set_externally_visible (fndecl);
}
else if (base->kind == BRIG_KIND_DIRECTIVE_KERNEL)
{
@ -330,6 +349,7 @@ brig_directive_function_handler::operator () (const BrigBase *base)
TREE_STATIC (fndecl) = 0;
TREE_PUBLIC (fndecl) = 1;
DECL_EXTERNAL (fndecl) = 1;
set_inline (fndecl);
}
else if (base->kind == BRIG_KIND_DIRECTIVE_INDIRECT_FUNCTION)
{
@ -371,11 +391,8 @@ brig_directive_function_handler::operator () (const BrigBase *base)
}
m_parent.start_function (fndecl);
m_parent.m_cf->m_func_decl = fndecl;
m_parent.m_cf->m_current_bind_expr = bind_expr;
m_parent.m_cf->m_context_arg = context_arg;
m_parent.m_cf->m_private_base_arg = private_base_arg;
if (ret_value != NULL_TREE && TREE_TYPE (ret_value) != void_type_node)
{

File diff suppressed because it is too large Load Diff

View File

@ -105,6 +105,30 @@ public:
void analyze_calls ();
tree expand_builtin (BrigOpcode16_t brig_opcode, tree_stl_vec &operands);
tree expand_or_call_builtin (BrigOpcode16_t brig_opcode,
BrigType16_t brig_type, tree arith_type,
tree_stl_vec &operands);
bool can_expand_builtin (BrigOpcode16_t brig_opcode) const;
tree get_builtin_for_hsa_opcode (tree type, BrigOpcode16_t brig_opcode,
BrigType16_t brig_type) const;
void unpack (tree value, tree_stl_vec &elements);
tree pack (tree_stl_vec &elements);
tree add_temp_var (std::string name, tree expr);
static bool needs_workitem_context_data (BrigOpcode16_t brig_opcode);
static HOST_WIDE_INT int_constant_value (tree node);
static tree_code get_tree_code_for_hsa_opcode (BrigOpcode16_t brig_opcode,
BrigType16_t brig_type);
void start_new_bb ();
void add_reg_var_update (tree reg_var, tree val);
bool is_id_val (tree reg_var);
tree id_val (tree reg_var);
const BrigDirectiveExecutable *m_brig_def;
bool m_is_kernel;
@ -183,6 +207,11 @@ public:
tree m_wg_id_vars[3];
tree m_wg_size_vars[3];
tree m_grid_size_vars[3];
/* Explicitly computed WG base for the absolute IDs which is used
as the initial value when looping that dimension. We update
the abs id with ++ to make it easy for the vectorizer. */
tree m_abs_id_base_vars[3];
tree m_abs_id_vars[3];
/* Set to true in case the kernel contains at least one dispatch packet
(work-item ID-related) builtin call that could not be expanded to
@ -219,6 +248,20 @@ private:
/* Bookkeeping for the different HSA registers and their tree declarations
for the currently generated function. */
reg_decl_index_entry *m_regs[BRIG_2_TREE_HSAIL_TOTAL_REG_COUNT];
/* Map for keeping track of reads of ID variables, which can be propagated
to uses in address expressions to produce cleaner indexing functions
with unnecessary casts stripped off, etc. */
typedef std::map<tree, tree> id_val_map;
/* Keeps track of ID values alive in registers in the currently
processed BB. */
id_val_map m_id_val_defs;
/* HSAIL-specific builtin functions not yet integrated to gcc. */
typedef std::map<std::pair<BrigOpcode16_t, BrigType16_t>, tree> builtin_map;
static builtin_map s_custom_builtins;
};
#endif

View File

@ -31,7 +31,10 @@ brig_directive_label_handler::operator () (const BrigBase *base)
std::string label_str ((const char *) (label_name->bytes),
label_name->byteCount);
m_parent.m_cf->start_new_bb ();
tree stmt = build_stmt (LABEL_EXPR, m_parent.m_cf->label (label_str));
m_parent.m_cf->append_statement (stmt);
return base->byteCount;
}

View File

@ -59,7 +59,7 @@ brig_lane_inst_handler::operator () (const BrigBase *base)
elements.push_back (zero_cst);
elements.push_back (zero_cst);
expr = pack (elements);
expr = m_parent.m_cf->pack (elements);
}
else if (inst.base.opcode == BRIG_OPCODE_ACTIVELANEPERMUTE)
{

View File

@ -63,7 +63,7 @@ brig_mem_inst_handler::build_mem_access (const BrigInstBase *brig_inst,
{
/* Add a temporary variable so there won't be multiple
reads in case of vector unpack. */
mem_ref = add_temp_var ("mem_read", mem_ref);
mem_ref = m_parent.m_cf->add_temp_var ("mem_read", mem_ref);
return build_output_assignment (*brig_inst, data, mem_ref);
}
else
@ -95,8 +95,9 @@ brig_mem_inst_handler::operator () (const BrigBase *base)
inputs.push_back (operands[1]);
inputs.push_back (align_opr);
tree builtin_call
= expand_or_call_builtin (BRIG_OPCODE_ALLOCA, BRIG_TYPE_U32,
uint32_type_node, inputs);
= m_parent.m_cf->expand_or_call_builtin (BRIG_OPCODE_ALLOCA,
BRIG_TYPE_U32,
uint32_type_node, inputs);
build_output_assignment (*brig_inst, operands[0], builtin_call);
m_parent.m_cf->m_has_allocas = true;
return base->byteCount;

View File

@ -58,13 +58,22 @@ typedef struct __attribute__((__packed__))
/* The prefix to use in the ELF section containing descriptor for
a function. */
#define PHSA_DESC_SECTION_PREFIX "phsa.desc."
#define PHSA_HOST_DEF_PTR_PREFIX "__phsa.host_def."
/* The frontend error messages are parsed by the host runtime. Known
prefix strings are used to separate the different runtime error
codes. */
#define PHSA_ERROR_PREFIX_INCOMPATIBLE_MODULE "Incompatible module: "
#define PHSA_ERROR_PREFIX_CORRUPTED_MODULE "Corrupted module: "
/* Offsets of attributes in the PHSA context structs.
Used by the optimizations enabled with -fassume-phsa. */
#define PHSA_CONTEXT_OFFS_WI_IDS 0
#define PHSA_CONTEXT_OFFS_WG_IDS (PHSA_CONTEXT_OFFS_WI_IDS + 3 * 4)
#define PHSA_CONTEXT_WG_SIZES (PHSA_CONTEXT_OFFS_WG_IDS + 3 * 4)
#define PHSA_CONTEXT_CURRENT_WG_SIZES (PHSA_CONTEXT_WG_SIZES + 3 * 4)
#endif

View File

@ -31,6 +31,11 @@ BRIG Separate Alias(d)
-dump=
BRIG Joined Alias(d)
fassume-phsa
BRIG Report Var(flag_assume_phsa) Init(1) Optimization
Assume we are finalizing for phsa and its libhsail-rt. Enables additional
phsa-specific optimizations (default).
L
BRIG Joined Separate
; Not documented

View File

@ -283,7 +283,9 @@ DEF_FUNCTION_TYPE_1 (BT_FN_UINT_INT, BT_UINT, BT_INT)
DEF_FUNCTION_TYPE_1 (BT_FN_UINT_ULONG, BT_UINT, BT_ULONG)
DEF_FUNCTION_TYPE_1 (BT_FN_UINT_LONG, BT_UINT, BT_LONG)
DEF_FUNCTION_TYPE_1 (BT_FN_UINT_PTR, BT_UINT, BT_PTR)
DEF_FUNCTION_TYPE_1 (BT_FN_UINT_CONST_PTR, BT_UINT, BT_CONST_PTR)
DEF_FUNCTION_TYPE_1 (BT_FN_ULONG_PTR, BT_ULONG, BT_PTR)
DEF_FUNCTION_TYPE_1 (BT_FN_ULONG_CONST_PTR, BT_ULONG, BT_CONST_PTR)
DEF_FUNCTION_TYPE_1 (BT_FN_ULONG_ULONG, BT_ULONG, BT_ULONG)
DEF_FUNCTION_TYPE_1 (BT_FN_ULONGLONG_ULONGLONG, BT_ULONGLONG, BT_ULONGLONG)
DEF_FUNCTION_TYPE_1 (BT_FN_INT8_FLOAT, BT_INT8, BT_FLOAT)
@ -480,6 +482,7 @@ DEF_FUNCTION_TYPE_2 (BT_FN_BOOL_SIZE_CONST_VPTR, BT_BOOL, BT_SIZE,
DEF_FUNCTION_TYPE_2 (BT_FN_BOOL_INT_BOOL, BT_BOOL, BT_INT, BT_BOOL)
DEF_FUNCTION_TYPE_2 (BT_FN_VOID_UINT_UINT, BT_VOID, BT_UINT, BT_UINT)
DEF_FUNCTION_TYPE_2 (BT_FN_UINT_UINT_PTR, BT_UINT, BT_UINT, BT_PTR)
DEF_FUNCTION_TYPE_2 (BT_FN_UINT_UINT_CONST_PTR, BT_UINT, BT_UINT, BT_CONST_PTR)
DEF_FUNCTION_TYPE_2 (BT_FN_PTR_CONST_PTR_SIZE, BT_PTR, BT_CONST_PTR, BT_SIZE)
DEF_FUNCTION_TYPE_2 (BT_FN_PTR_CONST_PTR_CONST_PTR, BT_PTR, BT_CONST_PTR, BT_CONST_PTR)
DEF_FUNCTION_TYPE_2 (BT_FN_VOID_PTRPTR_CONST_PTR, BT_VOID, BT_PTR_PTR, BT_CONST_PTR)
@ -569,6 +572,7 @@ DEF_FUNCTION_TYPE_3 (BT_FN_VOID_DOUBLE_DOUBLEPTR_DOUBLEPTR,
DEF_FUNCTION_TYPE_3 (BT_FN_VOID_LONGDOUBLE_LONGDOUBLEPTR_LONGDOUBLEPTR,
BT_VOID, BT_LONGDOUBLE, BT_LONGDOUBLE_PTR, BT_LONGDOUBLE_PTR)
DEF_FUNCTION_TYPE_3 (BT_FN_VOID_PTR_PTR_PTR, BT_VOID, BT_PTR, BT_PTR, BT_PTR)
DEF_FUNCTION_TYPE_3 (BT_FN_VOID_PTR_PTR_UINT32, BT_VOID, BT_PTR, BT_PTR, BT_UINT32)
DEF_FUNCTION_TYPE_3 (BT_FN_INT_CONST_STRING_PTR_CONST_STRING_PTR_CONST_STRING,
BT_INT, BT_CONST_STRING, BT_PTR_CONST_STRING, BT_PTR_CONST_STRING)
DEF_FUNCTION_TYPE_3 (BT_FN_INT_INT_CONST_STRING_VALIST_ARG,

View File

@ -1,4 +1,9 @@
2018-05-04 Carl Love <cel@us.ibm.com>
2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com>
* testsuite/brig.dg/test/gimple/smoke_test.hsail: Fix the test
to match the currently produced gimple.
2018-05-04 Carl Love <cel@us.ibm.com>
* gcc.target/powerpc/vsx-vector-6.h (foo): Add test for vec_max,
vec_trunc.
* gcc.target/powerpc/vsx-vector-6-le.c (dg-final): Update xvcmpeqdp,

View File

@ -41,15 +41,15 @@ prog kernel &KernelWithBarrier(kernarg_u64 %input_ptr, kernarg_u64 %output_ptr)
};
/* The kernel function itself should have a fingerprint as follows */
/* _Kernel (unsigned char * __args, void * __context, void * __group_base_addr, void * __private_base_addr) */
/* { dg-final { scan-tree-dump "_Kernel \\\(unsigned char \\\* __args, void \\\* __context, void \\\* __group_base_addr, unsigned int __group_local_offset, void \\\* __private_base_addr\\\)" "gimple"} } */
/* _Kernel (const unsigned char * restrict __args, void * restrict __context, unsigned char * restrict __group_base_addr, unsigned int __group_local_offset, unsigned char * restrict __private_base_addr) */
/* { dg-final { scan-tree-dump "_Kernel \\\(const unsigned char \\\* restrict __args, void \\\* restrict __context, unsigned char \\\* restrict __group_base_addr, unsigned int __group_local_offset, unsigned char \\\* restrict __private_base_addr\\\)" "gimple"} } */
/* ld_kernarg: mem_read.0 = MEM[(unsigned long *)__args]; */
/* { dg-final { scan-tree-dump "mem_read.\[0-9\] = MEM\\\[\\\(unsigned long \\\*\\\)__args\\\];" "gimple"} } */
/* The latter ld_global_u32 should be visible as a pointer dereference (after pointer arithmetics on a temporary var): */
/* mem_read.2 = *D.1691; */
/* { dg-final { scan-tree-dump "mem_read.\[0-9\] = \\\*\[_0-9\]+;" "gimple"} } */
/* { dg-final { scan-tree-dump "mem_read.\[0-9\]+ = \\\*\[_0-9\]+;" "gimple"} } */
/* add_u32s should generate +operators */
/* { dg-final { scan-tree-dump "s2 = s0 \\\+ s1;" "gimple"} } */
@ -71,8 +71,8 @@ prog kernel &KernelWithBarrier(kernarg_u64 %input_ptr, kernarg_u64 %output_ptr)
/* { dg-final { scan-tree-dump "if \\\(__local_z < __cur_wg_size_z\\\) goto __wi_loop_z; else goto" "gimple"} } */
/* The launcher should call __hsail_launch_wg_function in this case: */
/* Kernel (void * __context, void * __group_base_addr) */
/* { dg-final { scan-tree-dump "Kernel \\\(void \\\* __context, void \\\* __group_base_addr\\\)" "gimple"} } */
/* Kernel (void * restrict __context, unsigned char * restrict __group_base_addr) */
/* { dg-final { scan-tree-dump "Kernel \\\(void \\\* restrict __context, unsigned char \\\* restrict __group_base_addr\\\)" "gimple"} } */
/* { dg-final { scan-tree-dump "__hsail_launch_wg_function \\\(_Kernel, __context, __group_base_addr, group_local_offset.*\\\);" "gimple"} }*/
/* The kernel should have the magic metadata section injected to the ELF. */

View File

@ -1,3 +1,10 @@
2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com>
* include/internal/phsa-rt.h: Whitespace cleanup.
* include/internal/workitems.h: Store work item ID data to easily
accessible locations.
* rt/workitems.c: Same.
2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com>
* rt/workitems.c: Fix an alloca stack underflow.

View File

@ -54,7 +54,6 @@ typedef void (*gccbrigKernelFunc) (unsigned char *, void *, void *, uint32_t,
*/
typedef struct
{
/* Data set by the HSA Runtime's kernel launcher. */
hsa_kernel_dispatch_packet_t *dp;

View File

@ -45,11 +45,6 @@
typedef struct
{
/* The group id of the currently executed WG. */
size_t x;
size_t y;
size_t z;
/* This is 1 in case there are more work groups to execute.
If 0, the work-item threads should finish themselves. */
int more_wgs;
@ -89,6 +84,16 @@ typedef struct
stack frame. Initialized to point outside the private segment. */
uint32_t alloca_frame_p;
/* The group id of the currently executed WG. This is for fiber based
execution. The group ids are duplicated also to the per WI context
struct for simplified single pointer access in the GCCBRIG produced
code.
*/
uint32_t x;
uint32_t y;
uint32_t z;
} PHSAWorkGroup;
/* Data identifying a single work-item, passed to the work-item thread in case
@ -96,17 +101,42 @@ typedef struct
typedef struct
{
/* NOTE: These members STARTing here should not be moved as they are
accessed directly by code emitted by BRIG FE. */
/* The local id of the current WI. */
uint32_t x;
uint32_t y;
uint32_t z;
/* The group id of the currently executed WG. */
uint32_t group_x;
uint32_t group_y;
uint32_t group_z;
/* The local size of a complete WG. */
uint32_t wg_size_x;
uint32_t wg_size_y;
uint32_t wg_size_z;
/* The local size of the current WG. */
uint32_t cur_wg_size_x;
uint32_t cur_wg_size_y;
uint32_t cur_wg_size_z;
/* NOTE: Fixed members END here. */
PHSAKernelLaunchData *launch_data;
/* Identifies and keeps book of the currently executed WG of the WI swarm. */
volatile PHSAWorkGroup *wg;
/* The local id of the current WI. */
size_t x;
size_t y;
size_t z;
#ifdef HAVE_FIBERS
fiber_t fiber;
#endif
} PHSAWorkItem;
} __attribute__((packed)) PHSAWorkItem;
#endif

View File

@ -107,11 +107,20 @@ phsa_work_item_thread (int arg0, int arg1)
the current_work_group_* is set to point to the WG executed next. */
if (!wi->wg->more_wgs)
break;
wi->group_x = wg->x;
wi->group_y = wg->y;
wi->group_z = wg->z;
wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
#ifdef DEBUG_PHSA_RT
printf (
"Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n",
wi->x, wi->y, wi->z, wg->x, wg->y, wg->z, l_data->wg_max_x,
l_data->wg_max_y, l_data->wg_max_z);
wi->x, wi->y, wi->z, wi->group_x, wi->group_y, wi->group_z,
l_data->wg_max_x, l_data->wg_max_y, l_data->wg_max_z);
#endif
if (wi->x < __hsail_currentworkgroupsize (0, wi)
@ -180,6 +189,13 @@ phsa_work_item_thread (int arg0, int arg1)
else
wg->x++;
#endif
wi->group_x = wg->x;
wi->group_y = wg->y;
wi->group_z = wg->z;
wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
/* Reinitialize the work-group barrier according to the new WG's
size, which might not be the same as the previous ones, due
@ -233,6 +249,7 @@ phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
PHSAWorkItem *wi_threads = NULL;
PHSAWorkGroup wg;
size_t flat_wi_id = 0, x, y, z, max_x, max_y, max_z;
uint32_t group_x, group_y, group_z;
fiber_barrier_t wg_start_barrier;
fiber_barrier_t wg_completion_barrier;
fiber_barrier_t wg_sync_barrier;
@ -257,13 +274,13 @@ phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
wg.initial_group_offset = group_local_offset;
#ifdef EXECUTE_WGS_BACKWARDS
wg.x = context->wg_max_x - 1;
wg.y = context->wg_max_y - 1;
wg.z = context->wg_max_z - 1;
group_x = context->wg_max_x - 1;
group_y = context->wg_max_y - 1;
group_z = context->wg_max_z - 1;
#else
wg.x = context->wg_min_x;
wg.y = context->wg_min_y;
wg.z = context->wg_min_z;
group_x = context->wg_min_x;
group_y = context->wg_min_y;
group_z = context->wg_min_z;
#endif
fiber_barrier_init (&wg_sync_barrier, wg_size);
@ -290,6 +307,19 @@ phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
PHSAWorkItem *wi = &wi_threads[flat_wi_id];
wi->launch_data = context;
wi->wg = &wg;
wg.x = wi->group_x = group_x;
wg.y = wi->group_y = group_y;
wg.z = wi->group_z = group_z;
wi->wg_size_x = context->dp->workgroup_size_x;
wi->wg_size_y = context->dp->workgroup_size_y;
wi->wg_size_z = context->dp->workgroup_size_z;
wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
wi->x = x;
wi->y = y;
wi->z = z;
@ -467,9 +497,17 @@ phsa_execute_work_groups (PHSAKernelLaunchData *context, void *group_base_ptr,
for (wg_y = context->wg_min_y; wg_y < context->wg_max_y; ++wg_y)
for (wg_x = context->wg_min_x; wg_x < context->wg_max_x; ++wg_x)
{
wi.wg->x = wg_x;
wi.wg->y = wg_y;
wi.wg->z = wg_z;
wi.group_x = wg_x;
wi.group_y = wg_y;
wi.group_z = wg_z;
wi.wg_size_x = context->dp->workgroup_size_x;
wi.wg_size_y = context->dp->workgroup_size_y;
wi.wg_size_z = context->dp->workgroup_size_z;
wi.cur_wg_size_x = __hsail_currentworkgroupsize (0, &wi);
wi.cur_wg_size_y = __hsail_currentworkgroupsize (1, &wi);
wi.cur_wg_size_z = __hsail_currentworkgroupsize (2, &wi);
context->kernel (context->kernarg_addr, &wi, group_base_ptr,
group_local_offset, private_base_ptr);
@ -564,15 +602,15 @@ __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context)
default:
case 0:
/* Overflow semantics in the case of WG dim > grid dim. */
id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x)
id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
% dp->grid_size_x;
break;
case 1:
id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y)
id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
% dp->grid_size_y;
break;
case 2:
id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z)
id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
% dp->grid_size_z;
break;
}
@ -590,15 +628,15 @@ __hsail_workitemabsid_u64 (uint32_t dim, PHSAWorkItem *context)
default:
case 0:
/* Overflow semantics in the case of WG dim > grid dim. */
id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x)
id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
% dp->grid_size_x;
break;
case 1:
id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y)
id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
% dp->grid_size_y;
break;
case 2:
id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z)
id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
% dp->grid_size_z;
break;
}
@ -738,19 +776,19 @@ __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi)
{
default:
case 0:
if ((uint64_t) wi->wg->x < dp->grid_size_x / dp->workgroup_size_x)
if ((uint64_t) wi->group_x < dp->grid_size_x / dp->workgroup_size_x)
wg_size = dp->workgroup_size_x; /* Full WG. */
else
wg_size = dp->grid_size_x % dp->workgroup_size_x; /* Partial WG. */
break;
case 1:
if ((uint64_t) wi->wg->y < dp->grid_size_y / dp->workgroup_size_y)
if ((uint64_t) wi->group_y < dp->grid_size_y / dp->workgroup_size_y)
wg_size = dp->workgroup_size_y; /* Full WG. */
else
wg_size = dp->grid_size_y % dp->workgroup_size_y; /* Partial WG. */
break;
case 2:
if ((uint64_t) wi->wg->z < dp->grid_size_z / dp->workgroup_size_z)
if ((uint64_t) wi->group_z < dp->grid_size_z / dp->workgroup_size_z)
wg_size = dp->workgroup_size_z; /* Full WG. */
else
wg_size = dp->grid_size_z % dp->workgroup_size_z; /* Partial WG. */
@ -798,11 +836,11 @@ __hsail_workgroupid (uint32_t dim, PHSAWorkItem *wi)
{
default:
case 0:
return wi->wg->x;
return wi->group_x;
case 1:
return wi->wg->y;
return wi->group_y;
case 2:
return wi->wg->z;
return wi->group_z;
}
}