From 02889d23ee3b02854dff203dd87b9a25e30b61b4 Mon Sep 17 00:00:00 2001 From: Chung-Lin Tang Date: Thu, 9 Feb 2017 13:46:20 +0000 Subject: [PATCH] gimplify.c (gimplify_scan_omp_clauses): No special handling for OMP_CLAUSE_TILE. 2017-02-09 Nathan Sidwell Cesar Philippidis Joseph Myers Chung-Lin Tang gcc/ * gimplify.c (gimplify_scan_omp_clauses): No special handling for OMP_CLAUSE_TILE. (gimplify_adjust_omp_clauses): Don't delete TILE. (gimplify_omp_for): Deal with TILE. * internal-fn.c (expand_GOACC_TILE): New function. * internal-fn.def (GOACC_DIM_POS): Comment may be overly conservative. (GOACC_TILE): New. * omp-expand.c (struct oacc_collapse): Add tile and outer fields. (expand_oacc_collapse_init): Add LOC paramter. Initialize tile element fields. (expand_oacc_collapse_vars): Add INNER parm, adjust for tiling, avoid DIV for outermost collapse var. (expand_oacc_for): Insert tile element loop as needed. Adjust. Remove out of date comments, fix whitespace. * omp-general.c (omp_extract_for_data): Deal with tiling. * omp-general.h (enum oacc_loop_flags): Add OLF_TILE flag, adjust OLF_DIM_BASE value. (struct omp_for_data): Add tiling field. * omp-low.c (scan_sharing_clauses): Allow OMP_CLAUSE_TILE. (lower_oacc_head_mark): Add OLF_TILE as appropriate. Ensure 2 levels for auto loops. Remove default auto determining, moved to oacc_loop_fixed_partitions. * omp-offload.c (struct oacc_loop): Change 'ifns' to vector of call stmts, add e_mask field. (oacc_dim_call): New function, abstracted out from oacc_thread_numbers. (oacc_thread_numbers): Use oacc_dim_call. (oacc_xform_tile): New. (new_oacc_loop_raw): Initialize e_mask, adjust for ifns vector. (finish_oacc_loop): Adjust for ifns vector. (oacc_loop_discover_walk): Append loop abstraction sites to list, add case for GOACC_TILE fns. (oacc_loop_xform_loop): Delete. (oacc_loop_process): Iterate over call list directly, and add handling for GOACC_TILE fns. (oacc_loop_fixed_partitions): Determine default auto, deal with TILE, dump partitioning. (oacc_loop_auto_partitions): Add outer_assign parm. Assign all but vector partitioning to outer loops. Assign 2 partitions to loops when available. Add TILE handling. (oacc_loop_partition): Adjust oacc_loop_auto_partitions call. (execite_oacc_device_lower): Process GOACC_TILE fns, ignore unknown specs. * tree-nested.c (convert_nonlocal_omp_clauses): Allow OMP_CLAUSE_TILE. * tree.c (omp_clause_num_ops): Adjust TILE ops. * tree.h (OMP_CLAUSE_TILE_ITERVAR, OMP_CLAUSE_TILE_COUNT): New. gcc/c/ * c-parser.c (c_parser_omp_clause_collapse): Disallow tile. (c_parser_oacc_clause_tile): Disallow collapse. Fix parsing and semantic checking. * c-parser.c (c_parser_omp_for_loop): Accept tiling constructs. gcc/cp/ * parser.c (cp_parser_oacc_clause_tile): Disallow collapse. Fix parsing. Parse constant expression. Remove semantic checking. (cp_parser_omp_clause_collapse): Disallow tile. (cp_parser_omp_for_loop): Deal with tile clause. Don't emit a parse error about missing for after already emitting one. Use more conventional for idiom for unbounded loop. * pt.c (tsubst_omp_clauses): Handle OMP_CLAUSE_TILE. * semantics.c (finish_omp_clauses): Correct TILE semantic check. (finish_omp_for): Deal with tile clause. gcc/fortran/ * openmp.c (resolve_omp_clauses): Error on directives containing both tile and collapse clauses. (resolve_oacc_loop_blocks): Represent '*' tile arguments as zero. * trans-openmp.c (gfc_trans_omp_do): Lower tiled loops like collapsed loops. gcc/testsuite/ * c-c++-common/goacc/combined-directives.c: Remove xfail. * c-c++-common/goacc/loop-auto-1.c: Adjust and add additional case. * c-c++-common/goacc/loop-auto-2.c: New. * c-c++-common/goacc/tile.c: Include stdbool, fix expected errors. * c-c++-common/goacc/tile-2.c: New. * g++.dg/goacc/template.C: Test tile subst. Adjust erroneous uses. * g++.dg/goacc/tile-1.C: New, check tile subst. * gcc.dg/goacc/loop-processing-1.c: Adjust dg-final pattern. * gfortran.dg/goacc/combined-directives.f90: Remove xfail. * gfortran.dg/goacc/tile-1.f90: New test. * gfortran.dg/goacc/tile-2.f90: New test. * gfortran.dg/goacc/tile-lowering.f95: New test. libgomp/ * testsuite/libgomp.oacc-c-c++-common/tile-1.c: New. * testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Adjust and add additional case. * testsuite/libgomp.oacc-c-c++-common/vprop.c: XFAIL under "openacc_nvidia_accel_selected". * libgomp.oacc-fortran/nested-function-1.f90 (test2): Add num_workers(8) clause. From-SVN: r245300 --- gcc/ChangeLog | 48 +++ gcc/c/ChangeLog | 8 + gcc/c/c-parser.c | 40 +- gcc/cp/ChangeLog | 13 + gcc/cp/parser.c | 39 +- gcc/cp/pt.c | 14 +- gcc/cp/semantics.c | 33 +- gcc/fortran/ChangeLog | 9 + gcc/fortran/openmp.c | 6 +- gcc/fortran/trans-openmp.c | 11 + gcc/gimplify.c | 24 +- gcc/internal-fn.c | 8 + gcc/internal-fn.def | 6 +- gcc/omp-expand.c | 224 ++++++++++-- gcc/omp-general.c | 30 +- gcc/omp-general.h | 8 +- gcc/omp-low.c | 28 +- gcc/omp-offload.c | 346 ++++++++++++------ gcc/testsuite/ChangeLog | 18 + .../c-c++-common/goacc/combined-directives.c | 3 +- .../c-c++-common/goacc/loop-auto-1.c | 19 +- .../c-c++-common/goacc/loop-auto-2.c | 107 ++++++ gcc/testsuite/c-c++-common/goacc/tile-2.c | 21 ++ gcc/testsuite/c-c++-common/goacc/tile.c | 99 ++--- gcc/testsuite/g++.dg/goacc/template.C | 6 +- gcc/testsuite/g++.dg/goacc/tile-1.C | 16 + .../gcc.dg/goacc/loop-processing-1.c | 2 +- .../gfortran.dg/goacc/combined-directives.f90 | 3 +- gcc/testsuite/gfortran.dg/goacc/tile-1.f90 | 339 +++++++++++++++++ gcc/testsuite/gfortran.dg/goacc/tile-2.f90 | 21 ++ .../gfortran.dg/goacc/tile-lowering.f95 | 292 +++++++++++++++ gcc/tree-nested.c | 6 +- gcc/tree.c | 2 +- gcc/tree.h | 4 + libgomp/ChangeLog | 11 + .../libgomp.oacc-c-c++-common/loop-auto-1.c | 20 +- .../libgomp.oacc-c-c++-common/tile-1.c | 281 ++++++++++++++ .../libgomp.oacc-c-c++-common/vprop.c | 3 + .../nested-function-1.f90 | 2 +- 39 files changed, 1872 insertions(+), 298 deletions(-) create mode 100644 gcc/testsuite/c-c++-common/goacc/loop-auto-2.c create mode 100644 gcc/testsuite/c-c++-common/goacc/tile-2.c create mode 100644 gcc/testsuite/g++.dg/goacc/tile-1.C create mode 100644 gcc/testsuite/gfortran.dg/goacc/tile-1.f90 create mode 100644 gcc/testsuite/gfortran.dg/goacc/tile-2.f90 create mode 100644 gcc/testsuite/gfortran.dg/goacc/tile-lowering.f95 create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/tile-1.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 92737111fb8..7228afcbc6f 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,51 @@ +2017-02-09 Nathan Sidwell + Chung-Lin Tang + + * gimplify.c (gimplify_scan_omp_clauses): No special handling for + OMP_CLAUSE_TILE. + (gimplify_adjust_omp_clauses): Don't delete TILE. + (gimplify_omp_for): Deal with TILE. + * internal-fn.c (expand_GOACC_TILE): New function. + * internal-fn.def (GOACC_DIM_POS): Comment may be overly conservative. + (GOACC_TILE): New. + * omp-expand.c (struct oacc_collapse): Add tile and outer fields. + (expand_oacc_collapse_init): Add LOC paramter. Initialize tile + element fields. + (expand_oacc_collapse_vars): Add INNER parm, adjust for tiling, + avoid DIV for outermost collapse var. + (expand_oacc_for): Insert tile element loop as needed. Adjust. + Remove out of date comments, fix whitespace. + * omp-general.c (omp_extract_for_data): Deal with tiling. + * omp-general.h (enum oacc_loop_flags): Add OLF_TILE flag, + adjust OLF_DIM_BASE value. + (struct omp_for_data): Add tiling field. + * omp-low.c (scan_sharing_clauses): Allow OMP_CLAUSE_TILE. + (lower_oacc_head_mark): Add OLF_TILE as appropriate. Ensure 2 levels + for auto loops. Remove default auto determining, moved to + oacc_loop_fixed_partitions. + * omp-offload.c (struct oacc_loop): Change 'ifns' to vector of call + stmts, add e_mask field. + (oacc_dim_call): New function, abstracted out from oacc_thread_numbers. + (oacc_thread_numbers): Use oacc_dim_call. + (oacc_xform_tile): New. + (new_oacc_loop_raw): Initialize e_mask, adjust for ifns vector. + (finish_oacc_loop): Adjust for ifns vector. + (oacc_loop_discover_walk): Append loop abstraction sites to list, + add case for GOACC_TILE fns. + (oacc_loop_xform_loop): Delete. + (oacc_loop_process): Iterate over call list directly, and add + handling for GOACC_TILE fns. + (oacc_loop_fixed_partitions): Determine default auto, deal with TILE, + dump partitioning. + (oacc_loop_auto_partitions): Add outer_assign parm. Assign all but + vector partitioning to outer loops. Assign 2 partitions to loops + when available. Add TILE handling. + (oacc_loop_partition): Adjust oacc_loop_auto_partitions call. + (execite_oacc_device_lower): Process GOACC_TILE fns, ignore unknown specs. + * tree-nested.c (convert_nonlocal_omp_clauses): Allow OMP_CLAUSE_TILE. + * tree.c (omp_clause_num_ops): Adjust TILE ops. + * tree.h (OMP_CLAUSE_TILE_ITERVAR, OMP_CLAUSE_TILE_COUNT): New. + 2017-02-09 Gerald Pfeifer * configure.ac (ACX_BUGURL): Update. diff --git a/gcc/c/ChangeLog b/gcc/c/ChangeLog index 1ab194a6310..c05721df7ec 100644 --- a/gcc/c/ChangeLog +++ b/gcc/c/ChangeLog @@ -1,3 +1,11 @@ +2016-02-09 Nathan Sidwell + Chung-Lin Tang + + * c-parser.c (c_parser_omp_clause_collapse): Disallow tile. + (c_parser_oacc_clause_tile): Disallow collapse. Fix parsing and + semantic checking. + * c-parser.c (c_parser_omp_for_loop): Accept tiling constructs. + 2017-02-07 Richard Biener * gimple-parser.c (c_parser_gimple_expr_list): Simplify. diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c index 5c152ab28b0..6e83728b577 100644 --- a/gcc/c/c-parser.c +++ b/gcc/c/c-parser.c @@ -11023,6 +11023,7 @@ c_parser_omp_clause_collapse (c_parser *parser, tree list) location_t loc; check_no_duplicate_clause (list, OMP_CLAUSE_COLLAPSE, "collapse"); + check_no_duplicate_clause (list, OMP_CLAUSE_TILE, "tile"); loc = c_parser_peek_token (parser)->location; if (c_parser_require (parser, CPP_OPEN_PAREN, "expected %<(%>")) @@ -11933,10 +11934,11 @@ static tree c_parser_oacc_clause_tile (c_parser *parser, tree list) { tree c, expr = error_mark_node; - location_t loc, expr_loc; + location_t loc; tree tile = NULL_TREE; check_no_duplicate_clause (list, OMP_CLAUSE_TILE, "tile"); + check_no_duplicate_clause (list, OMP_CLAUSE_COLLAPSE, "collapse"); loc = c_parser_peek_token (parser)->location; if (!c_parser_require (parser, CPP_OPEN_PAREN, "expected %<(%>")) @@ -11944,16 +11946,19 @@ c_parser_oacc_clause_tile (c_parser *parser, tree list) do { + if (tile && !c_parser_require (parser, CPP_COMMA, "expected %<,%>")) + return list; + if (c_parser_next_token_is (parser, CPP_MULT) && (c_parser_peek_2nd_token (parser)->type == CPP_COMMA || c_parser_peek_2nd_token (parser)->type == CPP_CLOSE_PAREN)) { c_parser_consume_token (parser); - expr = integer_minus_one_node; + expr = integer_zero_node; } else { - expr_loc = c_parser_peek_token (parser)->location; + location_t expr_loc = c_parser_peek_token (parser)->location; c_expr cexpr = c_parser_expr_no_commas (parser, NULL); cexpr = convert_lvalue_to_rvalue (expr_loc, cexpr, false, true); expr = cexpr.value; @@ -11965,28 +11970,19 @@ c_parser_oacc_clause_tile (c_parser *parser, tree list) return list; } - if (!INTEGRAL_TYPE_P (TREE_TYPE (expr))) - { - c_parser_error (parser, "% value must be integral"); - return list; - } - expr = c_fully_fold (expr, false, NULL); - /* Attempt to statically determine when expr isn't positive. */ - c = fold_build2_loc (expr_loc, LE_EXPR, boolean_type_node, expr, - build_int_cst (TREE_TYPE (expr), 0)); - protected_set_expr_location (c, expr_loc); - if (c == boolean_true_node) + if (!INTEGRAL_TYPE_P (TREE_TYPE (expr)) + || !tree_fits_shwi_p (expr) + || tree_to_shwi (expr) <= 0) { - warning_at (expr_loc, 0,"% value must be positive"); - expr = integer_one_node; + error_at (expr_loc, "% argument needs positive" + " integral constant"); + expr = integer_zero_node; } } tile = tree_cons (NULL_TREE, expr, tile); - if (c_parser_next_token_is (parser, CPP_COMMA)) - c_parser_consume_token (parser); } while (c_parser_next_token_is_not (parser, CPP_CLOSE_PAREN)); @@ -14910,11 +14906,17 @@ c_parser_omp_for_loop (location_t loc, c_parser *parser, enum tree_code code, bool fail = false, open_brace_parsed = false; int i, collapse = 1, ordered = 0, count, nbraces = 0; location_t for_loc; + bool tiling = false; vec *for_block = make_tree_vector (); for (cl = clauses; cl; cl = OMP_CLAUSE_CHAIN (cl)) if (OMP_CLAUSE_CODE (cl) == OMP_CLAUSE_COLLAPSE) collapse = tree_to_shwi (OMP_CLAUSE_COLLAPSE_EXPR (cl)); + else if (OMP_CLAUSE_CODE (cl) == OMP_CLAUSE_TILE) + { + tiling = true; + collapse = list_length (OMP_CLAUSE_TILE_LIST (cl)); + } else if (OMP_CLAUSE_CODE (cl) == OMP_CLAUSE_ORDERED && OMP_CLAUSE_ORDERED_EXPR (cl)) { @@ -14944,7 +14946,7 @@ c_parser_omp_for_loop (location_t loc, c_parser *parser, enum tree_code code, pc = &OMP_CLAUSE_CHAIN (*pc); } - gcc_assert (collapse >= 1 && ordered >= 0); + gcc_assert (tiling || (collapse >= 1 && ordered >= 0)); count = ordered ? ordered : collapse; declv = make_tree_vec (count); diff --git a/gcc/cp/ChangeLog b/gcc/cp/ChangeLog index dc87561793c..291dcddcde1 100644 --- a/gcc/cp/ChangeLog +++ b/gcc/cp/ChangeLog @@ -1,3 +1,16 @@ +2016-02-09 Nathan Sidwell + Chung-Lin Tang + + * parser.c (cp_parser_oacc_clause_tile): Disallow collapse. Fix + parsing. Parse constant expression. Remove semantic checking. + (cp_parser_omp_clause_collapse): Disallow tile. + (cp_parser_omp_for_loop): Deal with tile clause. Don't emit a parse + error about missing for after already emitting one. Use more + conventional for idiom for unbounded loop. + * pt.c (tsubst_omp_clauses): Handle OMP_CLAUSE_TILE. + * semantics.c (finish_omp_clauses): Correct TILE semantic check. + (finish_omp_for): Deal with tile clause. + 2017-02-07 Nathan Sidwell * method.c (synthesized_method_base_walk): New. Broken out of ... diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c index d2df777f344..41b08e1a7a3 100644 --- a/gcc/cp/parser.c +++ b/gcc/cp/parser.c @@ -31274,30 +31274,33 @@ cp_parser_oacc_clause_tile (cp_parser *parser, location_t clause_loc, tree list) tree c, expr = error_mark_node; tree tile = NULL_TREE; + /* Collapse and tile are mutually exclusive. (The spec doesn't say + so, but the spec authors never considered such a case and have + differing opinions on what it might mean, including 'not + allowed'.) */ check_no_duplicate_clause (list, OMP_CLAUSE_TILE, "tile", clause_loc); + check_no_duplicate_clause (list, OMP_CLAUSE_COLLAPSE, "collapse", + clause_loc); if (!cp_parser_require (parser, CPP_OPEN_PAREN, RT_OPEN_PAREN)) return list; do { + if (tile && !cp_parser_require (parser, CPP_COMMA, RT_COMMA)) + return list; + if (cp_lexer_next_token_is (parser->lexer, CPP_MULT) && (cp_lexer_nth_token_is (parser->lexer, 2, CPP_COMMA) || cp_lexer_nth_token_is (parser->lexer, 2, CPP_CLOSE_PAREN))) { cp_lexer_consume_token (parser->lexer); - expr = integer_minus_one_node; + expr = integer_zero_node; } else - expr = cp_parser_assignment_expression (parser, NULL, false, false); - - if (expr == error_mark_node) - return list; + expr = cp_parser_constant_expression (parser); tile = tree_cons (NULL_TREE, expr, tile); - - if (cp_lexer_next_token_is (parser->lexer, CPP_COMMA)) - cp_lexer_consume_token (parser->lexer); } while (cp_lexer_next_token_is_not (parser->lexer, CPP_CLOSE_PAREN)); @@ -31410,6 +31413,7 @@ cp_parser_omp_clause_collapse (cp_parser *parser, tree list, location_t location } check_no_duplicate_clause (list, OMP_CLAUSE_COLLAPSE, "collapse", location); + check_no_duplicate_clause (list, OMP_CLAUSE_TILE, "tile", location); c = build_omp_clause (loc, OMP_CLAUSE_COLLAPSE); OMP_CLAUSE_CHAIN (c) = list; OMP_CLAUSE_COLLAPSE_EXPR (c) = num; @@ -34416,10 +34420,16 @@ cp_parser_omp_for_loop (cp_parser *parser, enum tree_code code, tree clauses, int i, collapse = 1, ordered = 0, count, nbraces = 0; vec *for_block = make_tree_vector (); auto_vec orig_inits; + bool tiling = false; for (cl = clauses; cl; cl = OMP_CLAUSE_CHAIN (cl)) if (OMP_CLAUSE_CODE (cl) == OMP_CLAUSE_COLLAPSE) collapse = tree_to_shwi (OMP_CLAUSE_COLLAPSE_EXPR (cl)); + else if (OMP_CLAUSE_CODE (cl) == OMP_CLAUSE_TILE) + { + tiling = true; + collapse = list_length (OMP_CLAUSE_TILE_LIST (cl)); + } else if (OMP_CLAUSE_CODE (cl) == OMP_CLAUSE_ORDERED && OMP_CLAUSE_ORDERED_EXPR (cl)) { @@ -34449,7 +34459,7 @@ cp_parser_omp_for_loop (cp_parser *parser, enum tree_code code, tree clauses, pc = &OMP_CLAUSE_CHAIN (*pc); } - gcc_assert (collapse >= 1 && ordered >= 0); + gcc_assert (tiling || (collapse >= 1 && ordered >= 0)); count = ordered ? ordered : collapse; declv = make_tree_vec (count); @@ -34468,13 +34478,15 @@ cp_parser_omp_for_loop (cp_parser *parser, enum tree_code code, tree clauses, if (code != CILK_FOR && !cp_lexer_next_token_is_keyword (parser->lexer, RID_FOR)) { - cp_parser_error (parser, "for statement expected"); + if (!collapse_err) + cp_parser_error (parser, "for statement expected"); return NULL; } if (code == CILK_FOR && !cp_lexer_next_token_is_keyword (parser->lexer, RID_CILK_FOR)) { - cp_parser_error (parser, "_Cilk_for statement expected"); + if (!collapse_err) + cp_parser_error (parser, "_Cilk_for statement expected"); return NULL; } loc = cp_lexer_consume_token (parser->lexer)->location; @@ -34634,7 +34646,7 @@ cp_parser_omp_for_loop (cp_parser *parser, enum tree_code code, tree clauses, nested. Hopefully the final version clarifies this. For now handle (multiple) {'s and empty statements. */ cp_parser_parse_tentatively (parser); - do + for (;;) { if (cp_lexer_next_token_is_keyword (parser->lexer, RID_FOR)) break; @@ -34649,14 +34661,13 @@ cp_parser_omp_for_loop (cp_parser *parser, enum tree_code code, tree clauses, else { loc = cp_lexer_peek_token (parser->lexer)->location; - error_at (loc, "not enough collapsed for loops"); + error_at (loc, "not enough for loops to collapse"); collapse_err = true; cp_parser_abort_tentative_parse (parser); declv = NULL_TREE; break; } } - while (1); if (declv) { diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c index 6072432382d..8863c281ad7 100644 --- a/gcc/cp/pt.c +++ b/gcc/cp/pt.c @@ -15078,6 +15078,7 @@ tsubst_omp_clauses (tree clauses, enum c_omp_region_type ort, = tsubst_omp_clause_decl (OMP_CLAUSE_DECL (oc), args, complain, in_decl); break; + case OMP_CLAUSE_TILE: case OMP_CLAUSE_IF: case OMP_CLAUSE_NUM_THREADS: case OMP_CLAUSE_SCHEDULE: @@ -15172,19 +15173,6 @@ tsubst_omp_clauses (tree clauses, enum c_omp_region_type ort, case OMP_CLAUSE_AUTO: case OMP_CLAUSE_SEQ: break; - case OMP_CLAUSE_TILE: - { - tree lnc, loc; - for (lnc = OMP_CLAUSE_TILE_LIST (nc), - loc = OMP_CLAUSE_TILE_LIST (oc); - loc; - loc = TREE_CHAIN (loc), lnc = TREE_CHAIN (lnc)) - { - TREE_VALUE (lnc) = tsubst_expr (TREE_VALUE (loc), args, - complain, in_decl, false); - } - } - break; default: gcc_unreachable (); } diff --git a/gcc/cp/semantics.c b/gcc/cp/semantics.c index e4f2a6a8807..e9fc4aa2797 100644 --- a/gcc/cp/semantics.c +++ b/gcc/cp/semantics.c @@ -7099,7 +7099,8 @@ finish_omp_clauses (tree clauses, enum c_omp_region_type ort) else if (!type_dependent_expression_p (t) && !INTEGRAL_TYPE_P (TREE_TYPE (t))) { - error ("% value must be integral"); + error_at (OMP_CLAUSE_LOCATION (c), + "% argument needs integral type"); remove = true; } else @@ -7107,14 +7108,16 @@ finish_omp_clauses (tree clauses, enum c_omp_region_type ort) t = mark_rvalue_use (t); if (!processing_template_decl) { + /* Zero is used to indicate '*', we permit you + to get there via an ICE of value zero. */ t = maybe_constant_value (t); - if (TREE_CODE (t) == INTEGER_CST - && tree_int_cst_sgn (t) != 1 - && t != integer_minus_one_node) + if (!tree_fits_shwi_p (t) + || tree_to_shwi (t) < 0) { - warning_at (OMP_CLAUSE_LOCATION (c), 0, - "% value must be positive"); - t = integer_one_node; + error_at (OMP_CLAUSE_LOCATION (c), + "% argument needs positive " + "integral constant"); + remove = true; } } t = fold_build_cleanup_point_expr (TREE_TYPE (t), t); @@ -8013,11 +8016,19 @@ finish_omp_for (location_t locus, enum tree_code code, tree declv, gcc_assert (TREE_VEC_LENGTH (declv) == TREE_VEC_LENGTH (incrv)); if (TREE_VEC_LENGTH (declv) > 1) { - tree c = omp_find_clause (clauses, OMP_CLAUSE_COLLAPSE); + tree c; + + c = omp_find_clause (clauses, OMP_CLAUSE_TILE); if (c) - collapse = tree_to_shwi (OMP_CLAUSE_COLLAPSE_EXPR (c)); - if (collapse != TREE_VEC_LENGTH (declv)) - ordered = TREE_VEC_LENGTH (declv); + collapse = list_length (OMP_CLAUSE_TILE_LIST (c)); + else + { + c = omp_find_clause (clauses, OMP_CLAUSE_COLLAPSE); + if (c) + collapse = tree_to_shwi (OMP_CLAUSE_COLLAPSE_EXPR (c)); + if (collapse != TREE_VEC_LENGTH (declv)) + ordered = TREE_VEC_LENGTH (declv); + } } for (i = 0; i < TREE_VEC_LENGTH (declv); i++) { diff --git a/gcc/fortran/ChangeLog b/gcc/fortran/ChangeLog index 0b3279667dd..3488f01e7cd 100644 --- a/gcc/fortran/ChangeLog +++ b/gcc/fortran/ChangeLog @@ -1,3 +1,12 @@ +2017-02-09 Cesar Philippidis + Joseph Myers + + * openmp.c (resolve_omp_clauses): Error on directives + containing both tile and collapse clauses. + (resolve_oacc_loop_blocks): Represent '*' tile arguments as zero. + * trans-openmp.c (gfc_trans_omp_do): Lower tiled loops like + collapsed loops. + 2017-02-07 Steven G. Kargl * trans-types.c (gfc_get_int_kind_from_width_isofortranen): Choose diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c index d19ee948330..3ca23493251 100644 --- a/gcc/fortran/openmp.c +++ b/gcc/fortran/openmp.c @@ -4754,6 +4754,8 @@ resolve_omp_clauses (gfc_code *code, gfc_omp_clauses *omp_clauses, if (omp_clauses->wait_list) for (el = omp_clauses->wait_list; el; el = el->next) resolve_scalar_int_expr (el->expr, "WAIT"); + if (omp_clauses->collapse && omp_clauses->tile_list) + gfc_error ("Incompatible use of TILE and COLLAPSE at %L", &code->loc); if (omp_clauses->depend_source && code->op != EXEC_OMP_ORDERED) gfc_error ("SOURCE dependence type only allowed " "on ORDERED directive at %L", &code->loc); @@ -5900,11 +5902,11 @@ resolve_oacc_loop_blocks (gfc_code *code) if (el->expr == NULL) { /* NULL expressions are used to represent '*' arguments. - Convert those to a -1 expressions. */ + Convert those to a 0 expressions. */ el->expr = gfc_get_constant_expr (BT_INTEGER, gfc_default_integer_kind, &code->loc); - mpz_set_si (el->expr->value.integer, -1); + mpz_set_si (el->expr->value.integer, 0); } else { diff --git a/gcc/fortran/trans-openmp.c b/gcc/fortran/trans-openmp.c index 4f525feb502..662036f514d 100644 --- a/gcc/fortran/trans-openmp.c +++ b/gcc/fortran/trans-openmp.c @@ -3488,6 +3488,17 @@ gfc_trans_omp_do (gfc_code *code, gfc_exec_op op, stmtblock_t *pblock, dovar_init *di; unsigned ix; vec *saved_doacross_steps = doacross_steps; + gfc_expr_list *tile = do_clauses ? do_clauses->tile_list : clauses->tile_list; + + /* Both collapsed and tiled loops are lowered the same way. In + OpenACC, those clauses are not compatible, so prioritize the tile + clause, if present. */ + if (tile) + { + collapse = 0; + for (gfc_expr_list *el = tile; el; el = el->next) + collapse++; + } doacross_steps = NULL; if (clauses->orderedc) diff --git a/gcc/gimplify.c b/gcc/gimplify.c index feb5fa0f9e9..dd73fc258e5 100644 --- a/gcc/gimplify.c +++ b/gcc/gimplify.c @@ -8340,20 +8340,11 @@ gimplify_scan_omp_clauses (tree *list_p, gimple_seq *pre_p, remove = true; break; - case OMP_CLAUSE_TILE: - for (tree list = OMP_CLAUSE_TILE_LIST (c); !remove && list; - list = TREE_CHAIN (list)) - { - if (gimplify_expr (&TREE_VALUE (list), pre_p, NULL, - is_gimple_val, fb_rvalue) == GS_ERROR) - remove = true; - } - break; - case OMP_CLAUSE_NOWAIT: case OMP_CLAUSE_ORDERED: case OMP_CLAUSE_UNTIED: case OMP_CLAUSE_COLLAPSE: + case OMP_CLAUSE_TILE: case OMP_CLAUSE_AUTO: case OMP_CLAUSE_SEQ: case OMP_CLAUSE_INDEPENDENT: @@ -9122,13 +9113,7 @@ gimplify_adjust_omp_clauses (gimple_seq *pre_p, gimple_seq body, tree *list_p, case OMP_CLAUSE_VECTOR: case OMP_CLAUSE_AUTO: case OMP_CLAUSE_SEQ: - break; - case OMP_CLAUSE_TILE: - /* We're not yet making use of the information provided by OpenACC - tile clauses. Discard these here, to simplify later middle end - processing. */ - remove = true; break; default: @@ -9583,10 +9568,13 @@ gimplify_omp_for (tree *expr_p, gimple_seq *pre_p) (OMP_FOR_INIT (for_stmt)) * 2); } - int collapse = 1; + int collapse = 1, tile = 0; c = omp_find_clause (OMP_FOR_CLAUSES (for_stmt), OMP_CLAUSE_COLLAPSE); if (c) collapse = tree_to_shwi (OMP_CLAUSE_COLLAPSE_EXPR (c)); + c = omp_find_clause (OMP_FOR_CLAUSES (for_stmt), OMP_CLAUSE_TILE); + if (c) + tile = list_length (OMP_CLAUSE_TILE_LIST (c)); for (i = 0; i < TREE_VEC_LENGTH (OMP_FOR_INIT (for_stmt)); i++) { t = TREE_VEC_ELT (OMP_FOR_INIT (for_stmt), i); @@ -10000,7 +9988,7 @@ gimplify_omp_for (tree *expr_p, gimple_seq *pre_p) OMP_CLAUSE_LINEAR_STEP (c2) = OMP_CLAUSE_LINEAR_STEP (c); } - if ((var != decl || collapse > 1) && orig_for_stmt == for_stmt) + if ((var != decl || collapse > 1 || tile) && orig_for_stmt == for_stmt) { for (c = OMP_FOR_CLAUSES (for_stmt); c ; c = OMP_CLAUSE_CHAIN (c)) if (((OMP_CLAUSE_CODE (c) == OMP_CLAUSE_LASTPRIVATE diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index 0d61375462d..1ccc803631a 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -2479,6 +2479,14 @@ expand_GOACC_REDUCTION (internal_fn, gcall *) gcc_unreachable (); } +/* This is expanded by oacc_device_lower pass. */ + +static void +expand_GOACC_TILE (internal_fn, gcall *) +{ + gcc_unreachable (); +} + /* Set errno to EDOM. */ static void diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index fd25a952299..9f682322c87 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -187,7 +187,7 @@ DEF_INTERNAL_FN (PHI, 0, NULL) dimension. DIM_POS is pure (and not const) so that it isn't thought to clobber memory and can be gcse'd within a single parallel region, but not across FORK/JOIN boundaries. They take a - single INTEGER_CST argument. */ + single INTEGER_CST argument. This might be overly conservative. */ DEF_INTERNAL_FN (GOACC_DIM_SIZE, ECF_CONST | ECF_NOTHROW | ECF_LEAF, ".") DEF_INTERNAL_FN (GOACC_DIM_POS, ECF_PURE | ECF_NOTHROW | ECF_LEAF, ".") @@ -197,6 +197,10 @@ DEF_INTERNAL_FN (GOACC_LOOP, ECF_PURE | ECF_NOTHROW, NULL) /* OpenACC reduction abstraction. See internal-fn.h for usage. */ DEF_INTERNAL_FN (GOACC_REDUCTION, ECF_NOTHROW | ECF_LEAF, NULL) +/* Openacc tile abstraction. Describes the spans of the element loop. + GOACC_TILE (num-loops, loop-no, tile-arg, tile-mask, element-mask). */ +DEF_INTERNAL_FN (GOACC_TILE, ECF_NOTHROW | ECF_LEAF, NULL) + /* Set errno to EDOM, if GCC knows how to do that directly for the current target. */ DEF_INTERNAL_FN (SET_EDOM, ECF_LEAF | ECF_NOTHROW, NULL) diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c index 736573611a3..55e54e4dbf8 100644 --- a/gcc/omp-expand.c +++ b/gcc/omp-expand.c @@ -1409,7 +1409,9 @@ struct oacc_collapse { tree base; /* Base value. */ tree iters; /* Number of steps. */ - tree step; /* step size. */ + tree step; /* Step size. */ + tree tile; /* Tile increment (if tiled). */ + tree outer; /* Tile iterator var. */ }; /* Helper for expand_oacc_for. Determine collapsed loop information. @@ -1419,15 +1421,20 @@ struct oacc_collapse static tree expand_oacc_collapse_init (const struct omp_for_data *fd, gimple_stmt_iterator *gsi, - oacc_collapse *counts, tree bound_type) + oacc_collapse *counts, tree bound_type, + location_t loc) { + tree tiling = fd->tiling; tree total = build_int_cst (bound_type, 1); int ix; gcc_assert (integer_onep (fd->loop.step)); gcc_assert (integer_zerop (fd->loop.n1)); - for (ix = 0; ix != fd->collapse; ix++) + /* When tiling, the first operand of the tile clause applies to the + innermost loop, and we work outwards from there. Seems + backwards, but whatever. */ + for (ix = fd->collapse; ix--;) { const omp_for_data_loop *loop = &fd->loops[ix]; @@ -1442,6 +1449,30 @@ expand_oacc_collapse_init (const struct omp_for_data *fd, if (POINTER_TYPE_P (diff_type) || TYPE_UNSIGNED (diff_type)) diff_type = signed_type_for (diff_type); + if (tiling) + { + tree num = build_int_cst (integer_type_node, fd->collapse); + tree loop_no = build_int_cst (integer_type_node, ix); + tree tile = TREE_VALUE (tiling); + gcall *call + = gimple_build_call_internal (IFN_GOACC_TILE, 5, num, loop_no, tile, + /* gwv-outer=*/integer_zero_node, + /* gwv-inner=*/integer_zero_node); + + counts[ix].outer = create_tmp_var (iter_type, ".outer"); + counts[ix].tile = create_tmp_var (diff_type, ".tile"); + gimple_call_set_lhs (call, counts[ix].tile); + gimple_set_location (call, loc); + gsi_insert_before (gsi, call, GSI_SAME_STMT); + + tiling = TREE_CHAIN (tiling); + } + else + { + counts[ix].tile = NULL; + counts[ix].outer = loop->v; + } + tree b = loop->n1; tree e = loop->n2; tree s = loop->step; @@ -1495,13 +1526,14 @@ expand_oacc_collapse_init (const struct omp_for_data *fd, return total; } -/* Emit initializers for collapsed loop members. IVAR is the outer +/* Emit initializers for collapsed loop members. INNER is true if + this is for the element loop of a TILE. IVAR is the outer loop iteration variable, from which collapsed loop iteration values are calculated. COUNTS array has been initialized by expand_oacc_collapse_inits. */ static void -expand_oacc_collapse_vars (const struct omp_for_data *fd, +expand_oacc_collapse_vars (const struct omp_for_data *fd, bool inner, gimple_stmt_iterator *gsi, const oacc_collapse *counts, tree ivar) { @@ -1513,7 +1545,8 @@ expand_oacc_collapse_vars (const struct omp_for_data *fd, { const omp_for_data_loop *loop = &fd->loops[ix]; const oacc_collapse *collapse = &counts[ix]; - tree iter_type = TREE_TYPE (loop->v); + tree v = inner ? loop->v : collapse->outer; + tree iter_type = TREE_TYPE (v); tree diff_type = TREE_TYPE (collapse->step); tree plus_type = iter_type; enum tree_code plus_code = PLUS_EXPR; @@ -1525,24 +1558,25 @@ expand_oacc_collapse_vars (const struct omp_for_data *fd, plus_type = sizetype; } - expr = fold_build2 (TRUNC_MOD_EXPR, ivar_type, ivar, - fold_convert (ivar_type, collapse->iters)); + expr = ivar; + if (ix) + { + tree mod = fold_convert (ivar_type, collapse->iters); + ivar = fold_build2 (TRUNC_DIV_EXPR, ivar_type, expr, mod); + expr = fold_build2 (TRUNC_MOD_EXPR, ivar_type, expr, mod); + ivar = force_gimple_operand_gsi (gsi, ivar, true, NULL_TREE, + true, GSI_SAME_STMT); + } + expr = fold_build2 (MULT_EXPR, diff_type, fold_convert (diff_type, expr), collapse->step); - expr = fold_build2 (plus_code, iter_type, collapse->base, + expr = fold_build2 (plus_code, iter_type, + inner ? collapse->outer : collapse->base, fold_convert (plus_type, expr)); expr = force_gimple_operand_gsi (gsi, expr, false, NULL_TREE, true, GSI_SAME_STMT); - gassign *ass = gimple_build_assign (loop->v, expr); + gassign *ass = gimple_build_assign (v, expr); gsi_insert_before (gsi, ass, GSI_SAME_STMT); - - if (ix) - { - expr = fold_build2 (TRUNC_DIV_EXPR, ivar_type, ivar, - fold_convert (ivar_type, collapse->iters)); - ivar = force_gimple_operand_gsi (gsi, expr, true, NULL_TREE, - true, GSI_SAME_STMT); - } } } @@ -5230,7 +5264,8 @@ expand_omp_taskloop_for_inner (struct omp_region *region, where LTGT is < or >. We may have a specified chunking size, CHUNKING (constant 0 for no chunking) and we will have a GWV partitioning mask, specifying dimensions over which the loop is to be - partitioned (see note below). We generate code that looks like: + partitioned (see note below). We generate code that looks like + (this ignores tiling): [incoming FALL->body, BRANCH->exit] typedef signedintify (typeof (V)) T; // underlying signed integral type @@ -5260,11 +5295,7 @@ expand_omp_taskloop_for_inner (struct omp_region *region, [incoming] V = B + ((range -/+ 1) / S +/- 1) * S [*] - [*] Needed if V live at end of loop - - Note: CHUNKING & GWV mask are specified explicitly here. This is a - transition, and will be specified by a more general mechanism shortly. - */ + [*] Needed if V live at end of loop. */ static void expand_oacc_for (struct omp_region *region, struct omp_for_data *fd) @@ -5327,9 +5358,16 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd) tree step = create_tmp_var (diff_type, ".step"); bool up = cond_code == LT_EXPR; tree dir = build_int_cst (diff_type, up ? +1 : -1); - bool chunking = !gimple_in_ssa_p (cfun);; + bool chunking = !gimple_in_ssa_p (cfun); bool negating; + /* Tiling vars. */ + tree tile_size = NULL_TREE; + tree element_s = NULL_TREE; + tree e_bound = NULL_TREE, e_offset = NULL_TREE, e_step = NULL_TREE; + basic_block elem_body_bb = NULL; + basic_block elem_cont_bb = NULL; + /* SSA instances. */ tree offset_incr = NULL_TREE; tree offset_init = NULL_TREE; @@ -5360,11 +5398,12 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd) gwv = build_int_cst (integer_type_node, GOMP_DIM_MASK (GOMP_DIM_GANG)); } - if (fd->collapse > 1) + if (fd->collapse > 1 || fd->tiling) { + gcc_assert (!gimple_in_ssa_p (cfun) && up); counts = XALLOCAVEC (struct oacc_collapse, fd->collapse); tree total = expand_oacc_collapse_init (fd, &gsi, counts, - TREE_TYPE (fd->loop.n2)); + TREE_TYPE (fd->loop.n2), loc); if (SSA_VAR_P (fd->loop.n2)) { @@ -5373,7 +5412,6 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd) ass = gimple_build_assign (fd->loop.n2, total); gsi_insert_before (&gsi, ass, GSI_SAME_STMT); } - } tree b = fd->loop.n1; @@ -5397,6 +5435,29 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd) expr = fold_convert (diff_type, chunk_size); chunk_size = force_gimple_operand_gsi (&gsi, expr, true, NULL_TREE, true, GSI_SAME_STMT); + + if (fd->tiling) + { + /* Determine the tile size and element step, + modify the outer loop step size. */ + tile_size = create_tmp_var (diff_type, ".tile_size"); + expr = build_int_cst (diff_type, 1); + for (int ix = 0; ix < fd->collapse; ix++) + expr = fold_build2 (MULT_EXPR, diff_type, counts[ix].tile, expr); + expr = force_gimple_operand_gsi (&gsi, expr, true, + NULL_TREE, true, GSI_SAME_STMT); + ass = gimple_build_assign (tile_size, expr); + gsi_insert_before (&gsi, ass, GSI_SAME_STMT); + + element_s = create_tmp_var (diff_type, ".element_s"); + ass = gimple_build_assign (element_s, s); + gsi_insert_before (&gsi, ass, GSI_SAME_STMT); + + expr = fold_build2 (MULT_EXPR, diff_type, s, tile_size); + s = force_gimple_operand_gsi (&gsi, expr, true, + NULL_TREE, true, GSI_SAME_STMT); + } + /* Determine the range, avoiding possible unsigned->signed overflow. */ negating = !up && TYPE_UNSIGNED (iter_type); expr = fold_build2 (MINUS_EXPR, plus_type, @@ -5501,8 +5562,72 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd) true, GSI_SAME_STMT); ass = gimple_build_assign (v, expr); gsi_insert_before (&gsi, ass, GSI_SAME_STMT); - if (fd->collapse > 1) - expand_oacc_collapse_vars (fd, &gsi, counts, v); + + if (fd->collapse > 1 || fd->tiling) + expand_oacc_collapse_vars (fd, false, &gsi, counts, v); + + if (fd->tiling) + { + /* Determine the range of the element loop -- usually simply + the tile_size, but could be smaller if the final + iteration of the outer loop is a partial tile. */ + tree e_range = create_tmp_var (diff_type, ".e_range"); + + expr = build2 (MIN_EXPR, diff_type, + build2 (MINUS_EXPR, diff_type, bound, offset), + build2 (MULT_EXPR, diff_type, tile_size, + element_s)); + expr = force_gimple_operand_gsi (&gsi, expr, false, NULL_TREE, + true, GSI_SAME_STMT); + ass = gimple_build_assign (e_range, expr); + gsi_insert_before (&gsi, ass, GSI_SAME_STMT); + + /* Determine bound, offset & step of inner loop. */ + e_bound = create_tmp_var (diff_type, ".e_bound"); + e_offset = create_tmp_var (diff_type, ".e_offset"); + e_step = create_tmp_var (diff_type, ".e_step"); + + /* Mark these as element loops. */ + tree t, e_gwv = integer_minus_one_node; + tree chunk = build_int_cst (diff_type, 0); /* Never chunked. */ + + t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_OFFSET); + call = gimple_build_call_internal (IFN_GOACC_LOOP, 7, t, dir, e_range, + element_s, chunk, e_gwv, chunk); + gimple_call_set_lhs (call, e_offset); + gimple_set_location (call, loc); + gsi_insert_before (&gsi, call, GSI_SAME_STMT); + + t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_BOUND); + call = gimple_build_call_internal (IFN_GOACC_LOOP, 7, t, dir, e_range, + element_s, chunk, e_gwv, e_offset); + gimple_call_set_lhs (call, e_bound); + gimple_set_location (call, loc); + gsi_insert_before (&gsi, call, GSI_SAME_STMT); + + t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_STEP); + call = gimple_build_call_internal (IFN_GOACC_LOOP, 6, t, dir, e_range, + element_s, chunk, e_gwv); + gimple_call_set_lhs (call, e_step); + gimple_set_location (call, loc); + gsi_insert_before (&gsi, call, GSI_SAME_STMT); + + /* Add test and split block. */ + expr = build2 (cond_code, boolean_type_node, e_offset, e_bound); + stmt = gimple_build_cond_empty (expr); + gsi_insert_before (&gsi, stmt, GSI_SAME_STMT); + split = split_block (body_bb, stmt); + elem_body_bb = split->dest; + if (cont_bb == body_bb) + cont_bb = elem_body_bb; + body_bb = split->src; + + split->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE; + + /* Initialize the user's loop vars. */ + gsi = gsi_start_bb (elem_body_bb); + expand_oacc_collapse_vars (fd, true, &gsi, counts, e_offset); + } } /* Loop increment goes into cont_bb. If this is not a loop, we @@ -5516,10 +5641,34 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd) gomp_continue *cont_stmt = as_a (gsi_stmt (gsi)); loc = gimple_location (cont_stmt); + if (fd->tiling) + { + /* Insert element loop increment and test. */ + expr = build2 (PLUS_EXPR, diff_type, e_offset, e_step); + expr = force_gimple_operand_gsi (&gsi, expr, false, NULL_TREE, + true, GSI_SAME_STMT); + ass = gimple_build_assign (e_offset, expr); + gsi_insert_before (&gsi, ass, GSI_SAME_STMT); + expr = build2 (cond_code, boolean_type_node, e_offset, e_bound); + + stmt = gimple_build_cond_empty (expr); + gsi_insert_before (&gsi, stmt, GSI_SAME_STMT); + split = split_block (cont_bb, stmt); + elem_cont_bb = split->src; + cont_bb = split->dest; + + split->flags ^= EDGE_FALLTHRU | EDGE_FALSE_VALUE; + make_edge (elem_cont_bb, elem_body_bb, EDGE_TRUE_VALUE); + + make_edge (body_bb, cont_bb, EDGE_FALSE_VALUE); + + gsi = gsi_for_stmt (cont_stmt); + } + /* Increment offset. */ if (gimple_in_ssa_p (cfun)) - expr= build2 (plus_code, iter_type, offset, - fold_convert (plus_type, step)); + expr = build2 (plus_code, iter_type, offset, + fold_convert (plus_type, step)); else expr = build2 (PLUS_EXPR, diff_type, offset, step); expr = force_gimple_operand_gsi (&gsi, expr, false, NULL_TREE, @@ -5592,7 +5741,7 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd) if (cont_bb) { - /* We now have one or two nested loops. Update the loop + /* We now have one, two or three nested loops. Update the loop structures. */ struct loop *parent = entry_bb->loop_father; struct loop *body = body_bb->loop_father; @@ -5619,6 +5768,15 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd) body_loop->header = body_bb; body_loop->latch = cont_bb; add_loop (body_loop, parent); + + if (fd->tiling) + { + /* Insert tiling's element loop. */ + struct loop *inner_loop = alloc_loop (); + inner_loop->header = elem_body_bb; + inner_loop->latch = elem_cont_bb; + add_loop (inner_loop, body_loop); + } } } } diff --git a/gcc/omp-general.c b/gcc/omp-general.c index 649dbf30621..3f9aec8d6a7 100644 --- a/gcc/omp-general.c +++ b/gcc/omp-general.c @@ -133,13 +133,9 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd, fd->for_stmt = for_stmt; fd->pre = NULL; - if (gimple_omp_for_collapse (for_stmt) > 1) - fd->loops = loops; - else - fd->loops = &fd->loop; - fd->have_nowait = distribute || simd; fd->have_ordered = false; + fd->tiling = NULL_TREE; fd->collapse = 1; fd->ordered = 0; fd->sched_kind = OMP_CLAUSE_SCHEDULE_STATIC; @@ -184,9 +180,22 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd, collapse_count = &OMP_CLAUSE_COLLAPSE_COUNT (t); } break; + case OMP_CLAUSE_TILE: + fd->tiling = OMP_CLAUSE_TILE_LIST (t); + fd->collapse = list_length (fd->tiling); + gcc_assert (fd->collapse); + collapse_iter = &OMP_CLAUSE_TILE_ITERVAR (t); + collapse_count = &OMP_CLAUSE_TILE_COUNT (t); + break; default: break; } + + if (fd->collapse > 1 || fd->tiling) + fd->loops = loops; + else + fd->loops = &fd->loop; + if (fd->ordered && fd->collapse == 1 && loops != NULL) { fd->loops = loops; @@ -205,7 +214,7 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd, fd->sched_kind = OMP_CLAUSE_SCHEDULE_STATIC; gcc_assert (fd->chunk_size == NULL); } - gcc_assert (fd->collapse == 1 || collapse_iter != NULL); + gcc_assert ((fd->collapse == 1 && !fd->tiling) || collapse_iter != NULL); if (taskloop) fd->sched_kind = OMP_CLAUSE_SCHEDULE_RUNTIME; if (fd->sched_kind == OMP_CLAUSE_SCHEDULE_RUNTIME) @@ -223,7 +232,10 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd, int cnt = fd->ordered ? fd->ordered : fd->collapse; for (i = 0; i < cnt; i++) { - if (i == 0 && fd->collapse == 1 && (fd->ordered == 0 || loops == NULL)) + if (i == 0 + && fd->collapse == 1 + && !fd->tiling + && (fd->ordered == 0 || loops == NULL)) loop = &fd->loop; else if (loops != NULL) loop = loops + i; @@ -252,7 +264,7 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd, || (fd->sched_kind == OMP_CLAUSE_SCHEDULE_STATIC && !fd->have_ordered)) { - if (fd->collapse == 1) + if (fd->collapse == 1 && !fd->tiling) iter_type = TREE_TYPE (loop->v); else if (i == 0 || TYPE_PRECISION (iter_type) @@ -383,7 +395,7 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd, *collapse_count = create_tmp_var (iter_type, ".count"); } - if (fd->collapse > 1 || (fd->ordered && loops)) + if (fd->collapse > 1 || fd->tiling || (fd->ordered && loops)) { fd->loop.v = *collapse_iter; fd->loop.n1 = build_int_cst (TREE_TYPE (fd->loop.v), 0); diff --git a/gcc/omp-general.h b/gcc/omp-general.h index d89e8c179bf..3cf7fcec41f 100644 --- a/gcc/omp-general.h +++ b/gcc/omp-general.h @@ -31,9 +31,10 @@ enum oacc_loop_flags { OLF_AUTO = 1u << 1, /* Compiler chooses axes. */ OLF_INDEPENDENT = 1u << 2, /* Iterations are known independent. */ OLF_GANG_STATIC = 1u << 3, /* Gang partitioning is static (has op). */ - + OLF_TILE = 1u << 4, /* Tiled loop. */ + /* Explicitly specified loop axes. */ - OLF_DIM_BASE = 4, + OLF_DIM_BASE = 5, OLF_DIM_GANG = 1u << (OLF_DIM_BASE + GOMP_DIM_GANG), OLF_DIM_WORKER = 1u << (OLF_DIM_BASE + GOMP_DIM_WORKER), OLF_DIM_VECTOR = 1u << (OLF_DIM_BASE + GOMP_DIM_VECTOR), @@ -58,7 +59,8 @@ struct omp_for_data tree chunk_size; gomp_for *for_stmt; tree pre, iter_type; - int collapse; + tree tiling; /* Tiling values (if non null). */ + int collapse; /* Collapsed loops, 1 for a non-collapsed loop. */ int ordered; bool have_nowait, have_ordered, simd_schedule; unsigned char sched_modifiers; diff --git a/gcc/omp-low.c b/gcc/omp-low.c index ff0f4477cd7..35df02c70a4 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -1330,6 +1330,7 @@ scan_sharing_clauses (tree clauses, omp_context *ctx, case OMP_CLAUSE_INDEPENDENT: case OMP_CLAUSE_AUTO: case OMP_CLAUSE_SEQ: + case OMP_CLAUSE_TILE: case OMP_CLAUSE__SIMT_: break; @@ -1340,7 +1341,6 @@ scan_sharing_clauses (tree clauses, omp_context *ctx, install_var_local (decl, ctx); break; - case OMP_CLAUSE_TILE: case OMP_CLAUSE__CACHE_: default: gcc_unreachable (); @@ -1501,11 +1501,11 @@ scan_sharing_clauses (tree clauses, omp_context *ctx, case OMP_CLAUSE_INDEPENDENT: case OMP_CLAUSE_AUTO: case OMP_CLAUSE_SEQ: + case OMP_CLAUSE_TILE: case OMP_CLAUSE__GRIDDIM_: case OMP_CLAUSE__SIMT_: break; - case OMP_CLAUSE_TILE: case OMP_CLAUSE__CACHE_: default: gcc_unreachable (); @@ -5610,6 +5610,10 @@ lower_oacc_head_mark (location_t loc, tree ddvar, tree clauses, tag |= OLF_INDEPENDENT; break; + case OMP_CLAUSE_TILE: + tag |= OLF_TILE; + break; + default: continue; } @@ -5627,14 +5631,20 @@ lower_oacc_head_mark (location_t loc, tree ddvar, tree clauses, if (!tgt || is_oacc_parallel (tgt)) tag |= OLF_INDEPENDENT; - /* A loop lacking SEQ, GANG, WORKER and/or VECTOR is implicitly AUTO. */ - if (!(tag & (((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE) - | OLF_SEQ))) - tag |= OLF_AUTO; + if (tag & OLF_TILE) + /* Tiling could use all 3 levels. */ + levels = 3; + else + { + /* A loop lacking SEQ, GANG, WORKER and/or VECTOR could be AUTO. + Ensure at least one level, or 2 for possible auto + partitioning */ + bool maybe_auto = !(tag & (((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) + << OLF_DIM_BASE) | OLF_SEQ)); - /* Ensure at least one level. */ - if (!levels) - levels++; + if (levels < 1u + maybe_auto) + levels = 1u + maybe_auto; + } args.quick_push (build_int_cst (integer_type_node, levels)); args.quick_push (build_int_cst (integer_type_node, tag)); diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c index 6ff6bc2eeb9..e4ce48cb8e5 100644 --- a/gcc/omp-offload.c +++ b/gcc/omp-offload.c @@ -67,9 +67,10 @@ struct oacc_loop tree routine; /* Pseudo-loop enclosing a routine. */ unsigned mask; /* Partitioning mask. */ + unsigned e_mask; /* Partitioning of element loops (when tiling). */ unsigned inner; /* Partitioning of inner loops. */ unsigned flags; /* Partitioning flags. */ - unsigned ifns; /* Contained loop abstraction functions. */ + vec ifns; /* Contained loop abstraction functions. */ tree chunk_size; /* Chunk size. */ gcall *head_end; /* Final marker of head sequence. */ }; @@ -217,6 +218,23 @@ omp_finish_file (void) } } +/* Call dim_pos (POS == true) or dim_size (POS == false) builtins for + axis DIM. Return a tmp var holding the result. */ + +static tree +oacc_dim_call (bool pos, int dim, gimple_seq *seq) +{ + tree arg = build_int_cst (unsigned_type_node, dim); + tree size = create_tmp_var (integer_type_node); + enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE; + gimple *call = gimple_build_call_internal (fn, 1, arg); + + gimple_call_set_lhs (call, size); + gimple_seq_add_stmt (seq, call); + + return size; +} + /* Find the number of threads (POS = false), or thread number (POS = true) for an OpenACC region partitioned as MASK. Setup code required for the calculation is added to SEQ. */ @@ -231,29 +249,17 @@ oacc_thread_numbers (bool pos, int mask, gimple_seq *seq) for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++) if (GOMP_DIM_MASK (ix) & mask) { - tree arg = build_int_cst (unsigned_type_node, ix); - if (res) { /* We had an outer index, so scale that by the size of this dimension. */ - tree n = create_tmp_var (integer_type_node); - gimple *call - = gimple_build_call_internal (IFN_GOACC_DIM_SIZE, 1, arg); - - gimple_call_set_lhs (call, n); - gimple_seq_add_stmt (seq, call); + tree n = oacc_dim_call (false, ix, seq); res = fold_build2 (MULT_EXPR, integer_type_node, res, n); } if (pos) { /* Determine index in this dimension. */ - tree id = create_tmp_var (integer_type_node); - gimple *call = gimple_build_call_internal - (IFN_GOACC_DIM_POS, 1, arg); - - gimple_call_set_lhs (call, id); - gimple_seq_add_stmt (seq, call); + tree id = oacc_dim_call (true, ix, seq); if (res) res = fold_build2 (PLUS_EXPR, integer_type_node, res, id); else @@ -452,6 +458,85 @@ oacc_xform_loop (gcall *call) gsi_replace_with_seq (&gsi, seq, true); } +/* Transform a GOACC_TILE call. Determines the element loop span for + the specified loop of the nest. This is 1 if we're not tiling. + + GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element); */ + +static void +oacc_xform_tile (gcall *call) +{ + gimple_stmt_iterator gsi = gsi_for_stmt (call); + unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0)); + /* Inner loops have higher loop_nos. */ + unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1)); + tree tile_size = gimple_call_arg (call, 2); + unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4)); + tree lhs = gimple_call_lhs (call); + tree type = TREE_TYPE (lhs); + gimple_seq seq = NULL; + tree span = build_int_cst (type, 1); + + gcc_assert (!(e_mask + & ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR) + | GOMP_DIM_MASK (GOMP_DIM_WORKER)))); + push_gimplify_context (!seen_error ()); + +#ifndef ACCEL_COMPILER + /* Partitioning disabled on host compilers. */ + e_mask = 0; +#endif + if (!e_mask) + /* Not paritioning. */ + span = integer_one_node; + else if (!integer_zerop (tile_size)) + /* User explicitly specified size. */ + span = tile_size; + else + { + /* Pick a size based on the paritioning of the element loop and + the number of loop nests. */ + tree first_size = NULL_TREE; + tree second_size = NULL_TREE; + + if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) + first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq); + if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) + second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq); + + if (!first_size) + { + first_size = second_size; + second_size = NULL_TREE; + } + + if (loop_no + 1 == collapse) + { + span = first_size; + if (!loop_no && second_size) + span = fold_build2 (MULT_EXPR, TREE_TYPE (span), + span, second_size); + } + else if (loop_no + 2 == collapse) + span = second_size; + else + span = NULL_TREE; + + if (!span) + /* There's no obvious element size for this loop. Options + are 1, first_size or some non-unity constant (32 is my + favourite). We should gather some statistics. */ + span = first_size; + } + + span = fold_convert (type, span); + gimplify_assign (lhs, span, &seq); + + pop_gimplify_context (NULL); + + gsi_replace_with_seq (&gsi, seq, true); +} + /* Default partitioned and minimum partitioned dimensions. */ static int oacc_default_dims[GOMP_DIM_MAX]; @@ -610,8 +695,7 @@ new_oacc_loop_raw (oacc_loop *parent, location_t loc) memset (loop->tails, 0, sizeof (loop->tails)); loop->routine = NULL_TREE; - loop->mask = loop->flags = loop->inner = 0; - loop->ifns = 0; + loop->mask = loop->e_mask = loop->flags = loop->inner = 0; loop->chunk_size = 0; loop->head_end = NULL; @@ -674,7 +758,7 @@ static oacc_loop * finish_oacc_loop (oacc_loop *loop) { /* If the loop has been collapsed, don't partition it. */ - if (!loop->ifns) + if (loop->ifns.is_empty ()) loop->mask = loop->flags = 0; return loop->parent; } @@ -810,9 +894,10 @@ oacc_loop_discover_walk (oacc_loop *loop, basic_block bb) break; case IFN_GOACC_LOOP: - /* Count the goacc loop abstraction fns, to determine if the - loop was collapsed already. */ - loop->ifns++; + case IFN_GOACC_TILE: + /* Record the abstraction function, so we can manipulate it + later. */ + loop->ifns.safe_push (call); break; case IFN_UNIQUE: @@ -947,51 +1032,6 @@ oacc_loop_xform_head_tail (gcall *from, int level) } } -/* Transform the IFN_GOACC_LOOP internal functions by providing the - determined partitioning mask and chunking argument. END_MARKER - points at the end IFN_HEAD_TAIL call intgroducing the loop. IFNS - is the number of IFN_GOACC_LOOP calls for the loop. MASK_ARG is - the replacement partitioning mask and CHUNK_ARG is the replacement - chunking arg. */ - -static void -oacc_loop_xform_loop (gcall *end_marker, unsigned ifns, - tree mask_arg, tree chunk_arg) -{ - gimple_stmt_iterator gsi = gsi_for_stmt (end_marker); - - gcc_checking_assert (ifns); - for (;;) - { - for (; !gsi_end_p (gsi); gsi_next (&gsi)) - { - gimple *stmt = gsi_stmt (gsi); - - if (!is_gimple_call (stmt)) - continue; - - gcall *call = as_a (stmt); - - if (!gimple_call_internal_p (call)) - continue; - - if (gimple_call_internal_fn (call) != IFN_GOACC_LOOP) - continue; - - *gimple_call_arg_ptr (call, 5) = mask_arg; - *gimple_call_arg_ptr (call, 4) = chunk_arg; - ifns--; - if (!ifns) - return; - } - - /* The LOOP_BOUND ifn could be in the single successor - block. */ - basic_block bb = single_succ (gsi_bb (gsi)); - gsi = gsi_start_bb (bb); - } -} - /* Process the discovered OpenACC loops, setting the correct partitioning level etc. */ @@ -1004,13 +1044,34 @@ oacc_loop_process (oacc_loop *loop) if (loop->mask && !loop->routine) { int ix; - unsigned mask = loop->mask; - unsigned dim = GOMP_DIM_GANG; - tree mask_arg = build_int_cst (unsigned_type_node, mask); + tree mask_arg = build_int_cst (unsigned_type_node, loop->mask); + tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask); tree chunk_arg = loop->chunk_size; + gcall *call; + + for (ix = 0; loop->ifns.iterate (ix, &call); ix++) + switch (gimple_call_internal_fn (call)) + { + case IFN_GOACC_LOOP: + { + bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node; + gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg); + if (!is_e) + gimple_call_set_arg (call, 4, chunk_arg); + } + break; - oacc_loop_xform_loop (loop->head_end, loop->ifns, mask_arg, chunk_arg); + case IFN_GOACC_TILE: + gimple_call_set_arg (call, 3, mask_arg); + gimple_call_set_arg (call, 4, e_mask_arg); + break; + default: + gcc_unreachable (); + } + + unsigned dim = GOMP_DIM_GANG; + unsigned mask = loop->mask | loop->e_mask; for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++) { while (!(GOMP_DIM_MASK (dim) & mask)) @@ -1050,10 +1111,16 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask) { bool auto_par = (loop->flags & OLF_AUTO) != 0; bool seq_par = (loop->flags & OLF_SEQ) != 0; - + bool tiling = (loop->flags & OLF_TILE) != 0; + this_mask = ((loop->flags >> OLF_DIM_BASE) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)); + /* Apply auto partitioning if this is a non-partitioned regular + loop, or (no more than) single axis tiled loop. */ + bool maybe_auto + = !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0); + if ((this_mask != 0) + auto_par + seq_par > 1) { if (noisy) @@ -1062,7 +1129,7 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask) ? "% overrides other OpenACC loop specifiers" : "% conflicts with other OpenACC loop " "specifiers"); - auto_par = false; + maybe_auto = false; loop->flags &= ~OLF_AUTO; if (seq_par) { @@ -1071,15 +1138,19 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask) this_mask = 0; } } - if (auto_par && (loop->flags & OLF_INDEPENDENT)) - mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX); + + if (maybe_auto && (loop->flags & OLF_INDEPENDENT)) + { + loop->flags |= OLF_AUTO; + mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX); + } } if (this_mask & outer_mask) { const oacc_loop *outer; for (outer = loop->parent; outer; outer = outer->parent) - if (outer->mask & this_mask) + if ((outer->mask | outer->e_mask) & this_mask) break; if (noisy) @@ -1125,13 +1196,33 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask) } } - loop->mask = this_mask; mask_all |= this_mask; + if (loop->flags & OLF_TILE) + { + /* When tiling, vector goes to the element loop, and failing + that we put worker there. The std doesn't contemplate + specifying all three. We choose to put worker and vector on + the element loops in that case. */ + unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR); + if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG)) + this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER); + + loop->e_mask = this_e_mask; + this_mask ^= this_e_mask; + } + + loop->mask = this_mask; + + if (dump_file) + fprintf (dump_file, "Loop %s:%d user specified %d & %d\n", + LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc), + loop->mask, loop->e_mask); + if (loop->child) { - loop->inner = oacc_loop_fixed_partitions (loop->child, - outer_mask | this_mask); + unsigned tmp_mask = outer_mask | this_mask | loop->e_mask; + loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask); mask_all |= loop->inner; } @@ -1143,14 +1234,17 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask) /* Walk the OpenACC loop heirarchy to assign auto-partitioned loops. OUTER_MASK is the partitioning this loop is contained within. + OUTER_ASSIGN is true if an outer loop is being auto-partitioned. Return the cumulative partitioning used by this loop, siblings and children. */ static unsigned -oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask) +oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask, + bool outer_assign) { bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT); bool noisy = true; + bool tiling = loop->flags & OLF_TILE; #ifdef ACCEL_COMPILER /* When device_type is supported, we want the device compiler to be @@ -1158,29 +1252,50 @@ oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask) noisy = false; #endif - if (assign && outer_mask < GOMP_DIM_MASK (GOMP_DIM_MAX - 1)) + if (assign && (!outer_assign | loop->inner)) { - /* Allocate the outermost loop at the outermost available - level. */ - unsigned this_mask = outer_mask + 1; + /* Allocate outermost and non-innermost loops at the outermost + non-innermost available level. */ + unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG); - if (!(this_mask & loop->inner)) - loop->mask = this_mask; + /* Find the first outermost available partition. */ + while (this_mask <= outer_mask) + this_mask <<= 1; + + /* Grab two axes if tiling, and we've not assigned anything */ + if (tiling && !(loop->mask | loop->e_mask)) + this_mask |= this_mask << 1; + + /* Prohibit the innermost partitioning at the moment. */ + this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1; + + /* Don't use any dimension explicitly claimed by an inner loop. */ + this_mask &= ~loop->inner; + + if (tiling && !loop->e_mask) + { + /* If we got two axes, allocate the inner one to the element + loop. */ + loop->e_mask = this_mask & (this_mask << 1); + this_mask ^= loop->e_mask; + } + + loop->mask |= this_mask; } if (loop->child) { - unsigned child_mask = outer_mask | loop->mask; - - if (loop->mask || assign) - child_mask |= GOMP_DIM_MASK (GOMP_DIM_MAX); - - loop->inner = oacc_loop_auto_partitions (loop->child, child_mask); + unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask; + loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask, + outer_assign | assign); } - if (assign && !loop->mask) + if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign)) { - /* Allocate the loop at the innermost available level. */ + /* Allocate the loop at the innermost available level. Note + that we do this even if we already assigned this loop the + outermost available level above. That way we'll partition + this along 2 axes, if they are available. */ unsigned this_mask = 0; /* Determine the outermost partitioning used within this loop. */ @@ -1193,24 +1308,44 @@ oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask) /* And avoid picking one use by an outer loop. */ this_mask &= ~outer_mask; - if (!this_mask && noisy) - warning_at (loop->loc, 0, - "insufficient partitioning available to parallelize loop"); + /* If tiling and we failed completely above, grab the next one + too. Making sure it doesn't hit an outer loop. */ + if (tiling) + { + this_mask &= ~(loop->e_mask | loop->mask); + unsigned tile_mask = ((this_mask >> 1) + & ~(outer_mask | loop->e_mask | loop->mask)); - loop->mask = this_mask; + if (tile_mask || loop->mask) + { + loop->e_mask |= this_mask; + this_mask = tile_mask; + } + if (!loop->e_mask && noisy) + warning_at (loop->loc, 0, + "insufficient partitioning available" + " to parallelize element loop"); + } + + loop->mask |= this_mask; + if (!loop->mask && noisy) + warning_at (loop->loc, 0, + "insufficient partitioning available" + " to parallelize%s loop", tiling ? " tile" : ""); } if (assign && dump_file) - fprintf (dump_file, "Auto loop %s:%d assigned %d\n", + fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n", LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc), - loop->mask); + loop->mask, loop->e_mask); unsigned inner_mask = 0; if (loop->sibling) - inner_mask |= oacc_loop_auto_partitions (loop->sibling, outer_mask); + inner_mask |= oacc_loop_auto_partitions (loop->sibling, + outer_mask, outer_assign); - inner_mask |= loop->inner | loop->mask; + inner_mask |= loop->inner | loop->mask | loop->e_mask; return inner_mask; } @@ -1226,7 +1361,7 @@ oacc_loop_partition (oacc_loop *loop, unsigned outer_mask) if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX)) { mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX); - mask_all |= oacc_loop_auto_partitions (loop, outer_mask); + mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false); } return mask_all; } @@ -1376,6 +1511,11 @@ execute_oacc_device_lower () { default: break; + case IFN_GOACC_TILE: + oacc_xform_tile (call); + rescan = true; + break; + case IFN_GOACC_LOOP: oacc_xform_loop (call); rescan = true; @@ -1403,7 +1543,7 @@ execute_oacc_device_lower () switch (kind) { default: - gcc_unreachable (); + break; case IFN_UNIQUE_OACC_FORK: case IFN_UNIQUE_OACC_JOIN: diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index aeba293c41d..95c0bfc5c7e 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,21 @@ +2017-02-09 Nathan Sidwell + Cesar Philippidis + Joseph Myers + Chung-Lin Tang + + * c-c++-common/goacc/combined-directives.c: Remove xfail. + * c-c++-common/goacc/loop-auto-1.c: Adjust and add additional case. + * c-c++-common/goacc/loop-auto-2.c: New. + * c-c++-common/goacc/tile.c: Include stdbool, fix expected errors. + * c-c++-common/goacc/tile-2.c: New. + * g++.dg/goacc/template.C: Test tile subst. Adjust erroneous uses. + * g++.dg/goacc/tile-1.C: New, check tile subst. + * gcc.dg/goacc/loop-processing-1.c: Adjust dg-final pattern. + * gfortran.dg/goacc/combined-directives.f90: Remove xfail. + * gfortran.dg/goacc/tile-1.f90: New test. + * gfortran.dg/goacc/tile-2.f90: New test. + * gfortran.dg/goacc/tile-lowering.f95: New test. + 2017-02-09 Richard Biener PR tree-optimization/69823 diff --git a/gcc/testsuite/c-c++-common/goacc/combined-directives.c b/gcc/testsuite/c-c++-common/goacc/combined-directives.c index 3fa800d7bbe..c2a3c57b48b 100644 --- a/gcc/testsuite/c-c++-common/goacc/combined-directives.c +++ b/gcc/testsuite/c-c++-common/goacc/combined-directives.c @@ -111,7 +111,6 @@ test () // { dg-final { scan-tree-dump-times "acc loop vector" 2 "gimple" } } // { dg-final { scan-tree-dump-times "acc loop seq" 2 "gimple" } } // { dg-final { scan-tree-dump-times "acc loop auto" 2 "gimple" } } -// XFAILed: OpenACC tile clauses are discarded during gimplification. -// { dg-final { scan-tree-dump-times "acc loop tile.2, 3" 2 "gimple" { xfail *-*-* } } } +// { dg-final { scan-tree-dump-times "acc loop tile.2, 3" 2 "gimple" } } // { dg-final { scan-tree-dump-times "acc loop independent private.i" 2 "gimple" } } // { dg-final { scan-tree-dump-times "private.z" 2 "gimple" } } diff --git a/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c b/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c index 33d53409fe3..124befc4002 100644 --- a/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c +++ b/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c @@ -74,6 +74,21 @@ void Foo () for (int kx = 0; kx < 10; kx++) {} } } + +#pragma acc loop auto + for (int ix = 0; ix < 10; ix++) + { +#pragma acc loop auto + for (int jx = 0; jx < 10; jx++) + { +#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ + for (int kx = 0; kx < 10; kx++) + { +#pragma acc loop auto + for (int lx = 0; lx < 10; lx++) {} + } + } + } } } @@ -214,10 +229,10 @@ void Vector (void) #pragma acc loop auto for (int ix = 0; ix < 10; ix++) {} -#pragma acc loop auto +#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ for (int ix = 0; ix < 10; ix++) { -#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ +#pragma acc loop auto for (int jx = 0; jx < 10; jx++) {} } } diff --git a/gcc/testsuite/c-c++-common/goacc/loop-auto-2.c b/gcc/testsuite/c-c++-common/goacc/loop-auto-2.c new file mode 100644 index 00000000000..af3f0bddf2c --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/loop-auto-2.c @@ -0,0 +1,107 @@ + +// Tile parititioning + +void Ok () +{ +#pragma acc parallel num_gangs (10) num_workers(32) vector_length(32) + { + +#pragma acc loop tile(*) gang vector + for (int ix = 0; ix < 10; ix++) + { + } + +#pragma acc loop tile(*) + for (int ix = 0; ix < 10; ix++) + { + } + +#pragma acc loop tile(*) gang + for (int ix = 0; ix < 10; ix++) + { + #pragma acc loop vector + for (int jx = 0; jx < 10; jx++) + ; + } + +#pragma acc loop tile(*) + for (int ix = 0; ix < 10; ix++) + { + #pragma acc loop vector + for (int jx = 0; jx < 10; jx++) + ; + } + +#pragma acc loop gang + for (int jx = 0; jx < 10; jx++) + { +#pragma acc loop tile(*) vector + for (int ix = 0; ix < 10; ix++) + { + } + +#pragma acc loop tile(*) + for (int ix = 0; ix < 10; ix++) + { + } + } + +#pragma acc loop tile(*) worker + for (int ix = 0; ix < 10; ix++) + { + #pragma acc loop vector + for (int jx = 0; jx < 10; jx++) + ; + } + } +} + +void Bad () +{ +#pragma acc parallel num_gangs (10) num_workers(32) vector_length(32) + { + +#pragma acc loop tile(*) gang vector /* { dg-message "containing loop" } */ + for (int ix = 0; ix < 10; ix++) + { +#pragma acc loop vector /* { dg-error "uses same" } */ + for (int jx = 0; jx < 10; jx++) + ; + } + +#pragma acc loop tile(*) gang vector + for (int ix = 0; ix < 10; ix++) + { + #pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ + for (int jx = 0; jx < 10; jx++) + ; + } + +#pragma acc loop tile(*) auto /* { dg-warning "insufficient partitioning" } */ + for (int ix = 0; ix < 10; ix++) + { + #pragma acc loop worker + for (int jx = 0; jx < 10; jx++) + ; + } + +#pragma acc loop worker /* { dg-message "containing loop" } */ + for (int jx = 0; jx < 10; jx++) + { +#pragma acc loop tile(*) gang vector /* { dg-error "incorrectly nested" } */ + for (int ix = 0; ix < 10; ix++) + { + } + +#pragma acc loop tile(*) vector /* { dg-warning "insufficient partitioning" } */ + for (int ix = 0; ix < 10; ix++) + { + } + +#pragma acc loop tile(*) /* { dg-warning "insufficient partitioning" } */ + for (int ix = 0; ix < 10; ix++) + { + } + } + } +} diff --git a/gcc/testsuite/c-c++-common/goacc/tile-2.c b/gcc/testsuite/c-c++-common/goacc/tile-2.c new file mode 100644 index 00000000000..c8b240d225b --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/tile-2.c @@ -0,0 +1,21 @@ +int main () +{ +#pragma acc parallel + { +#pragma acc loop tile (*,*) + for (int ix = 0; ix < 30; ix++) + ; /* { dg-error "not enough" } */ + +#pragma acc loop tile (*,*) + for (int ix = 0; ix < 30; ix++) + for (int jx = 0; jx < ix; jx++) /* { dg-error "condition expression" } */ + ; + +#pragma acc loop tile (*) + for (int ix = 0; ix < 30; ix++) + for (int jx = 0; jx < ix; jx++) /* OK */ + ; + + } + return 0; +} diff --git a/gcc/testsuite/c-c++-common/goacc/tile.c b/gcc/testsuite/c-c++-common/goacc/tile.c index 8e70e718e1f..f10535a69e5 100644 --- a/gcc/testsuite/c-c++-common/goacc/tile.c +++ b/gcc/testsuite/c-c++-common/goacc/tile.c @@ -1,7 +1,9 @@ +#include + int main () { - int i, *a, b; + int i, j, k, *a, b; #pragma acc parallel loop tile (10) for (i = 0; i < 100; i++) @@ -13,11 +15,14 @@ main () #pragma acc parallel loop tile (10, *) for (i = 0; i < 100; i++) - ; + for (j = 0; j < 100; j++) + ; -#pragma acc parallel loop tile (10, *, i) +#pragma acc parallel loop tile (10, *, i) // { dg-error "" } for (i = 0; i < 100; i++) - ; + for (j = 0; j < 100; j++) + for (k = 0; k < 100; k++) + ; #pragma acc parallel loop tile // { dg-error "expected '\\\('" } for (i = 0; i < 100; i++) @@ -35,37 +40,44 @@ main () for (i = 0; i < 100; i++) ; -#pragma acc parallel loop tile (1.1) // { dg-error "'tile' value must be integral" } +#pragma acc parallel loop tile (1.1) // { dg-error "'tile' argument needs" } for (i = 0; i < 100; i++) ; -#pragma acc parallel loop tile (-3) // { dg-warning "'tile' value must be positive" } +#pragma acc parallel loop tile (-3) // { dg-error "'tile' argument needs" } for (i = 0; i < 100; i++) ; -#pragma acc parallel loop tile (10,-3) // { dg-warning "'tile' value must be positive" } +#pragma acc parallel loop tile (10,-3) // { dg-error "'tile' argument needs" } for (i = 0; i < 100; i++) - ; + for (j = 0; j < 100; j++) + ; -#pragma acc parallel loop tile (-100,10,5) // { dg-warning "'tile' value must be positive" } +#pragma acc parallel loop tile (-100,10,5) // { dg-error "'tile' argument needs" } for (i = 0; i < 100; i++) - ; + for (j = 0; j < 100; j++) + for (k = 0; k < 100; k++) + ; -#pragma acc parallel loop tile (1,2.0,true) // { dg-error "" } +#pragma acc parallel loop tile (1,true) for (i = 0; i < 100; i++) - ; + for (j = 0; j < 100; j++) + ; -#pragma acc parallel loop tile (*a, 1) +#pragma acc parallel loop tile (*a, 1) // { dg-error "" } for (i = 0; i < 100; i++) - ; + for (j = 0; j < 100; j++) + ; -#pragma acc parallel loop tile (1, *a, b) +#pragma acc parallel loop tile (1, b) // { dg-error "" } for (i = 0; i < 100; i++) - ; + for (j = 0; j < 100; j++) + ; -#pragma acc parallel loop tile (b, 1, *a) +#pragma acc parallel loop tile (b, 1) // { dg-error "" } for (i = 0; i < 100; i++) - ; + for (j = 0; j < 100; j++) + ; return 0; } @@ -73,7 +85,7 @@ main () void par (void) { - int i, j; + int i, j, k; #pragma acc parallel { @@ -95,22 +107,22 @@ void par (void) for (j = 1; j < 10; j++) { } } -#pragma acc loop tile(-2) // { dg-warning "'tile' value must be positive" } +#pragma acc loop tile(-2) // { dg-error "'tile' argument needs" } for (i = 1; i < 10; i++) { } -#pragma acc loop tile(i) +#pragma acc loop tile(i) // { dg-error "" } for (i = 1; i < 10; i++) { } #pragma acc loop tile(2, 2, 1) for (i = 1; i < 3; i++) { for (j = 4; j < 6; j++) - { } + for (k = 0; k< 100; k++); } #pragma acc loop tile(2, 2) for (i = 1; i < 5; i+=2) { - for (j = i + 1; j < 7; j+=i) + for (j = i + 1; j < 7; j+=i) // { dg-error "initializer expression" } { } } #pragma acc loop vector tile(*) @@ -156,24 +168,21 @@ void p3 (void) for (j = 1; j < 10; j++) { } } -#pragma acc parallel loop tile(-2) // { dg-warning "'tile' value must be positive" } +#pragma acc parallel loop tile(-2) // { dg-error "'tile' argument needs" } for (i = 1; i < 10; i++) { } -#pragma acc parallel loop tile(i) +#pragma acc parallel loop tile(i) // { dg-error "" } for (i = 1; i < 10; i++) { } #pragma acc parallel loop tile(2, 2, 1) for (i = 1; i < 3; i++) - { - for (j = 4; j < 6; j++) - { } - } + for (j = 4; j < 6; j++) + for (int k = 1 ; k < 2; k++) + ; #pragma acc parallel loop tile(2, 2) for (i = 1; i < 5; i+=2) - { - for (j = i + 1; j < 7; j++) - { } - } + for (j = i + 1; j < 7; j++) // { dg-error "initializer expression" } + { } #pragma acc parallel loop vector tile(*) for (i = 0; i < 10; i++) { } @@ -227,22 +236,23 @@ kern (void) #pragma acc loop tile(*, 1) for (i = 0; i < 10; i++) { - for (j = 0; j < 10; i++) + for (j = 0; j < 10; i++) /* { dg-error "increment expression" } */ { } } -#pragma acc loop tile(-2) // { dg-warning "'tile' value must be positive" } +#pragma acc loop tile(-2) // { dg-error "'tile' argument needs" } for (i = 0; i < 10; i++) { } -#pragma acc loop tile(i) +#pragma acc loop tile(i) // { dg-error "" } for (i = 0; i < 10; i++) { } #pragma acc loop tile(2, 2, 1) for (i = 2; i < 4; i++) - for (i = 4; i < 6; i++) + for (j = 4; j < 6; j++) + for (int k = 4; k < 6; k++) { } #pragma acc loop tile(2, 2) for (i = 1; i < 5; i+=2) - for (j = i+1; j < 7; i++) + for (j = i+1; j < 7; j++) /* { dg-error "initializer expression" } */ { } #pragma acc loop vector tile(*) for (i = 0; i < 10; i++) @@ -288,22 +298,21 @@ void k3 (void) for (j = 1; j < 10; j++) { } } -#pragma acc kernels loop tile(-2) // { dg-warning "'tile' value must be positive" } +#pragma acc kernels loop tile(-2) // { dg-error "'tile' argument needs" } for (i = 1; i < 10; i++) { } -#pragma acc kernels loop tile(i) +#pragma acc kernels loop tile(i) // { dg-error "" } for (i = 1; i < 10; i++) { } #pragma acc kernels loop tile(2, 2, 1) for (i = 1; i < 3; i++) - { - for (j = 4; j < 6; j++) - { } - } + for (j = 4; j < 6; j++) + for (int k = 1; k < 7; k++) + ; #pragma acc kernels loop tile(2, 2) for (i = 1; i < 5; i++) { - for (j = i + 1; j < 7; j += i) + for (j = i + 1; j < 7; j += i) /* { dg-error "initializer expression" } */ { } } #pragma acc kernels loop vector tile(*) diff --git a/gcc/testsuite/g++.dg/goacc/template.C b/gcc/testsuite/g++.dg/goacc/template.C index f139dc25b58..74f40d8922e 100644 --- a/gcc/testsuite/g++.dg/goacc/template.C +++ b/gcc/testsuite/g++.dg/goacc/template.C @@ -5,7 +5,7 @@ accDouble(int val) return val * 2; } -template T +template T oacc_parallel_copy (T a) { T b = 0; @@ -36,7 +36,7 @@ oacc_parallel_copy (T a) for (int j = 0; j < 5; j++) b = a; -#pragma acc loop auto tile (a, 3) +#pragma acc loop auto tile (I, 3) for (int i = 0; i < a; i++) for (int j = 0; j < 5; j++) b = a; @@ -135,7 +135,7 @@ oacc_kernels_copy (T a) int main () { - int b = oacc_parallel_copy (5); + int b = oacc_parallel_copy (5); int c = oacc_kernels_copy (5); return b + c; diff --git a/gcc/testsuite/g++.dg/goacc/tile-1.C b/gcc/testsuite/g++.dg/goacc/tile-1.C new file mode 100644 index 00000000000..27c53835d36 --- /dev/null +++ b/gcc/testsuite/g++.dg/goacc/tile-1.C @@ -0,0 +1,16 @@ +/* of tile erroneously clobbered the template, resulting + in missing errors and other fun. */ + +template +void Foo () +{ +#pragma acc parallel loop tile(I) // { dg-error "" } + for (int ix = 0; ix < 10; ix++) + ; +} + +int main () +{ + Foo<1> (); // OK + Foo<-1> (); // error +} diff --git a/gcc/testsuite/gcc.dg/goacc/loop-processing-1.c b/gcc/testsuite/gcc.dg/goacc/loop-processing-1.c index 619576a17ee..07f56a25329 100644 --- a/gcc/testsuite/gcc.dg/goacc/loop-processing-1.c +++ b/gcc/testsuite/gcc.dg/goacc/loop-processing-1.c @@ -15,4 +15,4 @@ void vector_1 (int *ary, int size) } } -/* { dg-final { scan-tree-dump {OpenACC loops.*Loop 0\(0\).*Loop 14\(1\).*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 1, 20\);.*Head-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 1, 20\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 0\);.*Tail-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_JOIN, \.data_dep\.[0-9_]+, 0\);.*Loop 6\(4\).*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 1, 6\);.*Head-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 1, 6\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 2\);.*Tail-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_JOIN, \.data_dep\.[0-9_]+, 2\);} "oaccdevlow" } } */ +/* { dg-final { scan-tree-dump {OpenACC loops.*Loop 0\(0\).*Loop 24\(1\).*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 1, 36\);.*Head-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 1, 36\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 0\);.*Tail-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_JOIN, \.data_dep\.[0-9_]+, 0\);.*Loop 6\(6\).*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 2, 6\);.*Head-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 2, 6\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 1\);.*Head-1:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 2\);.*Tail-1:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 2\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_JOIN, \.data_dep\.[0-9_]+, 2\);.*Tail-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_JOIN, \.data_dep\.[0-9_]+, 1\);} "oaccdevlow" } } */ diff --git a/gcc/testsuite/gfortran.dg/goacc/combined-directives.f90 b/gcc/testsuite/gfortran.dg/goacc/combined-directives.f90 index abb5e6b6c3d..42a447ad06b 100644 --- a/gcc/testsuite/gfortran.dg/goacc/combined-directives.f90 +++ b/gcc/testsuite/gfortran.dg/goacc/combined-directives.f90 @@ -143,8 +143,7 @@ end subroutine test ! { dg-final { scan-tree-dump-times "acc loop private.i. private.j. vector" 2 "gimple" } } ! { dg-final { scan-tree-dump-times "acc loop private.i. private.j. seq" 2 "gimple" } } ! { dg-final { scan-tree-dump-times "acc loop private.i. private.j. auto" 2 "gimple" } } -! XFAILed: OpenACC tile clauses are discarded during gimplification. -! { dg-final { scan-tree-dump-times "acc loop private.i. private.j. tile.2, 3" 2 "gimple" { xfail *-*-* } } } +! { dg-final { scan-tree-dump-times "acc loop private.i. private.j. tile.2, 3" 2 "gimple" } } ! { dg-final { scan-tree-dump-times "acc loop private.i. independent" 2 "gimple" } } ! { dg-final { scan-tree-dump-times "private.z" 2 "gimple" } } ! { dg-final { scan-tree-dump-times "omp target oacc_\[^ \]+ map.force_tofrom:y" 2 "gimple" } } diff --git a/gcc/testsuite/gfortran.dg/goacc/tile-1.f90 b/gcc/testsuite/gfortran.dg/goacc/tile-1.f90 new file mode 100644 index 00000000000..3dbabda0342 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/goacc/tile-1.f90 @@ -0,0 +1,339 @@ +subroutine parloop + integer, parameter :: n = 100 + integer i, j, k, a + + !$acc parallel loop tile(10) + do i = 1, n + end do + + !$acc parallel loop tile(*) + do i = 1, n + end do + + !$acc parallel loop tile(10, *) + do i = 1, n + do j = 1, n + end do + end do + + !$acc parallel loop tile(10, *, i) ! { dg-error "" } + do i = 1, n + do j = 1, n + do k = 1, n + end do + end do + end do + + !$acc parallel loop tile ! { dg-error "Unclassifiable" } + do i = 1, n + end do + + !$acc parallel loop tile() ! { dg-error "Syntax error" } + do i = 1, n + end do + + !$acc parallel loop tile(,1) ! { dg-error "Syntax error" } + do i = 1, n + end do + + !$acc parallel loop tile(,,) ! { dg-error "Syntax error" } + do i = 1, n + end do + + !$acc parallel loop tile(1.1) ! { dg-error "requires a scalar INTEGER" } + do i = 1, n + end do + + !$acc parallel loop tile(-3) ! { dg-warning "must be positive" } + do i = 1, n + end do + + !$acc parallel loop tile(10, -3) ! { dg-warning "must be positive" } + do i = 1, n + do j = 1, n + end do + end do + + !$acc parallel loop tile(-100, 10, 5) ! { dg-warning "must be positive" } + do i = 1, n + do j = 1, n + do k = 1, n + end do + end do + end do + + !$acc parallel loop tile(10, .true.) ! { dg-error "requires a scalar" } + do i = 1, n + do j = 1, n + end do + end do + + !$acc parallel loop tile(1, a) ! { dg-error "constant expression" } + do i = 1, n + do j = 1, n + end do + end do + + !$acc parallel loop tile(a, 1) ! { dg-error "constant expression" } + do i = 1, n + do j = 1, n + end do + end do + + !$acc parallel loop tile(2, 3) collapse (2) ! { dg-error "Incompatible use" } + do i = 1, n + do j = 1, n + end do + end do +end subroutine parloop + +subroutine par + integer, parameter :: n = 100 + integer i, j, k + + !$acc parallel + !$acc loop tile ! { dg-error "Unclassifiable" } + do i = 1, n + end do + + !$acc loop tile() ! { dg-error "Syntax error" } + do i = 1, n + end do + + !$acc loop tile(1) + do i = 1, n + end do + + !$acc loop tile(*) + do i = 1, n + end do + + !$acc loop tile(2) + do i = 1, n + do j = 1, n + end do + end do + + !$acc loop tile(-2) ! { dg-warning "must be positive" } + do i = 1, n + end do + + !$acc loop tile(i) ! { dg-error "constant expression" } + do i = 1, n + end do + + !$acc loop tile(2, 2, 1) + do i = 1, n + do j = 1, n + do k = 1, n + end do + end do + end do + + !$acc parallel loop tile(2, 2) + do i = 1, n + do j = i+1, n, j ! { dg-error "rectangular iteration space" } + end do + end do + + !$acc loop vector tile(*) + do i = 1, n + end do + + !$acc loop worker tile(*) + do i = 1, n + end do + + !$acc loop gang tile(*) + do i = 1, n + end do + + !$acc loop vector gang tile(*) + do i = 1, n + end do + + !$acc loop vector worker tile(*) + do i = 1, n + end do + + !$acc loop gang worker tile(*) + do i = 1, n + end do + + !$acc loop tile(2, 3) collapse (2) ! { dg-error "Incompatible use" } + do i = 1, n + do j = 1, n + end do + end do + !$acc end parallel +end subroutine par + +subroutine kern + integer, parameter :: n = 100 + integer i, j, k + + !$acc kernels + !$acc loop tile ! { dg-error "Unclassifiable" } + do i = 1, n + end do + + !$acc loop tile() ! { dg-error "Syntax error" } + do i = 1, n + end do + + !$acc loop tile(1) + do i = 1, n + end do + + !$acc loop tile(*) + do i = 1, n + end do + + !$acc loop tile(2) + do i = 1, n + do j = 1, n + end do + end do + + !$acc loop tile(-2) ! { dg-warning "must be positive" } + do i = 1, n + end do + + !$acc loop tile(i) ! { dg-error "constant expression" } + do i = 1, n + end do + + !$acc loop tile(2, 2, 1) + do i = 1, n + do j = 1, n + do k = 1, n + end do + end do + end do + + !$acc parallel loop tile(2, 2) + do i = 1, n + do j = 1, n + end do + end do + + !$acc loop vector tile(*) + do i = 1, n + end do + + !$acc loop worker tile(*) + do i = 1, n + end do + + !$acc loop gang tile(*) + do i = 1, n + end do + + !$acc loop vector gang tile(*) + do i = 1, n + end do + + !$acc loop vector worker tile(*) + do i = 1, n + end do + + !$acc loop gang worker tile(*) + do i = 1, n + end do + + !$acc loop tile(2, 3) collapse (2) ! { dg-error "Incompatible use" } + do i = 1, n + do j = 1, n + end do + end do + !$acc end kernels +end subroutine kern + +subroutine kernsloop + integer, parameter :: n = 100 + integer i, j, k, a + + !$acc kernels loop tile(10) + do i = 1, n + end do + + !$acc kernels loop tile(*) + do i = 1, n + end do + + !$acc kernels loop tile(10, *) + do i = 1, n + do j = 1, n + end do + end do + + !$acc kernels loop tile(10, *, i) ! { dg-error "" } + do i = 1, n + do j = 1, n + do k = 1, n + end do + end do + end do + + !$acc kernels loop tile ! { dg-error "Unclassifiable" } + do i = 1, n + end do + + !$acc kernels loop tile() ! { dg-error "Syntax error" } + do i = 1, n + end do + + !$acc kernels loop tile(,1) ! { dg-error "Syntax error" } + do i = 1, n + end do + + !$acc kernels loop tile(,,) ! { dg-error "Syntax error" } + do i = 1, n + end do + + !$acc kernels loop tile(1.1) ! { dg-error "requires a scalar INTEGER" } + do i = 1, n + end do + + !$acc kernels loop tile(-3) ! { dg-warning "must be positive" } + do i = 1, n + end do + + !$acc kernels loop tile(10, -3) ! { dg-warning "must be positive" } + do i = 1, n + do j = 1, n + end do + end do + + !$acc kernels loop tile(-100, 10, 5) ! { dg-warning "must be positive" } + do i = 1, n + do j = 1, n + do k = 1, n + end do + end do + end do + + !$acc kernels loop tile(10, .true.) ! { dg-error "requires a scalar" } + do i = 1, n + do j = 1, n + end do + end do + + !$acc kernels loop tile(1, a) ! { dg-error "constant expression" } + do i = 1, n + do j = 1, n + end do + end do + + !$acc kernels loop tile(a, 1) ! { dg-error "constant expression" } + do i = 1, n + do j = 1, n + end do + end do + + !$acc kernels loop tile(2, 3) collapse (2) ! { dg-error "Incompatible use" } + do i = 1, n + do j = 1, n + end do + end do +end subroutine kernsloop diff --git a/gcc/testsuite/gfortran.dg/goacc/tile-2.f90 b/gcc/testsuite/gfortran.dg/goacc/tile-2.f90 new file mode 100644 index 00000000000..c56754380cc --- /dev/null +++ b/gcc/testsuite/gfortran.dg/goacc/tile-2.f90 @@ -0,0 +1,21 @@ +subroutine par + integer ix, jx + + !$acc parallel + !$acc loop tile (*,*) ! { dg-error "not enough DO loops for tiled" } + do ix = 1, 30 + end do + + !$acc loop tile (*,*) + do ix = 1, 30 + do jx = 1, ix ! { dg-error "tiled loops don.t form rectangular" } + end do + end do + + !$acc loop tile (*) + do ix = 1, 30 + do jx = 1, ix + end do + end do + !$acc end parallel +end subroutine par diff --git a/gcc/testsuite/gfortran.dg/goacc/tile-lowering.f95 b/gcc/testsuite/gfortran.dg/goacc/tile-lowering.f95 new file mode 100644 index 00000000000..1cb8b9cc512 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/goacc/tile-lowering.f95 @@ -0,0 +1,292 @@ +! { dg-do compile } +! { dg-additional-options "-fdump-tree-original" } + +subroutine par + integer i, j, k + + !$acc parallel + !$acc loop tile (1) + do i = 1, 10 + end do + + !$acc loop tile (*) + do i = 1, 10 + end do + + !$acc loop tile (1,2) + do i = 1, 10 + do j = 1, 10 + end do + end do + + !$acc loop tile (*,2) + do i = 1, 10 + do j = 1, 10 + end do + end do + + !$acc loop tile (1,*) + do i = 1, 10 + do j = 1, 10 + end do + end do + + !$acc loop tile (*,*) + do i = 1, 10 + do j = 1, 10 + end do + end do + + !$acc loop tile (1,2,3) + do i = 1, 10 + do j = 1, 10 + do k = 1, 10 + end do + end do + end do + + !$acc loop tile (*,2,3) + do i = 1, 10 + do j = 1, 10 + do k = 1, 10 + end do + end do + end do + + !$acc loop tile (1,*,3) + do i = 1, 10 + do j = 1, 10 + do k = 1, 10 + end do + end do + end do + + !$acc loop tile (1,2,*) + do i = 1, 10 + do j = 1, 10 + do k = 1, 10 + end do + end do + end do + !$acc end parallel +end subroutine par + +subroutine kerns + integer i, j, k + + !$acc kernels + !$acc loop tile (1) + do i = 1, 10 + end do + + !$acc loop tile (*) + do i = 1, 10 + end do + + !$acc loop tile (1,2) + do i = 1, 10 + do j = 1, 10 + end do + end do + + !$acc loop tile (*,2) + do i = 1, 10 + do j = 1, 10 + end do + end do + + !$acc loop tile (1,*) + do i = 1, 10 + do j = 1, 10 + end do + end do + + !$acc loop tile (*,*) + do i = 1, 10 + do j = 1, 10 + end do + end do + + !$acc loop tile (1,2,3) + do i = 1, 10 + do j = 1, 10 + do k = 1, 10 + end do + end do + end do + + !$acc loop tile (*,2,3) + do i = 1, 10 + do j = 1, 10 + do k = 1, 10 + end do + end do + end do + + !$acc loop tile (1,*,3) + do i = 1, 10 + do j = 1, 10 + do k = 1, 10 + end do + end do + end do + + !$acc loop tile (1,2,*) + do i = 1, 10 + do j = 1, 10 + do k = 1, 10 + end do + end do + end do + !$acc end kernels +end subroutine kerns + +subroutine parloop + integer i, j, k + + !$acc parallel loop tile (1) + do i = 1, 10 + end do + + !$acc parallel loop tile (*) + do i = 1, 10 + end do + + !$acc parallel loop tile (1,2) + do i = 1, 10 + do j = 1, 10 + end do + end do + + !$acc parallel loop tile (*,2) + do i = 1, 10 + do j = 1, 10 + end do + end do + + !$acc parallel loop tile (1,*) + do i = 1, 10 + do j = 1, 10 + end do + end do + + !$acc parallel loop tile (*,*) + do i = 1, 10 + do j = 1, 10 + end do + end do + + !$acc parallel loop tile (1,2,3) + do i = 1, 10 + do j = 1, 10 + do k = 1, 10 + end do + end do + end do + + !$acc parallel loop tile (*,2,3) + do i = 1, 10 + do j = 1, 10 + do k = 1, 10 + end do + end do + end do + + !$acc parallel loop tile (1,*,3) + do i = 1, 10 + do j = 1, 10 + do k = 1, 10 + end do + end do + end do + + !$acc parallel loop tile (1,2,*) + do i = 1, 10 + do j = 1, 10 + do k = 1, 10 + end do + end do + end do +end subroutine parloop + +subroutine kernloop + integer i, j, k + + !$acc kernels loop tile (1) + do i = 1, 10 + end do + + !$acc kernels loop tile (*) + do i = 1, 10 + end do + + !$acc kernels loop tile (1,2) + do i = 1, 10 + do j = 1, 10 + end do + end do + + !$acc kernels loop tile (*,2) + do i = 1, 10 + do j = 1, 10 + end do + end do + + !$acc kernels loop tile (1,*) + do i = 1, 10 + do j = 1, 10 + end do + end do + + !$acc kernels loop tile (*,*) + do i = 1, 10 + do j = 1, 10 + end do + end do + + !$acc kernels loop tile (1,2,3) + do i = 1, 10 + do j = 1, 10 + do k = 1, 10 + end do + end do + end do + + !$acc kernels loop tile (*,2,3) + do i = 1, 10 + do j = 1, 10 + do k = 1, 10 + end do + end do + end do + + !$acc kernels loop tile (1,*,3) + do i = 1, 10 + do j = 1, 10 + do k = 1, 10 + end do + end do + end do + + !$acc kernels loop tile (1,2,*) + do i = 1, 10 + do j = 1, 10 + do k = 1, 10 + end do + end do + end do +end subroutine kernloop + + +! { dg-final { scan-tree-dump-times "tile\\(1\\)" 4 "original" } } +! { dg-final { scan-tree-dump-times "tile\\(0\\)" 4 "original" } } +! { dg-final { scan-tree-dump-times "tile\\(1, 2\\)" 4 "original" } } +! { dg-final { scan-tree-dump-times "tile\\(0, 2\\)" 4 "original" } } +! { dg-final { scan-tree-dump-times "tile\\(1, 0\\)" 4 "original" } } +! { dg-final { scan-tree-dump-times "tile\\(0, 0\\)" 4 "original" } } +! { dg-final { scan-tree-dump-times "tile\\(1, 2, 3\\)" 4 "original" } } +! { dg-final { scan-tree-dump-times "tile\\(0, 2, 3\\)" 4 "original" } } +! { dg-final { scan-tree-dump-times "tile\\(1, 0, 3\\)" 4 "original" } } +! { dg-final { scan-tree-dump-times "tile\\(1, 2, 0\\)" 4 "original" } } +! { dg-final { scan-tree-dump-times "for \\(" 88 "original" } } +! { dg-final { scan-tree-dump-times "while \\(" 0 "original" } } diff --git a/gcc/tree-nested.c b/gcc/tree-nested.c index c005e7c735d..4a25025ef8f 100644 --- a/gcc/tree-nested.c +++ b/gcc/tree-nested.c @@ -1274,6 +1274,7 @@ convert_nonlocal_omp_clauses (tree *pclauses, struct walk_stmt_info *wi) case OMP_CLAUSE_DEFAULT: case OMP_CLAUSE_COPYIN: case OMP_CLAUSE_COLLAPSE: + case OMP_CLAUSE_TILE: case OMP_CLAUSE_UNTIED: case OMP_CLAUSE_MERGEABLE: case OMP_CLAUSE_PROC_BIND: @@ -1286,8 +1287,6 @@ convert_nonlocal_omp_clauses (tree *pclauses, struct walk_stmt_info *wi) case OMP_CLAUSE_AUTO: break; - /* OpenACC tile clauses are discarded during gimplification. */ - case OMP_CLAUSE_TILE: /* The following clause belongs to the OpenACC cache directive, which is discarded during gimplification. */ case OMP_CLAUSE__CACHE_: @@ -1982,6 +1981,7 @@ convert_local_omp_clauses (tree *pclauses, struct walk_stmt_info *wi) case OMP_CLAUSE_DEFAULT: case OMP_CLAUSE_COPYIN: case OMP_CLAUSE_COLLAPSE: + case OMP_CLAUSE_TILE: case OMP_CLAUSE_UNTIED: case OMP_CLAUSE_MERGEABLE: case OMP_CLAUSE_PROC_BIND: @@ -1994,8 +1994,6 @@ convert_local_omp_clauses (tree *pclauses, struct walk_stmt_info *wi) case OMP_CLAUSE_AUTO: break; - /* OpenACC tile clauses are discarded during gimplification. */ - case OMP_CLAUSE_TILE: /* The following clause belongs to the OpenACC cache directive, which is discarded during gimplification. */ case OMP_CLAUSE__CACHE_: diff --git a/gcc/tree.c b/gcc/tree.c index 804ab5ed58a..3e63415e673 100644 --- a/gcc/tree.c +++ b/gcc/tree.c @@ -328,7 +328,7 @@ unsigned const char omp_clause_num_ops[] = 1, /* OMP_CLAUSE_NUM_GANGS */ 1, /* OMP_CLAUSE_NUM_WORKERS */ 1, /* OMP_CLAUSE_VECTOR_LENGTH */ - 1, /* OMP_CLAUSE_TILE */ + 3, /* OMP_CLAUSE_TILE */ 2, /* OMP_CLAUSE__GRIDDIM_ */ }; diff --git a/gcc/tree.h b/gcc/tree.h index f63a678216e..3b12509e7df 100644 --- a/gcc/tree.h +++ b/gcc/tree.h @@ -1654,6 +1654,10 @@ extern void protected_set_expr_location (tree, location_t); #define OMP_CLAUSE_TILE_LIST(NODE) \ OMP_CLAUSE_OPERAND (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_TILE), 0) +#define OMP_CLAUSE_TILE_ITERVAR(NODE) \ + OMP_CLAUSE_OPERAND (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_TILE), 1) +#define OMP_CLAUSE_TILE_COUNT(NODE) \ + OMP_CLAUSE_OPERAND (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_TILE), 2) #define OMP_CLAUSE__GRIDDIM__DIMENSION(NODE) \ (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE__GRIDDIM_)\ diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog index 72dc8dacf28..132f9d732e9 100644 --- a/libgomp/ChangeLog +++ b/libgomp/ChangeLog @@ -1,3 +1,14 @@ +2017-02-09 Nathan Sidwell + Chung-Lin Tang + + * testsuite/libgomp.oacc-c-c++-common/tile-1.c: New. + * testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Adjust and + add additional case. + * testsuite/libgomp.oacc-c-c++-common/vprop.c: XFAIL under + "openacc_nvidia_accel_selected". + * libgomp.oacc-fortran/nested-function-1.f90 (test2): + Add num_workers(8) clause. + 2017-02-08 John David Anglin * testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c: Skip on diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c index 3ca9388d405..863b6b38c34 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c @@ -112,7 +112,7 @@ int vector_1 (int *ary, int size) ary[ix] = place (); } - return check (ary, size, 0, 0, 1); + return check (ary, size, 0, 1, 1); } int vector_2 (int *ary, int size) @@ -196,10 +196,24 @@ int gang_3 (int *ary, int size) ary[ix + jx * 64] = place (); } + return check (ary, size, 1, 1, 1); +} + +int gang_4 (int *ary, int size) +{ + clear (ary, size); + +#pragma acc parallel vector_length(32) copy(ary[0:size]) firstprivate (size) + { +#pragma acc loop auto + for (int jx = 0; jx < size; jx++) + ary[jx] = place (); + } + return check (ary, size, 1, 0, 1); } -#define N (32*32*32) +#define N (32*32*32*2) int main () { int ondev = 0; @@ -227,6 +241,8 @@ int main () return 1; if (gang_3 (ary, N)) return 1; + if (gang_4 (ary, N)) + return 1; return 0; } diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/tile-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/tile-1.c new file mode 100644 index 00000000000..8dcb956c59f --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/tile-1.c @@ -0,0 +1,281 @@ +/* This code uses nvptx inline assembly guarded with acc_on_device, which is + not optimized away at -O0, and then confuses the target assembler. + { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */ + +/* { dg-additional-options "-fopenacc-dim=32" } */ + +#include +#include + +static int check (const int *ary, int size, int gp, int wp, int vp) +{ + int exit = 0; + int ix; + int gangs[32], workers[32], vectors[32]; + + for (ix = 0; ix < 32; ix++) + gangs[ix] = workers[ix] = vectors[ix] = 0; + + for (ix = 0; ix < size; ix++) + { + vectors[ary[ix] & 0xff]++; + workers[(ary[ix] >> 8) & 0xff]++; + gangs[(ary[ix] >> 16) & 0xff]++; + } + + for (ix = 0; ix < 32; ix++) + { + if (gp) + { + int expect = gangs[0]; + if (gangs[ix] != expect) + { + exit = 1; + printf ("gang %d not used %d times\n", ix, expect); + } + } + else if (ix && gangs[ix]) + { + exit = 1; + printf ("gang %d unexpectedly used\n", ix); + } + + if (wp) + { + int expect = workers[0]; + if (workers[ix] != expect) + { + exit = 1; + printf ("worker %d not used %d times\n", ix, expect); + } + } + else if (ix && workers[ix]) + { + exit = 1; + printf ("worker %d unexpectedly used\n", ix); + } + + if (vp) + { + int expect = vectors[0]; + if (vectors[ix] != expect) + { + exit = 1; + printf ("vector %d not used %d times\n", ix, expect); + } + } + else if (ix && vectors[ix]) + { + exit = 1; + printf ("vector %d unexpectedly used\n", ix); + } + + } + return exit; +} + +#pragma acc routine seq +static int __attribute__((noinline)) place () +{ + int r = 0; + + if (acc_on_device (acc_device_nvidia)) + { + int g = 0, w = 0, v = 0; + + __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g)); + __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w)); + __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v)); + r = (g << 16) | (w << 8) | v; + } + return r; +} + +static void clear (int *ary, int size) +{ + int ix; + + for (ix = 0; ix < size; ix++) + ary[ix] = -1; +} + +int gang_vector_1 (int *ary, int size) +{ + clear (ary, size); +#pragma acc parallel vector_length(32) num_gangs (32) copy (ary[0:size]) firstprivate (size) + { +#pragma acc loop tile(128) gang vector + for (int jx = 0; jx < size; jx++) + ary[jx] = place (); + } + + return check (ary, size, 1, 0, 1); +} + +int gang_vector_2a (int *ary, int size) +{ + if (size % 256) + return 1; + + clear (ary, size); +#pragma acc parallel vector_length(32) num_gangs (32) copy (ary[0:size]) firstprivate (size) + { +#pragma acc loop tile(64, 64) gang vector + for (int jx = 0; jx < size / 256; jx++) + for (int ix = 0; ix < 256; ix++) + ary[jx * 256 + ix] = place (); + } + + return check (ary, size, 1, 0, 1); +} + +int gang_vector_2b (int *ary, int size) +{ + if (size % 256) + return 1; + + clear (ary, size); +#pragma acc parallel vector_length(32) num_gangs (32) copy (ary[0:size]) firstprivate (size) + { +#pragma acc loop tile(64, 64) gang vector + for (int jx = 0; jx < size; jx += 256) + for (int ix = 0; ix < 256; ix++) + ary[jx + ix] = place (); + } + + return check (ary, size, 1, 0, 1); +} + +int worker_vector_2a (int *ary, int size) +{ + if (size % 256) + return 1; + + clear (ary, size); +#pragma acc parallel vector_length(32) num_workers (32) copy (ary[0:size]) firstprivate (size) + { +#pragma acc loop tile(64, 64) worker vector + for (int jx = 0; jx < size / 256; jx++) + for (int ix = 0; ix < 256; ix++) + ary[jx * 256 + ix] = place (); + } + + return check (ary, size, 0, 1, 1); +} + +int worker_vector_2b (int *ary, int size) +{ + if (size % 256) + return 1; + + clear (ary, size); +#pragma acc parallel vector_length(32) num_workers (32) copy (ary[0:size]) firstprivate (size) + { +#pragma acc loop tile(64, 64) worker vector + for (int jx = 0; jx < size; jx += 256) + for (int ix = 0; ix < 256; ix++) + ary[jx + ix] = place (); + } + + return check (ary, size, 0, 1, 1); +} + +int gang_worker_vector_2a (int *ary, int size) +{ + if (size % 256) + return 1; + clear (ary, size); +#pragma acc parallel vector_length(32) num_workers (32) num_gangs(32) copy (ary[0:size]) firstprivate (size) + { +#pragma acc loop tile(32, 32) + for (int jx = 0; jx < size / 256; jx++) + for (int ix = 0; ix < 256; ix++) + ary[jx * 256 + ix] = place (); + } + + return check (ary, size, 1, 1, 1); +} + +int gang_worker_vector_2b (int *ary, int size) +{ + if (size % 256) + return 1; + clear (ary, size); +#pragma acc parallel vector_length(32) num_workers (32) num_gangs(32) copy (ary[0:size]) firstprivate (size) + { +#pragma acc loop tile(32, 32) + for (int jx = 0; jx < size; jx += 256) + for (int ix = 0; ix < 256; ix++) + ary[jx + ix] = place (); + } + + return check (ary, size, 1, 1, 1); +} + +int gang_worker_vector_star_2a (int *ary, int size) +{ + if (size % 256) + return 1; + + clear (ary, size); +#pragma acc parallel vector_length(32) num_workers (32) num_gangs(32) copy (ary[0:size]) firstprivate (size) + { +#pragma acc loop tile(*, *) + for (int jx = 0; jx < size / 256; jx++) + for (int ix = 0; ix < 256; ix++) + ary[jx * 256 + ix] = place (); + } + + return check (ary, size, 1, 1, 1); +} + +int gang_worker_vector_star_2b (int *ary, int size) +{ + if (size % 256) + return 1; + + clear (ary, size); +#pragma acc parallel vector_length(32) num_workers (32) num_gangs(32) copy (ary[0:size]) firstprivate (size) + { +#pragma acc loop tile(*, *) + for (int jx = 0; jx < size; jx +=256) + for (int ix = 0; ix < 256; ix++) + ary[jx + ix] = place (); + } + + return check (ary, size, 1, 1, 1); +} + +#define N (32*32*32*8) +int main () +{ + int ondev = 0; + +#pragma acc parallel copy(ondev) + { + ondev = acc_on_device (acc_device_not_host); + } + if (!ondev) + return 0; + + int ary[N]; + if (gang_vector_1 (ary, N)) + return 1; + if (gang_vector_2a (ary, N)) + return 1; + if (worker_vector_2a (ary, N)) + return 1; + if (gang_worker_vector_2a (ary, N)) + return 1; + if (gang_worker_vector_star_2a (ary, N)) + return 1; + if (gang_vector_2b (ary, N)) + return 1; + if (worker_vector_2b (ary, N)) + return 1; + if (gang_worker_vector_2b (ary, N)) + return 1; + if (gang_worker_vector_star_2b (ary, N)) + return 1; + return 0; +} diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vprop.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vprop.c index 17b9568a16c..c2bce8286d4 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/vprop.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vprop.c @@ -1,3 +1,6 @@ +/* { dg-do run } */ +/* { dg-xfail-run-if "PR78266" { openacc_nvidia_accel_selected } { "*" } { "" } } */ + #include #define test(type) \ diff --git a/libgomp/testsuite/libgomp.oacc-fortran/nested-function-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/nested-function-1.f90 index fdbca4481f8..c4af1992a05 100644 --- a/libgomp/testsuite/libgomp.oacc-fortran/nested-function-1.f90 +++ b/libgomp/testsuite/libgomp.oacc-fortran/nested-function-1.f90 @@ -33,7 +33,7 @@ firstdo: do i = 1, 3 subroutine test2 integer :: a(3,3,3), k, kk, kkk, l, ll, lll a = 0 - !$acc parallel + !$acc parallel num_workers(8) ! Use "gang(static:1)" here and below to effectively turn gang-redundant ! execution mode into something like gang-single. !$acc loop gang(static:1) collapse(1)