gcc/gcc/config/i386/i386-features.cc

3285 lines
86 KiB
C++
Raw Normal View History

2022-01-03 10:42:10 +01:00
/* Copyright (C) 1988-2022 Free Software Foundation, Inc.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
#define IN_TARGET_CODE 1
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "gimple.h"
#include "cfghooks.h"
#include "cfgloop.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "cgraph.h"
#include "diagnostic.h"
#include "cfgbuild.h"
#include "alias.h"
#include "fold-const.h"
#include "attribs.h"
#include "calls.h"
#include "stor-layout.h"
#include "varasm.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "except.h"
#include "explow.h"
#include "expr.h"
#include "cfgrtl.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "reload.h"
#include "gimplify.h"
#include "dwarf2.h"
#include "tm-constrs.h"
#include "cselib.h"
#include "sched-int.h"
#include "opts.h"
#include "tree-pass.h"
#include "context.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "builtins.h"
#include "rtl-iter.h"
#include "tree-iterator.h"
#include "dbgcnt.h"
#include "case-cfn-macros.h"
#include "dojump.h"
#include "fold-const-call.h"
#include "tree-vrp.h"
#include "tree-ssanames.h"
#include "selftest.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "intl.h"
#include "ifcvt.h"
#include "symbol-summary.h"
#include "ipa-prop.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "debug.h"
#include "dwarf2out.h"
#include "i386-builtins.h"
#include "i386-features.h"
const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
"savms64",
"resms64",
"resms64x",
"savms64f",
"resms64f",
"resms64fx"
};
const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
/* The below offset values are where each register is stored for the layout
relative to incoming stack pointer. The value of each m_regs[].offset will
be relative to the incoming base pointer (rax or rsi) used by the stub.
s_instances: 0 1 2 3
Offset: realigned or aligned + 8
Register aligned aligned + 8 aligned w/HFP w/HFP */
XMM15_REG, /* 0x10 0x18 0x10 0x18 */
XMM14_REG, /* 0x20 0x28 0x20 0x28 */
XMM13_REG, /* 0x30 0x38 0x30 0x38 */
XMM12_REG, /* 0x40 0x48 0x40 0x48 */
XMM11_REG, /* 0x50 0x58 0x50 0x58 */
XMM10_REG, /* 0x60 0x68 0x60 0x68 */
XMM9_REG, /* 0x70 0x78 0x70 0x78 */
XMM8_REG, /* 0x80 0x88 0x80 0x88 */
XMM7_REG, /* 0x90 0x98 0x90 0x98 */
XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
BP_REG, /* 0xc0 0xc8 N/A N/A */
R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
};
/* Instantiate static const values. */
const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
const unsigned xlogue_layout::MIN_REGS;
const unsigned xlogue_layout::MAX_REGS;
const unsigned xlogue_layout::MAX_EXTRA_REGS;
const unsigned xlogue_layout::VARIANT_COUNT;
const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
/* Initialize xlogue_layout::s_stub_names to zero. */
char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
[STUB_NAME_MAX_LEN];
/* Instantiates all xlogue_layout instances. */
const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
xlogue_layout (0, false),
xlogue_layout (8, false),
xlogue_layout (0, true),
xlogue_layout (8, true)
};
/* Return an appropriate const instance of xlogue_layout based upon values
in cfun->machine and crtl. */
PR c++/61339 - add mismatch between struct and class [-Wmismatched-tags] to non-bugs gcc/c/ChangeLog: PR c++/61339 * c-decl.c (xref_tag): Change class-key of PODs to struct and others to class. (field_decl_cmp): Same. * c-parser.c (c_parser_struct_or_union_specifier): Same. * c-tree.h: Same. * gimple-parser.c (c_parser_gimple_compound_statement): Same. gcc/c-family/ChangeLog: PR c++/61339 * c-opts.c (handle_deferred_opts): : Change class-key of PODs to struct and others to class. * c-pretty-print.h: Same. gcc/cp/ChangeLog: PR c++/61339 * cp-tree.h: Change class-key of PODs to struct and others to class. * search.c: Same. * semantics.c (finalize_nrv_r): Same. gcc/lto/ChangeLog: PR c++/61339 * lto-common.c (lto_splay_tree_new): : Change class-key of PODs to struct and others to class. (mentions_vars_p): Same. (register_resolution): Same. (lto_register_var_decl_in_symtab): Same. (lto_register_function_decl_in_symtab): Same. (cmp_tree): Same. (lto_read_decls): Same. gcc/ChangeLog: PR c++/61339 * auto-profile.c: Change class-key of PODs to struct and others to class. * basic-block.h: Same. * bitmap.c (bitmap_alloc): Same. * bitmap.h: Same. * builtins.c (expand_builtin_prefetch): Same. (expand_builtin_interclass_mathfn): Same. (expand_builtin_strlen): Same. (expand_builtin_mempcpy_args): Same. (expand_cmpstr): Same. (expand_builtin___clear_cache): Same. (expand_ifn_atomic_bit_test_and): Same. (expand_builtin_thread_pointer): Same. (expand_builtin_set_thread_pointer): Same. * caller-save.c (setup_save_areas): Same. (replace_reg_with_saved_mem): Same. (insert_restore): Same. (insert_save): Same. (add_used_regs): Same. * cfg.c (get_bb_copy): Same. (set_loop_copy): Same. * cfg.h: Same. * cfganal.h: Same. * cfgexpand.c (alloc_stack_frame_space): Same. (add_stack_var): Same. (add_stack_var_conflict): Same. (add_scope_conflicts_1): Same. (update_alias_info_with_stack_vars): Same. (expand_used_vars): Same. * cfghooks.c (redirect_edge_and_branch_force): Same. (delete_basic_block): Same. (split_edge): Same. (make_forwarder_block): Same. (force_nonfallthru): Same. (duplicate_block): Same. (lv_flush_pending_stmts): Same. * cfghooks.h: Same. * cfgloop.c (flow_loops_cfg_dump): Same. (flow_loop_nested_p): Same. (superloop_at_depth): Same. (get_loop_latch_edges): Same. (flow_loop_dump): Same. (flow_loops_dump): Same. (flow_loops_free): Same. (flow_loop_nodes_find): Same. (establish_preds): Same. (flow_loop_tree_node_add): Same. (flow_loop_tree_node_remove): Same. (flow_loops_find): Same. (find_subloop_latch_edge_by_profile): Same. (find_subloop_latch_edge_by_ivs): Same. (mfb_redirect_edges_in_set): Same. (form_subloop): Same. (merge_latch_edges): Same. (disambiguate_multiple_latches): Same. (disambiguate_loops_with_multiple_latches): Same. (flow_bb_inside_loop_p): Same. (glb_enum_p): Same. (get_loop_body_with_size): Same. (get_loop_body): Same. (fill_sons_in_loop): Same. (get_loop_body_in_dom_order): Same. (get_loop_body_in_custom_order): Same. (release_recorded_exits): Same. (get_loop_exit_edges): Same. (num_loop_branches): Same. (remove_bb_from_loops): Same. (find_common_loop): Same. (delete_loop): Same. (cancel_loop): Same. (verify_loop_structure): Same. (loop_preheader_edge): Same. (loop_exit_edge_p): Same. (single_exit): Same. (loop_exits_to_bb_p): Same. (loop_exits_from_bb_p): Same. (get_loop_location): Same. (record_niter_bound): Same. (get_estimated_loop_iterations_int): Same. (max_stmt_executions_int): Same. (likely_max_stmt_executions_int): Same. (get_estimated_loop_iterations): Same. (get_max_loop_iterations): Same. (get_max_loop_iterations_int): Same. (get_likely_max_loop_iterations): Same. * cfgloop.h (simple_loop_desc): Same. (get_loop): Same. (loop_depth): Same. (loop_outer): Same. (loop_iterator::next): Same. (loop_outermost): Same. * cfgloopanal.c (mark_irreducible_loops): Same. (num_loop_insns): Same. (average_num_loop_insns): Same. (expected_loop_iterations_unbounded): Same. (expected_loop_iterations): Same. (mark_loop_exit_edges): Same. (single_likely_exit): Same. * cfgloopmanip.c (fix_bb_placement): Same. (fix_bb_placements): Same. (remove_path): Same. (place_new_loop): Same. (add_loop): Same. (scale_loop_frequencies): Same. (scale_loop_profile): Same. (create_empty_if_region_on_edge): Same. (create_empty_loop_on_edge): Same. (loopify): Same. (unloop): Same. (fix_loop_placements): Same. (copy_loop_info): Same. (duplicate_loop): Same. (duplicate_subloops): Same. (loop_redirect_edge): Same. (can_duplicate_loop_p): Same. (duplicate_loop_to_header_edge): Same. (mfb_keep_just): Same. (has_preds_from_loop): Same. (create_preheader): Same. (create_preheaders): Same. (lv_adjust_loop_entry_edge): Same. (loop_version): Same. * cfgloopmanip.h: Same. * cgraph.h: Same. * cgraphbuild.c: Same. * combine.c (make_extraction): Same. * config/i386/i386-features.c: Same. * config/i386/i386-features.h: Same. * config/i386/i386.c (ix86_emit_outlined_ms2sysv_save): Same. (ix86_emit_outlined_ms2sysv_restore): Same. (ix86_noce_conversion_profitable_p): Same. (ix86_init_cost): Same. (ix86_simd_clone_usable): Same. * configure.ac: Same. * coretypes.h: Same. * data-streamer-in.c (string_for_index): Same. (streamer_read_indexed_string): Same. (streamer_read_string): Same. (bp_unpack_indexed_string): Same. (bp_unpack_string): Same. (streamer_read_uhwi): Same. (streamer_read_hwi): Same. (streamer_read_gcov_count): Same. (streamer_read_wide_int): Same. * data-streamer.h (streamer_write_bitpack): Same. (bp_unpack_value): Same. (streamer_write_char_stream): Same. (streamer_write_hwi_in_range): Same. (streamer_write_record_start): Same. * ddg.c (create_ddg_dep_from_intra_loop_link): Same. (add_cross_iteration_register_deps): Same. (build_intra_loop_deps): Same. * df-core.c (df_analyze): Same. (loop_post_order_compute): Same. (loop_inverted_post_order_compute): Same. * df-problems.c (df_rd_alloc): Same. (df_rd_simulate_one_insn): Same. (df_rd_local_compute): Same. (df_rd_init_solution): Same. (df_rd_confluence_n): Same. (df_rd_transfer_function): Same. (df_rd_free): Same. (df_rd_dump_defs_set): Same. (df_rd_top_dump): Same. (df_lr_alloc): Same. (df_lr_reset): Same. (df_lr_local_compute): Same. (df_lr_init): Same. (df_lr_confluence_n): Same. (df_lr_free): Same. (df_lr_top_dump): Same. (df_lr_verify_transfer_functions): Same. (df_live_alloc): Same. (df_live_reset): Same. (df_live_init): Same. (df_live_confluence_n): Same. (df_live_finalize): Same. (df_live_free): Same. (df_live_top_dump): Same. (df_live_verify_transfer_functions): Same. (df_mir_alloc): Same. (df_mir_reset): Same. (df_mir_init): Same. (df_mir_confluence_n): Same. (df_mir_free): Same. (df_mir_top_dump): Same. (df_word_lr_alloc): Same. (df_word_lr_reset): Same. (df_word_lr_init): Same. (df_word_lr_confluence_n): Same. (df_word_lr_free): Same. (df_word_lr_top_dump): Same. (df_md_alloc): Same. (df_md_simulate_one_insn): Same. (df_md_reset): Same. (df_md_init): Same. (df_md_free): Same. (df_md_top_dump): Same. * df-scan.c (df_insn_delete): Same. (df_insn_rescan): Same. (df_notes_rescan): Same. (df_sort_and_compress_mws): Same. (df_install_mws): Same. (df_refs_add_to_chains): Same. (df_ref_create_structure): Same. (df_ref_record): Same. (df_def_record_1): Same. (df_find_hard_reg_defs): Same. (df_uses_record): Same. (df_get_conditional_uses): Same. (df_get_call_refs): Same. (df_recompute_luids): Same. (df_get_entry_block_def_set): Same. (df_entry_block_defs_collect): Same. (df_get_exit_block_use_set): Same. (df_exit_block_uses_collect): Same. (df_mws_verify): Same. (df_bb_verify): Same. * df.h (df_scan_get_bb_info): Same. * doc/tm.texi: Same. * dse.c (record_store): Same. * dumpfile.h: Same. * emit-rtl.c (const_fixed_hasher::equal): Same. (set_mem_attributes_minus_bitpos): Same. (change_address): Same. (adjust_address_1): Same. (offset_address): Same. * emit-rtl.h: Same. * except.c (dw2_build_landing_pads): Same. (sjlj_emit_dispatch_table): Same. * explow.c (allocate_dynamic_stack_space): Same. (emit_stack_probe): Same. (probe_stack_range): Same. * expmed.c (store_bit_field_using_insv): Same. (store_bit_field_1): Same. (store_integral_bit_field): Same. (extract_bit_field_using_extv): Same. (extract_bit_field_1): Same. (emit_cstore): Same. * expr.c (emit_block_move_via_cpymem): Same. (expand_cmpstrn_or_cmpmem): Same. (set_storage_via_setmem): Same. (emit_single_push_insn_1): Same. (expand_assignment): Same. (store_constructor): Same. (expand_expr_real_2): Same. (expand_expr_real_1): Same. (try_casesi): Same. * flags.h: Same. * function.c (try_fit_stack_local): Same. (assign_stack_local_1): Same. (assign_stack_local): Same. (cut_slot_from_list): Same. (insert_slot_to_list): Same. (max_slot_level): Same. (move_slot_to_level): Same. (temp_address_hasher::equal): Same. (remove_unused_temp_slot_addresses): Same. (assign_temp): Same. (combine_temp_slots): Same. (update_temp_slot_address): Same. (preserve_temp_slots): Same. * function.h: Same. * fwprop.c: Same. * gcc-rich-location.h: Same. * gcov.c: Same. * genattrtab.c (check_attr_test): Same. (check_attr_value): Same. (convert_set_attr_alternative): Same. (convert_set_attr): Same. (check_defs): Same. (copy_boolean): Same. (get_attr_value): Same. (expand_delays): Same. (make_length_attrs): Same. (min_fn): Same. (make_alternative_compare): Same. (simplify_test_exp): Same. (tests_attr_p): Same. (get_attr_order): Same. (clear_struct_flag): Same. (gen_attr): Same. (compares_alternatives_p): Same. (gen_insn): Same. (gen_delay): Same. (find_attrs_to_cache): Same. (write_test_expr): Same. (walk_attr_value): Same. (write_attr_get): Same. (eliminate_known_true): Same. (write_insn_cases): Same. (write_attr_case): Same. (write_attr_valueq): Same. (write_attr_value): Same. (write_dummy_eligible_delay): Same. (next_comma_elt): Same. (find_attr): Same. (make_internal_attr): Same. (copy_rtx_unchanging): Same. (gen_insn_reserv): Same. (check_tune_attr): Same. (make_automaton_attrs): Same. (handle_arg): Same. * genextract.c (gen_insn): Same. (VEC_char_to_string): Same. * genmatch.c (print_operand): Same. (lower): Same. (parser::parse_operation): Same. (parser::parse_capture): Same. (parser::parse_c_expr): Same. (parser::parse_simplify): Same. (main): Same. * genoutput.c (output_operand_data): Same. (output_get_insn_name): Same. (compare_operands): Same. (place_operands): Same. (process_template): Same. (validate_insn_alternatives): Same. (validate_insn_operands): Same. (gen_expand): Same. (note_constraint): Same. * genpreds.c (write_one_predicate_function): Same. (add_constraint): Same. (process_define_register_constraint): Same. (write_lookup_constraint_1): Same. (write_lookup_constraint_array): Same. (write_insn_constraint_len): Same. (write_reg_class_for_constraint_1): Same. (write_constraint_satisfied_p_array): Same. * genrecog.c (optimize_subroutine_group): Same. * gensupport.c (process_define_predicate): Same. (queue_pattern): Same. (remove_from_queue): Same. (process_rtx): Same. (is_predicable): Same. (change_subst_attribute): Same. (subst_pattern_match): Same. (alter_constraints): Same. (alter_attrs_for_insn): Same. (shift_output_template): Same. (alter_output_for_subst_insn): Same. (process_one_cond_exec): Same. (subst_dup): Same. (process_define_cond_exec): Same. (mnemonic_htab_callback): Same. (gen_mnemonic_attr): Same. (read_md_rtx): Same. * ggc-page.c: Same. * gimple-loop-interchange.cc (dump_reduction): Same. (dump_induction): Same. (loop_cand::~loop_cand): Same. (free_data_refs_with_aux): Same. (tree_loop_interchange::interchange_loops): Same. (tree_loop_interchange::map_inductions_to_loop): Same. (tree_loop_interchange::move_code_to_inner_loop): Same. (compute_access_stride): Same. (compute_access_strides): Same. (proper_loop_form_for_interchange): Same. (tree_loop_interchange_compute_ddrs): Same. (prune_datarefs_not_in_loop): Same. (prepare_data_references): Same. (pass_linterchange::execute): Same. * gimple-loop-jam.c (bb_prevents_fusion_p): Same. (unroll_jam_possible_p): Same. (fuse_loops): Same. (adjust_unroll_factor): Same. (tree_loop_unroll_and_jam): Same. * gimple-loop-versioning.cc (loop_versioning::~loop_versioning): Same. (loop_versioning::expensive_stmt_p): Same. (loop_versioning::version_for_unity): Same. (loop_versioning::dump_inner_likelihood): Same. (loop_versioning::find_per_loop_multiplication): Same. (loop_versioning::analyze_term_using_scevs): Same. (loop_versioning::record_address_fragment): Same. (loop_versioning::analyze_expr): Same. (loop_versioning::analyze_blocks): Same. (loop_versioning::prune_conditions): Same. (loop_versioning::merge_loop_info): Same. (loop_versioning::add_loop_to_queue): Same. (loop_versioning::decide_whether_loop_is_versionable): Same. (loop_versioning::make_versioning_decisions): Same. (loop_versioning::implement_versioning_decisions): Same. * gimple-ssa-evrp-analyze.c (evrp_range_analyzer::record_ranges_from_phis): Same. * gimple-ssa-store-merging.c (split_store::split_store): Same. (count_multiple_uses): Same. (split_group): Same. (imm_store_chain_info::output_merged_store): Same. (pass_store_merging::process_store): Same. * gimple-ssa-strength-reduction.c (slsr_process_phi): Same. * gimple-ssa-warn-alloca.c (adjusted_warn_limit): Same. (is_max): Same. (alloca_call_type): Same. (pass_walloca::execute): Same. * gimple-streamer-in.c (input_phi): Same. (input_gimple_stmt): Same. * gimple-streamer.h: Same. * godump.c (go_force_record_alignment): Same. (go_format_type): Same. (go_output_type): Same. (go_output_fndecl): Same. (go_output_typedef): Same. (keyword_hash_init): Same. (find_dummy_types): Same. * graph.c (draw_cfg_nodes_no_loops): Same. (draw_cfg_nodes_for_loop): Same. * hard-reg-set.h (hard_reg_set_iter_next): Same. * hsa-brig.c: Same. * hsa-common.h (hsa_internal_fn_hasher::equal): Same. * hsa-dump.c (dump_hsa_cfun): Same. * hsa-gen.c (gen_function_def_parameters): Same. * hsa-regalloc.c (dump_hsa_cfun_regalloc): Same. * input.c (dump_line_table_statistics): Same. (test_lexer): Same. * input.h: Same. * internal-fn.c (get_multi_vector_move): Same. (expand_load_lanes_optab_fn): Same. (expand_GOMP_SIMT_ENTER_ALLOC): Same. (expand_GOMP_SIMT_EXIT): Same. (expand_GOMP_SIMT_LAST_LANE): Same. (expand_GOMP_SIMT_ORDERED_PRED): Same. (expand_GOMP_SIMT_VOTE_ANY): Same. (expand_GOMP_SIMT_XCHG_BFLY): Same. (expand_GOMP_SIMT_XCHG_IDX): Same. (expand_addsub_overflow): Same. (expand_neg_overflow): Same. (expand_mul_overflow): Same. (expand_call_mem_ref): Same. (expand_mask_load_optab_fn): Same. (expand_scatter_store_optab_fn): Same. (expand_gather_load_optab_fn): Same. * ipa-cp.c (ipa_get_parm_lattices): Same. (print_all_lattices): Same. (ignore_edge_p): Same. (build_toporder_info): Same. (free_toporder_info): Same. (push_node_to_stack): Same. (ipcp_lattice<valtype>::set_contains_variable): Same. (set_agg_lats_to_bottom): Same. (ipcp_bits_lattice::meet_with): Same. (set_single_call_flag): Same. (initialize_node_lattices): Same. (ipa_get_jf_ancestor_result): Same. (ipcp_verify_propagated_values): Same. (propagate_scalar_across_jump_function): Same. (propagate_context_across_jump_function): Same. (propagate_bits_across_jump_function): Same. (ipa_vr_operation_and_type_effects): Same. (propagate_vr_across_jump_function): Same. (set_check_aggs_by_ref): Same. (set_chain_of_aglats_contains_variable): Same. (merge_aggregate_lattices): Same. (agg_pass_through_permissible_p): Same. (propagate_aggs_across_jump_function): Same. (call_passes_through_thunk_p): Same. (propagate_constants_across_call): Same. (devirtualization_time_bonus): Same. (good_cloning_opportunity_p): Same. (context_independent_aggregate_values): Same. (gather_context_independent_values): Same. (perform_estimation_of_a_value): Same. (estimate_local_effects): Same. (value_topo_info<valtype>::add_val): Same. (add_all_node_vals_to_toposort): Same. (value_topo_info<valtype>::propagate_effects): Same. (ipcp_propagate_stage): Same. (ipcp_discover_new_direct_edges): Same. (same_node_or_its_all_contexts_clone_p): Same. (cgraph_edge_brings_value_p): Same. (gather_edges_for_value): Same. (create_specialized_node): Same. (find_more_scalar_values_for_callers_subset): Same. (find_more_contexts_for_caller_subset): Same. (copy_plats_to_inter): Same. (intersect_aggregates_with_edge): Same. (find_aggregate_values_for_callers_subset): Same. (cgraph_edge_brings_all_agg_vals_for_node): Same. (decide_about_value): Same. (decide_whether_version_node): Same. (spread_undeadness): Same. (identify_dead_nodes): Same. (ipcp_store_vr_results): Same. * ipa-devirt.c (final_warning_record::grow_type_warnings): Same. * ipa-fnsummary.c (ipa_fn_summary::account_size_time): Same. (redirect_to_unreachable): Same. (edge_set_predicate): Same. (evaluate_conditions_for_known_args): Same. (evaluate_properties_for_edge): Same. (ipa_fn_summary_t::duplicate): Same. (ipa_call_summary_t::duplicate): Same. (dump_ipa_call_summary): Same. (ipa_dump_fn_summary): Same. (eliminated_by_inlining_prob): Same. (set_cond_stmt_execution_predicate): Same. (set_switch_stmt_execution_predicate): Same. (compute_bb_predicates): Same. (will_be_nonconstant_expr_predicate): Same. (phi_result_unknown_predicate): Same. (analyze_function_body): Same. (compute_fn_summary): Same. (estimate_edge_devirt_benefit): Same. (estimate_edge_size_and_time): Same. (estimate_calls_size_and_time): Same. (estimate_node_size_and_time): Same. (remap_edge_change_prob): Same. (remap_edge_summaries): Same. (ipa_merge_fn_summary_after_inlining): Same. (ipa_fn_summary_generate): Same. (inline_read_section): Same. (ipa_fn_summary_read): Same. (ipa_fn_summary_write): Same. * ipa-fnsummary.h: Same. * ipa-hsa.c (ipa_hsa_read_section): Same. * ipa-icf-gimple.c (func_checker::compare_loops): Same. * ipa-icf.c (sem_function::param_used_p): Same. * ipa-inline-analysis.c (do_estimate_edge_time): Same. * ipa-inline.c (edge_badness): Same. (inline_small_functions): Same. * ipa-polymorphic-call.c (ipa_polymorphic_call_context::stream_out): Same. * ipa-predicate.c (predicate::remap_after_duplication): Same. (predicate::remap_after_inlining): Same. (predicate::stream_out): Same. * ipa-predicate.h: Same. * ipa-profile.c (ipa_profile_read_summary): Same. * ipa-prop.c (ipa_get_param_decl_index_1): Same. (count_formal_params): Same. (ipa_dump_param): Same. (ipa_alloc_node_params): Same. (ipa_print_node_jump_functions_for_edge): Same. (ipa_print_node_jump_functions): Same. (ipa_load_from_parm_agg): Same. (get_ancestor_addr_info): Same. (ipa_compute_jump_functions_for_edge): Same. (ipa_analyze_virtual_call_uses): Same. (ipa_analyze_stmt_uses): Same. (ipa_analyze_params_uses_in_bb): Same. (update_jump_functions_after_inlining): Same. (try_decrement_rdesc_refcount): Same. (ipa_impossible_devirt_target): Same. (update_indirect_edges_after_inlining): Same. (combine_controlled_uses_counters): Same. (ipa_edge_args_sum_t::duplicate): Same. (ipa_write_jump_function): Same. (ipa_write_indirect_edge_info): Same. (ipa_write_node_info): Same. (ipa_read_edge_info): Same. (ipa_prop_read_section): Same. (read_replacements_section): Same. * ipa-prop.h (ipa_get_param_count): Same. (ipa_get_param): Same. (ipa_get_type): Same. (ipa_get_param_move_cost): Same. (ipa_set_param_used): Same. (ipa_get_controlled_uses): Same. (ipa_set_controlled_uses): Same. (ipa_get_cs_argument_count): Same. * ipa-pure-const.c (analyze_function): Same. (pure_const_read_summary): Same. * ipa-ref.h: Same. * ipa-reference.c (ipa_reference_read_optimization_summary): Same. * ipa-split.c (test_nonssa_use): Same. (dump_split_point): Same. (dominated_by_forbidden): Same. (split_part_set_ssa_name_p): Same. (find_split_points): Same. * ira-build.c (finish_loop_tree_nodes): Same. (low_pressure_loop_node_p): Same. * ira-color.c (ira_reuse_stack_slot): Same. * ira-int.h: Same. * ira.c (setup_reg_equiv): Same. (print_insn_chain): Same. (ira): Same. * loop-doloop.c (doloop_condition_get): Same. (add_test): Same. (record_reg_sets): Same. (doloop_optimize): Same. * loop-init.c (loop_optimizer_init): Same. (fix_loop_structure): Same. * loop-invariant.c (merge_identical_invariants): Same. (compute_always_reached): Same. (find_exits): Same. (may_assign_reg_p): Same. (find_invariants_bb): Same. (find_invariants_body): Same. (replace_uses): Same. (can_move_invariant_reg): Same. (free_inv_motion_data): Same. (move_single_loop_invariants): Same. (change_pressure): Same. (mark_ref_regs): Same. (calculate_loop_reg_pressure): Same. * loop-iv.c (biv_entry_hasher::equal): Same. (iv_extend_to_rtx_code): Same. (check_iv_ref_table_size): Same. (clear_iv_info): Same. (latch_dominating_def): Same. (iv_get_reaching_def): Same. (iv_constant): Same. (iv_subreg): Same. (iv_extend): Same. (iv_neg): Same. (iv_add): Same. (iv_mult): Same. (get_biv_step): Same. (record_iv): Same. (analyzed_for_bivness_p): Same. (record_biv): Same. (iv_analyze_biv): Same. (iv_analyze_expr): Same. (iv_analyze_def): Same. (iv_analyze_op): Same. (iv_analyze): Same. (iv_analyze_result): Same. (biv_p): Same. (eliminate_implied_conditions): Same. (simplify_using_initial_values): Same. (shorten_into_mode): Same. (canonicalize_iv_subregs): Same. (determine_max_iter): Same. (check_simple_exit): Same. (find_simple_exit): Same. (get_simple_loop_desc): Same. * loop-unroll.c (report_unroll): Same. (decide_unrolling): Same. (unroll_loops): Same. (loop_exit_at_end_p): Same. (decide_unroll_constant_iterations): Same. (unroll_loop_constant_iterations): Same. (compare_and_jump_seq): Same. (unroll_loop_runtime_iterations): Same. (decide_unroll_stupid): Same. (unroll_loop_stupid): Same. (referenced_in_one_insn_in_loop_p): Same. (reset_debug_uses_in_loop): Same. (analyze_iv_to_split_insn): Same. * lra-eliminations.c (lra_debug_elim_table): Same. (setup_can_eliminate): Same. (form_sum): Same. (lra_get_elimination_hard_regno): Same. (lra_eliminate_regs_1): Same. (eliminate_regs_in_insn): Same. (update_reg_eliminate): Same. (init_elimination): Same. (lra_eliminate): Same. * lra-int.h: Same. * lra-lives.c (initiate_live_solver): Same. * lra-remat.c (create_remat_bb_data): Same. * lra-spills.c (lra_spill): Same. * lra.c (lra_set_insn_recog_data): Same. (lra_set_used_insn_alternative_by_uid): Same. (init_reg_info): Same. (expand_reg_info): Same. * lto-cgraph.c (output_symtab): Same. (read_identifier): Same. (get_alias_symbol): Same. (input_node): Same. (input_varpool_node): Same. (input_ref): Same. (input_edge): Same. (input_cgraph_1): Same. (input_refs): Same. (input_symtab): Same. (input_offload_tables): Same. (output_cgraph_opt_summary): Same. (input_edge_opt_summary): Same. (input_cgraph_opt_section): Same. * lto-section-in.c (lto_free_raw_section_data): Same. (lto_create_simple_input_block): Same. (lto_free_function_in_decl_state_for_node): Same. * lto-streamer-in.c (lto_tag_check_set): Same. (lto_location_cache::revert_location_cache): Same. (lto_location_cache::input_location): Same. (lto_input_location): Same. (stream_input_location_now): Same. (lto_input_tree_ref): Same. (lto_input_eh_catch_list): Same. (input_eh_region): Same. (lto_init_eh): Same. (make_new_block): Same. (input_cfg): Same. (fixup_call_stmt_edges): Same. (input_struct_function_base): Same. (input_function): Same. (lto_read_body_or_constructor): Same. (lto_read_tree_1): Same. (lto_read_tree): Same. (lto_input_scc): Same. (lto_input_tree_1): Same. (lto_input_toplevel_asms): Same. (lto_input_mode_table): Same. (lto_reader_init): Same. (lto_data_in_create): Same. * lto-streamer-out.c (output_cfg): Same. * lto-streamer.h: Same. * modulo-sched.c (duplicate_insns_of_cycles): Same. (generate_prolog_epilog): Same. (mark_loop_unsched): Same. (dump_insn_location): Same. (loop_canon_p): Same. (sms_schedule): Same. * omp-expand.c (expand_omp_for_ordered_loops): Same. (expand_omp_for_generic): Same. (expand_omp_for_static_nochunk): Same. (expand_omp_for_static_chunk): Same. (expand_omp_simd): Same. (expand_omp_taskloop_for_inner): Same. (expand_oacc_for): Same. (expand_omp_atomic_pipeline): Same. (mark_loops_in_oacc_kernels_region): Same. * omp-offload.c (oacc_xform_loop): Same. * omp-simd-clone.c (simd_clone_adjust): Same. * optabs-query.c (get_traditional_extraction_insn): Same. * optabs.c (expand_vector_broadcast): Same. (expand_binop_directly): Same. (expand_twoval_unop): Same. (expand_twoval_binop): Same. (expand_unop_direct): Same. (emit_indirect_jump): Same. (emit_conditional_move): Same. (emit_conditional_neg_or_complement): Same. (emit_conditional_add): Same. (vector_compare_rtx): Same. (expand_vec_perm_1): Same. (expand_vec_perm_const): Same. (expand_vec_cond_expr): Same. (expand_vec_series_expr): Same. (maybe_emit_atomic_exchange): Same. (maybe_emit_sync_lock_test_and_set): Same. (expand_atomic_compare_and_swap): Same. (expand_atomic_load): Same. (expand_atomic_store): Same. (maybe_emit_op): Same. (valid_multiword_target_p): Same. (create_integer_operand): Same. (maybe_legitimize_operand_same_code): Same. (maybe_legitimize_operand): Same. (create_convert_operand_from_type): Same. (can_reuse_operands_p): Same. (maybe_legitimize_operands): Same. (maybe_gen_insn): Same. (maybe_expand_insn): Same. (maybe_expand_jump_insn): Same. (expand_insn): Same. * optabs.h (create_expand_operand): Same. (create_fixed_operand): Same. (create_output_operand): Same. (create_input_operand): Same. (create_convert_operand_to): Same. (create_convert_operand_from): Same. * optinfo.h: Same. * poly-int.h: Same. * predict.c (optimize_insn_for_speed_p): Same. (optimize_loop_for_size_p): Same. (optimize_loop_for_speed_p): Same. (optimize_loop_nest_for_speed_p): Same. (get_base_value): Same. (predicted_by_loop_heuristics_p): Same. (predict_extra_loop_exits): Same. (predict_loops): Same. (predict_paths_for_bb): Same. (predict_paths_leading_to): Same. (propagate_freq): Same. (pass_profile::execute): Same. * predict.h: Same. * profile-count.c (profile_count::differs_from_p): Same. (profile_probability::differs_lot_from_p): Same. * profile-count.h: Same. * profile.c (branch_prob): Same. * regrename.c (free_chain_data): Same. (mark_conflict): Same. (create_new_chain): Same. (merge_overlapping_regs): Same. (init_rename_info): Same. (merge_chains): Same. (regrename_analyze): Same. (regrename_do_replace): Same. (scan_rtx_reg): Same. (record_out_operands): Same. (build_def_use): Same. * regrename.h: Same. * reload.h: Same. * reload1.c (init_reload): Same. (maybe_fix_stack_asms): Same. (copy_reloads): Same. (count_pseudo): Same. (count_spilled_pseudo): Same. (find_reg): Same. (find_reload_regs): Same. (select_reload_regs): Same. (spill_hard_reg): Same. (fixup_eh_region_note): Same. (set_reload_reg): Same. (allocate_reload_reg): Same. (compute_reload_subreg_offset): Same. (reload_adjust_reg_for_icode): Same. (emit_input_reload_insns): Same. (emit_output_reload_insns): Same. (do_input_reload): Same. (inherit_piecemeal_p): Same. * rtl.h: Same. * sanopt.c (maybe_get_dominating_check): Same. (maybe_optimize_ubsan_ptr_ifn): Same. (can_remove_asan_check): Same. (maybe_optimize_asan_check_ifn): Same. (sanopt_optimize_walker): Same. * sched-deps.c (add_dependence_list): Same. (chain_to_prev_insn): Same. (add_insn_mem_dependence): Same. (create_insn_reg_set): Same. (maybe_extend_reg_info_p): Same. (sched_analyze_reg): Same. (sched_analyze_1): Same. (get_implicit_reg_pending_clobbers): Same. (chain_to_prev_insn_p): Same. (deps_analyze_insn): Same. (deps_start_bb): Same. (sched_free_deps): Same. (init_deps): Same. (init_deps_reg_last): Same. (free_deps): Same. * sched-ebb.c: Same. * sched-int.h: Same. * sched-rgn.c (add_branch_dependences): Same. (concat_insn_mem_list): Same. (deps_join): Same. (sched_rgn_compute_dependencies): Same. * sel-sched-ir.c (reset_target_context): Same. (copy_deps_context): Same. (init_id_from_df): Same. (has_dependence_p): Same. (change_loops_latches): Same. (bb_top_order_comparator): Same. (make_region_from_loop_preheader): Same. (sel_init_pipelining): Same. (get_loop_nest_for_rgn): Same. (make_regions_from_the_rest): Same. (sel_is_loop_preheader_p): Same. * sel-sched-ir.h (inner_loop_header_p): Same. (get_all_loop_exits): Same. * selftest.h: Same. * sese.c (sese_build_liveouts): Same. (sese_insert_phis_for_liveouts): Same. * sese.h (defined_in_sese_p): Same. * sreal.c (sreal::stream_out): Same. * sreal.h: Same. * streamer-hooks.h: Same. * target-globals.c (save_target_globals): Same. * target-globals.h: Same. * target.def: Same. * target.h: Same. * targhooks.c (default_has_ifunc_p): Same. (default_empty_mask_is_expensive): Same. (default_init_cost): Same. * targhooks.h: Same. * toplev.c: Same. * tree-affine.c (aff_combination_mult): Same. (aff_combination_expand): Same. (aff_combination_constant_multiple_p): Same. * tree-affine.h: Same. * tree-cfg.c (build_gimple_cfg): Same. (replace_loop_annotate_in_block): Same. (replace_uses_by): Same. (remove_bb): Same. (dump_cfg_stats): Same. (gimple_duplicate_sese_region): Same. (gimple_duplicate_sese_tail): Same. (move_block_to_fn): Same. (replace_block_vars_by_duplicates): Same. (move_sese_region_to_fn): Same. (print_loops_bb): Same. (print_loop): Same. (print_loops): Same. (debug): Same. (debug_loops): Same. * tree-cfg.h: Same. * tree-chrec.c (chrec_fold_plus_poly_poly): Same. (chrec_fold_multiply_poly_poly): Same. (chrec_evaluate): Same. (chrec_component_in_loop_num): Same. (reset_evolution_in_loop): Same. (is_multivariate_chrec): Same. (chrec_contains_symbols): Same. (nb_vars_in_chrec): Same. (chrec_convert_1): Same. (chrec_convert_aggressive): Same. * tree-chrec.h: Same. * tree-core.h: Same. * tree-data-ref.c (dump_data_dependence_relation): Same. (canonicalize_base_object_address): Same. (data_ref_compare_tree): Same. (prune_runtime_alias_test_list): Same. (get_segment_min_max): Same. (create_intersect_range_checks): Same. (conflict_fn_no_dependence): Same. (object_address_invariant_in_loop_p): Same. (analyze_ziv_subscript): Same. (analyze_siv_subscript_cst_affine): Same. (analyze_miv_subscript): Same. (analyze_overlapping_iterations): Same. (build_classic_dist_vector_1): Same. (add_other_self_distances): Same. (same_access_functions): Same. (build_classic_dir_vector): Same. (subscript_dependence_tester_1): Same. (subscript_dependence_tester): Same. (access_functions_are_affine_or_constant_p): Same. (get_references_in_stmt): Same. (loop_nest_has_data_refs): Same. (graphite_find_data_references_in_stmt): Same. (find_data_references_in_bb): Same. (get_base_for_alignment): Same. (find_loop_nest_1): Same. (find_loop_nest): Same. * tree-data-ref.h (dr_alignment): Same. (ddr_dependence_level): Same. * tree-if-conv.c (fold_build_cond_expr): Same. (add_to_predicate_list): Same. (add_to_dst_predicate_list): Same. (phi_convertible_by_degenerating_args): Same. (idx_within_array_bound): Same. (all_preds_critical_p): Same. (pred_blocks_visited_p): Same. (predicate_bbs): Same. (build_region): Same. (if_convertible_loop_p_1): Same. (is_cond_scalar_reduction): Same. (predicate_scalar_phi): Same. (remove_conditions_and_labels): Same. (combine_blocks): Same. (version_loop_for_if_conversion): Same. (versionable_outer_loop_p): Same. (ifcvt_local_dce): Same. (tree_if_conversion): Same. (pass_if_conversion::gate): Same. * tree-if-conv.h: Same. * tree-inline.c (maybe_move_debug_stmts_to_successors): Same. * tree-loop-distribution.c (bb_top_order_cmp): Same. (free_rdg): Same. (stmt_has_scalar_dependences_outside_loop): Same. (copy_loop_before): Same. (create_bb_after_loop): Same. (const_with_all_bytes_same): Same. (generate_memset_builtin): Same. (generate_memcpy_builtin): Same. (destroy_loop): Same. (build_rdg_partition_for_vertex): Same. (compute_access_range): Same. (data_ref_segment_size): Same. (latch_dominated_by_data_ref): Same. (compute_alias_check_pairs): Same. (fuse_memset_builtins): Same. (finalize_partitions): Same. (find_seed_stmts_for_distribution): Same. (prepare_perfect_loop_nest): Same. * tree-parloops.c (lambda_transform_legal_p): Same. (loop_parallel_p): Same. (reduc_stmt_res): Same. (add_field_for_name): Same. (create_call_for_reduction_1): Same. (replace_uses_in_bb_by): Same. (transform_to_exit_first_loop_alt): Same. (try_transform_to_exit_first_loop_alt): Same. (transform_to_exit_first_loop): Same. (num_phis): Same. (gen_parallel_loop): Same. (gather_scalar_reductions): Same. (get_omp_data_i_param): Same. (try_create_reduction_list): Same. (oacc_entry_exit_single_gang): Same. (parallelize_loops): Same. * tree-pass.h: Same. * tree-predcom.c (determine_offset): Same. (last_always_executed_block): Same. (split_data_refs_to_components): Same. (suitable_component_p): Same. (valid_initializer_p): Same. (find_looparound_phi): Same. (insert_looparound_copy): Same. (add_looparound_copies): Same. (determine_roots_comp): Same. (predcom_tmp_var): Same. (initialize_root_vars): Same. (initialize_root_vars_store_elim_1): Same. (initialize_root_vars_store_elim_2): Same. (finalize_eliminated_stores): Same. (initialize_root_vars_lm): Same. (remove_stmt): Same. (determine_unroll_factor): Same. (execute_pred_commoning_cbck): Same. (base_names_in_chain_on): Same. (combine_chains): Same. (pcom_stmt_dominates_stmt_p): Same. (try_combine_chains): Same. (prepare_initializers_chain_store_elim): Same. (prepare_initializers_chain): Same. (prepare_initializers): Same. (prepare_finalizers_chain): Same. (prepare_finalizers): Same. (insert_init_seqs): Same. * tree-scalar-evolution.c (loop_phi_node_p): Same. (compute_overall_effect_of_inner_loop): Same. (add_to_evolution_1): Same. (add_to_evolution): Same. (follow_ssa_edge_binary): Same. (follow_ssa_edge_expr): Same. (backedge_phi_arg_p): Same. (follow_ssa_edge_in_condition_phi_branch): Same. (follow_ssa_edge_in_condition_phi): Same. (follow_ssa_edge_inner_loop_phi): Same. (follow_ssa_edge): Same. (analyze_evolution_in_loop): Same. (analyze_initial_condition): Same. (interpret_loop_phi): Same. (interpret_condition_phi): Same. (interpret_rhs_expr): Same. (interpret_expr): Same. (interpret_gimple_assign): Same. (analyze_scalar_evolution_1): Same. (analyze_scalar_evolution): Same. (analyze_scalar_evolution_for_address_of): Same. (get_instantiated_value_entry): Same. (loop_closed_phi_def): Same. (instantiate_scev_name): Same. (instantiate_scev_poly): Same. (instantiate_scev_binary): Same. (instantiate_scev_convert): Same. (instantiate_scev_not): Same. (instantiate_scev_r): Same. (instantiate_scev): Same. (resolve_mixers): Same. (initialize_scalar_evolutions_analyzer): Same. (scev_reset_htab): Same. (scev_reset): Same. (derive_simple_iv_with_niters): Same. (simple_iv_with_niters): Same. (expression_expensive_p): Same. (final_value_replacement_loop): Same. * tree-scalar-evolution.h (block_before_loop): Same. * tree-ssa-address.h: Same. * tree-ssa-dce.c (find_obviously_necessary_stmts): Same. * tree-ssa-dom.c (edge_info::record_simple_equiv): Same. (record_edge_info): Same. * tree-ssa-live.c (var_map_base_fini): Same. (remove_unused_locals): Same. * tree-ssa-live.h: Same. * tree-ssa-loop-ch.c (should_duplicate_loop_header_p): Same. (pass_ch_vect::execute): Same. (pass_ch::process_loop_p): Same. * tree-ssa-loop-im.c (mem_ref_hasher::hash): Same. (movement_possibility): Same. (outermost_invariant_loop): Same. (stmt_cost): Same. (determine_max_movement): Same. (invariantness_dom_walker::before_dom_children): Same. (move_computations): Same. (may_move_till): Same. (force_move_till_op): Same. (force_move_till): Same. (memref_free): Same. (record_mem_ref_loc): Same. (set_ref_stored_in_loop): Same. (mark_ref_stored): Same. (sort_bbs_in_loop_postorder_cmp): Same. (sort_locs_in_loop_postorder_cmp): Same. (analyze_memory_references): Same. (mem_refs_may_alias_p): Same. (find_ref_loc_in_loop_cmp): Same. (rewrite_mem_ref_loc::operator): Same. (first_mem_ref_loc_1::operator): Same. (sm_set_flag_if_changed::operator): Same. (execute_sm_if_changed_flag_set): Same. (execute_sm): Same. (hoist_memory_references): Same. (ref_always_accessed::operator): Same. (refs_independent_p): Same. (record_dep_loop): Same. (ref_indep_loop_p_1): Same. (ref_indep_loop_p): Same. (can_sm_ref_p): Same. (find_refs_for_sm): Same. (loop_suitable_for_sm): Same. (store_motion_loop): Same. (store_motion): Same. (fill_always_executed_in): Same. * tree-ssa-loop-ivcanon.c (constant_after_peeling): Same. (estimated_unrolled_size): Same. (loop_edge_to_cancel): Same. (remove_exits_and_undefined_stmts): Same. (remove_redundant_iv_tests): Same. (unloop_loops): Same. (estimated_peeled_sequence_size): Same. (try_peel_loop): Same. (canonicalize_loop_induction_variables): Same. (canonicalize_induction_variables): Same. * tree-ssa-loop-ivopts.c (iv_inv_expr_hasher::equal): Same. (name_info): Same. (stmt_after_inc_pos): Same. (contains_abnormal_ssa_name_p): Same. (niter_for_exit): Same. (find_bivs): Same. (mark_bivs): Same. (find_givs_in_bb): Same. (find_induction_variables): Same. (find_interesting_uses_cond): Same. (outermost_invariant_loop_for_expr): Same. (idx_find_step): Same. (add_candidate_1): Same. (add_iv_candidate_derived_from_uses): Same. (alloc_use_cost_map): Same. (prepare_decl_rtl): Same. (generic_predict_doloop_p): Same. (computation_cost): Same. (determine_common_wider_type): Same. (get_computation_aff_1): Same. (get_use_type): Same. (determine_group_iv_cost_address): Same. (iv_period): Same. (difference_cannot_overflow_p): Same. (may_eliminate_iv): Same. (determine_set_costs): Same. (cheaper_cost_pair): Same. (compare_cost_pair): Same. (iv_ca_cand_for_group): Same. (iv_ca_recount_cost): Same. (iv_ca_set_remove_invs): Same. (iv_ca_set_no_cp): Same. (iv_ca_set_add_invs): Same. (iv_ca_set_cp): Same. (iv_ca_add_group): Same. (iv_ca_cost): Same. (iv_ca_compare_deps): Same. (iv_ca_delta_reverse): Same. (iv_ca_delta_commit): Same. (iv_ca_cand_used_p): Same. (iv_ca_delta_free): Same. (iv_ca_new): Same. (iv_ca_free): Same. (iv_ca_dump): Same. (iv_ca_extend): Same. (iv_ca_narrow): Same. (iv_ca_prune): Same. (cheaper_cost_with_cand): Same. (iv_ca_replace): Same. (try_add_cand_for): Same. (get_initial_solution): Same. (try_improve_iv_set): Same. (find_optimal_iv_set_1): Same. (create_new_iv): Same. (rewrite_use_compare): Same. (remove_unused_ivs): Same. (determine_scaling_factor): Same. * tree-ssa-loop-ivopts.h: Same. * tree-ssa-loop-manip.c (create_iv): Same. (compute_live_loop_exits): Same. (add_exit_phi): Same. (add_exit_phis): Same. (find_uses_to_rename_use): Same. (find_uses_to_rename_def): Same. (find_uses_to_rename_in_loop): Same. (rewrite_into_loop_closed_ssa): Same. (check_loop_closed_ssa_bb): Same. (split_loop_exit_edge): Same. (ip_end_pos): Same. (ip_normal_pos): Same. (copy_phi_node_args): Same. (gimple_duplicate_loop_to_header_edge): Same. (can_unroll_loop_p): Same. (determine_exit_conditions): Same. (scale_dominated_blocks_in_loop): Same. (niter_for_unrolled_loop): Same. (tree_transform_and_unroll_loop): Same. (rewrite_all_phi_nodes_with_iv): Same. * tree-ssa-loop-manip.h: Same. * tree-ssa-loop-niter.c (number_of_iterations_ne_max): Same. (number_of_iterations_ne): Same. (assert_no_overflow_lt): Same. (assert_loop_rolls_lt): Same. (number_of_iterations_lt): Same. (adjust_cond_for_loop_until_wrap): Same. (tree_simplify_using_condition): Same. (simplify_using_initial_conditions): Same. (simplify_using_outer_evolutions): Same. (loop_only_exit_p): Same. (ssa_defined_by_minus_one_stmt_p): Same. (number_of_iterations_popcount): Same. (number_of_iterations_exit): Same. (find_loop_niter): Same. (finite_loop_p): Same. (chain_of_csts_start): Same. (get_val_for): Same. (loop_niter_by_eval): Same. (derive_constant_upper_bound_ops): Same. (do_warn_aggressive_loop_optimizations): Same. (record_estimate): Same. (get_cst_init_from_scev): Same. (record_nonwrapping_iv): Same. (idx_infer_loop_bounds): Same. (infer_loop_bounds_from_ref): Same. (infer_loop_bounds_from_array): Same. (infer_loop_bounds_from_pointer_arith): Same. (infer_loop_bounds_from_signedness): Same. (bound_index): Same. (discover_iteration_bound_by_body_walk): Same. (maybe_lower_iteration_bound): Same. (estimate_numbers_of_iterations): Same. (estimated_loop_iterations): Same. (estimated_loop_iterations_int): Same. (max_loop_iterations): Same. (max_loop_iterations_int): Same. (likely_max_loop_iterations): Same. (likely_max_loop_iterations_int): Same. (estimated_stmt_executions_int): Same. (max_stmt_executions): Same. (likely_max_stmt_executions): Same. (estimated_stmt_executions): Same. (stmt_dominates_stmt_p): Same. (nowrap_type_p): Same. (loop_exits_before_overflow): Same. (scev_var_range_cant_overflow): Same. (scev_probably_wraps_p): Same. (free_numbers_of_iterations_estimates): Same. * tree-ssa-loop-niter.h: Same. * tree-ssa-loop-prefetch.c (release_mem_refs): Same. (idx_analyze_ref): Same. (analyze_ref): Same. (gather_memory_references_ref): Same. (mark_nontemporal_store): Same. (emit_mfence_after_loop): Same. (may_use_storent_in_loop_p): Same. (mark_nontemporal_stores): Same. (should_unroll_loop_p): Same. (volume_of_dist_vector): Same. (add_subscript_strides): Same. (self_reuse_distance): Same. (insn_to_prefetch_ratio_too_small_p): Same. * tree-ssa-loop-split.c (split_at_bb_p): Same. (patch_loop_exit): Same. (find_or_create_guard_phi): Same. (easy_exit_values): Same. (connect_loop_phis): Same. (connect_loops): Same. (compute_new_first_bound): Same. (split_loop): Same. (tree_ssa_split_loops): Same. * tree-ssa-loop-unswitch.c (tree_ssa_unswitch_loops): Same. (is_maybe_undefined): Same. (tree_may_unswitch_on): Same. (simplify_using_entry_checks): Same. (tree_unswitch_single_loop): Same. (tree_unswitch_loop): Same. (tree_unswitch_outer_loop): Same. (empty_bb_without_guard_p): Same. (used_outside_loop_p): Same. (get_vop_from_header): Same. (hoist_guard): Same. * tree-ssa-loop.c (gate_oacc_kernels): Same. (get_lsm_tmp_name): Same. * tree-ssa-loop.h: Same. * tree-ssa-reassoc.c (add_repeat_to_ops_vec): Same. (build_and_add_sum): Same. (no_side_effect_bb): Same. (get_ops): Same. (linearize_expr): Same. (should_break_up_subtract): Same. (linearize_expr_tree): Same. * tree-ssa-scopedtables.c: Same. * tree-ssa-scopedtables.h: Same. * tree-ssa-structalias.c (condense_visit): Same. (label_visit): Same. (dump_pred_graph): Same. (perform_var_substitution): Same. (move_complex_constraints): Same. (remove_preds_and_fake_succs): Same. * tree-ssa-threadupdate.c (dbds_continue_enumeration_p): Same. (determine_bb_domination_status): Same. (duplicate_thread_path): Same. (thread_through_all_blocks): Same. * tree-ssa-threadupdate.h: Same. * tree-streamer-in.c (streamer_read_string_cst): Same. (input_identifier): Same. (unpack_ts_type_common_value_fields): Same. (unpack_ts_block_value_fields): Same. (unpack_ts_translation_unit_decl_value_fields): Same. (unpack_ts_omp_clause_value_fields): Same. (streamer_read_tree_bitfields): Same. (streamer_alloc_tree): Same. (lto_input_ts_common_tree_pointers): Same. (lto_input_ts_vector_tree_pointers): Same. (lto_input_ts_poly_tree_pointers): Same. (lto_input_ts_complex_tree_pointers): Same. (lto_input_ts_decl_minimal_tree_pointers): Same. (lto_input_ts_decl_common_tree_pointers): Same. (lto_input_ts_decl_non_common_tree_pointers): Same. (lto_input_ts_decl_with_vis_tree_pointers): Same. (lto_input_ts_field_decl_tree_pointers): Same. (lto_input_ts_function_decl_tree_pointers): Same. (lto_input_ts_type_common_tree_pointers): Same. (lto_input_ts_type_non_common_tree_pointers): Same. (lto_input_ts_list_tree_pointers): Same. (lto_input_ts_vec_tree_pointers): Same. (lto_input_ts_exp_tree_pointers): Same. (lto_input_ts_block_tree_pointers): Same. (lto_input_ts_binfo_tree_pointers): Same. (lto_input_ts_constructor_tree_pointers): Same. (lto_input_ts_omp_clause_tree_pointers): Same. (streamer_read_tree_body): Same. * tree-streamer.h: Same. * tree-switch-conversion.c (bit_test_cluster::is_beneficial): Same. * tree-vect-data-refs.c (vect_get_smallest_scalar_type): Same. (vect_analyze_possibly_independent_ddr): Same. (vect_analyze_data_ref_dependence): Same. (vect_compute_data_ref_alignment): Same. (vect_enhance_data_refs_alignment): Same. (vect_analyze_data_ref_access): Same. (vect_check_gather_scatter): Same. (vect_find_stmt_data_reference): Same. (vect_create_addr_base_for_vector_ref): Same. (vect_setup_realignment): Same. (vect_supportable_dr_alignment): Same. * tree-vect-loop-manip.c (rename_variables_in_bb): Same. (adjust_phi_and_debug_stmts): Same. (vect_set_loop_mask): Same. (add_preheader_seq): Same. (vect_maybe_permute_loop_masks): Same. (vect_set_loop_masks_directly): Same. (vect_set_loop_condition_masked): Same. (vect_set_loop_condition_unmasked): Same. (slpeel_duplicate_current_defs_from_edges): Same. (slpeel_add_loop_guard): Same. (slpeel_can_duplicate_loop_p): Same. (create_lcssa_for_virtual_phi): Same. (iv_phi_p): Same. (vect_update_ivs_after_vectorizer): Same. (vect_gen_vector_loop_niters_mult_vf): Same. (slpeel_update_phi_nodes_for_loops): Same. (slpeel_update_phi_nodes_for_guard1): Same. (find_guard_arg): Same. (slpeel_update_phi_nodes_for_guard2): Same. (slpeel_update_phi_nodes_for_lcssa): Same. (vect_do_peeling): Same. (vect_create_cond_for_alias_checks): Same. (vect_loop_versioning): Same. * tree-vect-loop.c (vect_determine_vf_for_stmt): Same. (vect_inner_phi_in_double_reduction_p): Same. (vect_analyze_scalar_cycles_1): Same. (vect_fixup_scalar_cycles_with_patterns): Same. (vect_get_loop_niters): Same. (bb_in_loop_p): Same. (vect_get_max_nscalars_per_iter): Same. (vect_verify_full_masking): Same. (vect_compute_single_scalar_iteration_cost): Same. (vect_analyze_loop_form_1): Same. (vect_analyze_loop_form): Same. (vect_active_double_reduction_p): Same. (vect_analyze_loop_operations): Same. (neutral_op_for_slp_reduction): Same. (vect_is_simple_reduction): Same. (vect_model_reduction_cost): Same. (get_initial_def_for_reduction): Same. (get_initial_defs_for_reduction): Same. (vect_create_epilog_for_reduction): Same. (vectorize_fold_left_reduction): Same. (vectorizable_reduction): Same. (vectorizable_induction): Same. (vectorizable_live_operation): Same. (loop_niters_no_overflow): Same. (vect_get_loop_mask): Same. (vect_transform_loop_stmt): Same. (vect_transform_loop): Same. * tree-vect-patterns.c (vect_reassociating_reduction_p): Same. (vect_determine_precisions): Same. (vect_pattern_recog_1): Same. * tree-vect-slp.c (vect_analyze_slp_instance): Same. * tree-vect-stmts.c (stmt_vectype): Same. (process_use): Same. (vect_init_vector_1): Same. (vect_truncate_gather_scatter_offset): Same. (get_group_load_store_type): Same. (vect_build_gather_load_calls): Same. (vect_get_strided_load_store_ops): Same. (vectorizable_simd_clone_call): Same. (vectorizable_store): Same. (permute_vec_elements): Same. (vectorizable_load): Same. (vect_transform_stmt): Same. (supportable_widening_operation): Same. * tree-vectorizer.c (vec_info::replace_stmt): Same. (vec_info::free_stmt_vec_info): Same. (vect_free_loop_info_assumptions): Same. (vect_loop_vectorized_call): Same. (set_uid_loop_bbs): Same. (vectorize_loops): Same. * tree-vectorizer.h (STMT_VINFO_BB_VINFO): Same. * tree.c (add_tree_to_fld_list): Same. (fld_type_variant_equal_p): Same. (fld_decl_context): Same. (fld_incomplete_type_of): Same. (free_lang_data_in_binfo): Same. (need_assembler_name_p): Same. (find_decls_types_r): Same. (get_eh_types_for_runtime): Same. (find_decls_types_in_eh_region): Same. (find_decls_types_in_node): Same. (assign_assembler_name_if_needed): Same. * value-prof.c (stream_out_histogram_value): Same. * value-prof.h: Same. * var-tracking.c (use_narrower_mode): Same. (prepare_call_arguments): Same. (vt_expand_loc_callback): Same. (resolve_expansions_pending_recursion): Same. (vt_expand_loc): Same. * varasm.c (const_hash_1): Same. (compare_constant): Same. (tree_output_constant_def): Same. (simplify_subtraction): Same. (get_pool_constant): Same. (output_constant_pool_2): Same. (output_constant_pool_1): Same. (mark_constants_in_pattern): Same. (mark_constant_pool): Same. (get_section_anchor): Same. * vr-values.c (compare_range_with_value): Same. (vr_values::extract_range_from_phi_node): Same. * vr-values.h: Same. * web.c (unionfind_union): Same. * wide-int.h: Same. From-SVN: r273311
2019-07-09 20:32:49 +02:00
const class xlogue_layout &
xlogue_layout::get_instance ()
{
enum xlogue_stub_sets stub_set;
bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
if (stack_realign_fp)
stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
else if (frame_pointer_needed)
stub_set = aligned_plus_8
? XLOGUE_SET_HFP_ALIGNED_PLUS_8
: XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
else
stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
return s_instances[stub_set];
}
/* Determine how many clobbered registers can be saved by the stub.
Returns the count of registers the stub will save and restore. */
unsigned
xlogue_layout::count_stub_managed_regs ()
{
bool hfp = frame_pointer_needed || stack_realign_fp;
unsigned i, count;
unsigned regno;
for (count = i = MIN_REGS; i < MAX_REGS; ++i)
{
regno = REG_ORDER[i];
if (regno == BP_REG && hfp)
continue;
if (!ix86_save_reg (regno, false, false))
break;
++count;
}
return count;
}
/* Determine if register REGNO is a stub managed register given the
total COUNT of stub managed registers. */
bool
xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
{
bool hfp = frame_pointer_needed || stack_realign_fp;
unsigned i;
for (i = 0; i < count; ++i)
{
gcc_assert (i < MAX_REGS);
if (REG_ORDER[i] == BP_REG && hfp)
++count;
else if (REG_ORDER[i] == regno)
return true;
}
return false;
}
/* Constructor for xlogue_layout. */
xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
: m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
m_stack_align_off_in (stack_align_off_in)
{
HOST_WIDE_INT offset = stack_align_off_in;
unsigned i, j;
for (i = j = 0; i < MAX_REGS; ++i)
{
unsigned regno = REG_ORDER[i];
if (regno == BP_REG && hfp)
continue;
if (SSE_REGNO_P (regno))
{
offset += 16;
/* Verify that SSE regs are always aligned. */
gcc_assert (!((stack_align_off_in + offset) & 15));
}
else
offset += 8;
m_regs[j].regno = regno;
m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
}
gcc_assert (j == m_nregs);
}
const char *
xlogue_layout::get_stub_name (enum xlogue_stub stub,
unsigned n_extra_regs)
{
const int have_avx = TARGET_AVX;
char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
/* Lazy init */
if (!*name)
{
int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
(have_avx ? "avx" : "sse"),
STUB_BASE_NAMES[stub],
MIN_REGS + n_extra_regs);
gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
}
return name;
}
/* Return rtx of a symbol ref for the entry point (based upon
cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
rtx
xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
{
const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
gcc_assert (stub < XLOGUE_STUB_COUNT);
gcc_assert (crtl->stack_realign_finalized);
return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
}
unsigned scalar_chain::max_id = 0;
namespace {
/* Initialize new chain. */
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
{
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
smode = smode_;
vmode = vmode_;
chain_id = ++max_id;
if (dump_file)
fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
bitmap_obstack_initialize (NULL);
insns = BITMAP_ALLOC (NULL);
defs = BITMAP_ALLOC (NULL);
defs_conv = BITMAP_ALLOC (NULL);
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
insns_conv = BITMAP_ALLOC (NULL);
queue = NULL;
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
n_sse_to_integer = 0;
n_integer_to_sse = 0;
}
/* Free chain's data. */
scalar_chain::~scalar_chain ()
{
BITMAP_FREE (insns);
BITMAP_FREE (defs);
BITMAP_FREE (defs_conv);
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
BITMAP_FREE (insns_conv);
bitmap_obstack_release (NULL);
}
/* Add instruction into chains' queue. */
void
scalar_chain::add_to_queue (unsigned insn_uid)
{
if (bitmap_bit_p (insns, insn_uid)
|| bitmap_bit_p (queue, insn_uid))
return;
if (dump_file)
fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
insn_uid, chain_id);
bitmap_set_bit (queue, insn_uid);
}
/* For DImode conversion, mark register defined by DEF as requiring
conversion. */
void
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
scalar_chain::mark_dual_mode_def (df_ref def)
{
gcc_assert (DF_REF_REG_DEF_P (def));
/* Record the def/insn pair so we can later efficiently iterate over
the defs to convert on insns not in the chain. */
bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
{
if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def))
&& !reg_new)
return;
n_integer_to_sse++;
}
else
{
if (!reg_new)
return;
n_sse_to_integer++;
}
if (dump_file)
fprintf (dump_file,
" Mark r%d def in insn %d as requiring both modes in chain #%d\n",
DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
}
/* Check REF's chain to add new insns into a queue
and find registers requiring conversion. */
void
scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
{
df_link *chain;
gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
|| bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
add_to_queue (DF_REF_INSN_UID (ref));
for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
{
unsigned uid = DF_REF_INSN_UID (chain->ref);
if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
continue;
if (!DF_REF_REG_MEM_P (chain->ref))
{
if (bitmap_bit_p (insns, uid))
continue;
if (bitmap_bit_p (candidates, uid))
{
add_to_queue (uid);
continue;
}
}
if (DF_REF_REG_DEF_P (chain->ref))
{
if (dump_file)
fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
DF_REF_REGNO (chain->ref), uid);
mark_dual_mode_def (chain->ref);
}
else
{
if (dump_file)
fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
DF_REF_REGNO (chain->ref), uid);
mark_dual_mode_def (ref);
}
}
}
/* Add instruction into a chain. */
void
scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
{
if (bitmap_bit_p (insns, insn_uid))
return;
if (dump_file)
fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
bitmap_set_bit (insns, insn_uid);
rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
rtx def_set = single_set (insn);
if (def_set && REG_P (SET_DEST (def_set))
&& !HARD_REGISTER_P (SET_DEST (def_set)))
bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
/* ??? The following is quadratic since analyze_register_chain
iterates over all refs to look for dual-mode regs. Instead this
should be done separately for all regs mentioned in the chain once. */
df_ref ref;
for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
if (!HARD_REGISTER_P (DF_REF_REG (ref)))
analyze_register_chain (candidates, ref);
for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
if (!DF_REF_REG_MEM_P (ref))
analyze_register_chain (candidates, ref);
}
/* Build new chain starting from insn INSN_UID recursively
adding all dependent uses and definitions. */
void
scalar_chain::build (bitmap candidates, unsigned insn_uid)
{
queue = BITMAP_ALLOC (NULL);
bitmap_set_bit (queue, insn_uid);
if (dump_file)
fprintf (dump_file, "Building chain #%d...\n", chain_id);
while (!bitmap_empty_p (queue))
{
insn_uid = bitmap_first_set_bit (queue);
bitmap_clear_bit (queue, insn_uid);
bitmap_clear_bit (candidates, insn_uid);
add_insn (candidates, insn_uid);
}
if (dump_file)
{
fprintf (dump_file, "Collected chain #%d...\n", chain_id);
fprintf (dump_file, " insns: ");
dump_bitmap (dump_file, insns);
if (!bitmap_empty_p (defs_conv))
{
bitmap_iterator bi;
unsigned id;
const char *comma = "";
fprintf (dump_file, " defs to convert: ");
EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
{
fprintf (dump_file, "%sr%d", comma, id);
comma = ", ";
}
fprintf (dump_file, "\n");
}
}
BITMAP_FREE (queue);
}
/* Return a cost of building a vector costant
instead of using a scalar one. */
int
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
general_scalar_chain::vector_const_cost (rtx exp)
{
gcc_assert (CONST_INT_P (exp));
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
if (standard_sse_constant_p (exp, vmode))
return ix86_cost->sse_op;
/* We have separate costs for SImode and DImode, use SImode costs
for smaller modes. */
return ix86_cost->sse_load[smode == DImode ? 1 : 0];
}
/* Compute a gain for chain conversion. */
int
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
general_scalar_chain::compute_convert_gain ()
{
bitmap_iterator bi;
unsigned insn_uid;
int gain = 0;
int cost = 0;
if (dump_file)
fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
/* SSE costs distinguish between SImode and DImode loads/stores, for
int costs factor in the number of GPRs involved. When supporting
smaller modes than SImode the int load/store costs need to be
adjusted as well. */
unsigned sse_cost_idx = smode == DImode ? 1 : 0;
unsigned m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1;
EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
{
rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
rtx def_set = single_set (insn);
rtx src = SET_SRC (def_set);
rtx dst = SET_DEST (def_set);
int igain = 0;
if (REG_P (src) && REG_P (dst))
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
igain += 2 * m - ix86_cost->xmm_move;
else if (REG_P (src) && MEM_P (dst))
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
igain
+= m * ix86_cost->int_store[2] - ix86_cost->sse_store[sse_cost_idx];
else if (MEM_P (src) && REG_P (dst))
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
igain += m * ix86_cost->int_load[2] - ix86_cost->sse_load[sse_cost_idx];
i386: Add integer nabs instructions [PR101044] The patch adds integer nabs "(NEG (ABS (...)))" instructions, adds STV conversion and adjusts STV cost calculations accordingly. When CMOV instruction is used to implement abs, the sign is determined from the preceeding operand negation, and CMOVS is used to select between negated and non-negated value. To implement nabs, just reverse the condition and emit CMOVNS instead. The STV costs are adjusted for inherent NOT of nabs insn. V2DI NOT is somehow costly operation, since it is implemented as a load of zero, followed by a SUB insn. OTOH, integer nabs with inherent NOT is relatively cheap, so some STV chains became less profitable for conversion. The patch rewrites operand scanner in compute_convert_gain to a switch and reorders case instances in general_scalar_to_vector_candidate_p to benefit from fallthroughs, and to remove special processing of andnot in the later case. gcc/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * config/i386/i386.md (*nabs<dwi>2_doubleword): New insn_and_split pattern. (*nabs<dwi>2_1): Ditto. * config/i386/i386-features.c (general_scalar_chain::compute_convert_gain): Handle (NEG (ABS (...))) RTX. Rewrite src code scanner as switch statement. (general_scalar_chain::convert_insn): Handle (NEG (ABS (...))) RTX. (general_scalar_to_vector_candidate_p): Detect (NEG (ABS (...))) RTX. Reorder case statements for (AND (NOT (...) ...)) fallthrough. gcc/testsuite/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * gcc.target/i386/pr101044.c: New test.
2021-07-01 10:56:32 +02:00
else
switch (GET_CODE (src))
{
case ASHIFT:
case ASHIFTRT:
case LSHIFTRT:
if (m == 2)
{
if (INTVAL (XEXP (src, 1)) >= 32)
igain += ix86_cost->add;
else
igain += ix86_cost->shift_const;
}
i386: Add integer nabs instructions [PR101044] The patch adds integer nabs "(NEG (ABS (...)))" instructions, adds STV conversion and adjusts STV cost calculations accordingly. When CMOV instruction is used to implement abs, the sign is determined from the preceeding operand negation, and CMOVS is used to select between negated and non-negated value. To implement nabs, just reverse the condition and emit CMOVNS instead. The STV costs are adjusted for inherent NOT of nabs insn. V2DI NOT is somehow costly operation, since it is implemented as a load of zero, followed by a SUB insn. OTOH, integer nabs with inherent NOT is relatively cheap, so some STV chains became less profitable for conversion. The patch rewrites operand scanner in compute_convert_gain to a switch and reorders case instances in general_scalar_to_vector_candidate_p to benefit from fallthroughs, and to remove special processing of andnot in the later case. gcc/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * config/i386/i386.md (*nabs<dwi>2_doubleword): New insn_and_split pattern. (*nabs<dwi>2_1): Ditto. * config/i386/i386-features.c (general_scalar_chain::compute_convert_gain): Handle (NEG (ABS (...))) RTX. Rewrite src code scanner as switch statement. (general_scalar_chain::convert_insn): Handle (NEG (ABS (...))) RTX. (general_scalar_to_vector_candidate_p): Detect (NEG (ABS (...))) RTX. Reorder case statements for (AND (NOT (...) ...)) fallthrough. gcc/testsuite/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * gcc.target/i386/pr101044.c: New test.
2021-07-01 10:56:32 +02:00
igain += ix86_cost->shift_const - ix86_cost->sse_op;
i386: Add integer nabs instructions [PR101044] The patch adds integer nabs "(NEG (ABS (...)))" instructions, adds STV conversion and adjusts STV cost calculations accordingly. When CMOV instruction is used to implement abs, the sign is determined from the preceeding operand negation, and CMOVS is used to select between negated and non-negated value. To implement nabs, just reverse the condition and emit CMOVNS instead. The STV costs are adjusted for inherent NOT of nabs insn. V2DI NOT is somehow costly operation, since it is implemented as a load of zero, followed by a SUB insn. OTOH, integer nabs with inherent NOT is relatively cheap, so some STV chains became less profitable for conversion. The patch rewrites operand scanner in compute_convert_gain to a switch and reorders case instances in general_scalar_to_vector_candidate_p to benefit from fallthroughs, and to remove special processing of andnot in the later case. gcc/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * config/i386/i386.md (*nabs<dwi>2_doubleword): New insn_and_split pattern. (*nabs<dwi>2_1): Ditto. * config/i386/i386-features.c (general_scalar_chain::compute_convert_gain): Handle (NEG (ABS (...))) RTX. Rewrite src code scanner as switch statement. (general_scalar_chain::convert_insn): Handle (NEG (ABS (...))) RTX. (general_scalar_to_vector_candidate_p): Detect (NEG (ABS (...))) RTX. Reorder case statements for (AND (NOT (...) ...)) fallthrough. gcc/testsuite/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * gcc.target/i386/pr101044.c: New test.
2021-07-01 10:56:32 +02:00
if (CONST_INT_P (XEXP (src, 0)))
igain -= vector_const_cost (XEXP (src, 0));
break;
case AND:
case IOR:
case XOR:
case PLUS:
case MINUS:
igain += m * ix86_cost->add - ix86_cost->sse_op;
/* Additional gain for andnot for targets without BMI. */
if (GET_CODE (XEXP (src, 0)) == NOT
&& !TARGET_BMI)
igain += m * ix86_cost->add;
if (CONST_INT_P (XEXP (src, 0)))
igain -= vector_const_cost (XEXP (src, 0));
if (CONST_INT_P (XEXP (src, 1)))
igain -= vector_const_cost (XEXP (src, 1));
break;
case NEG:
case NOT:
igain -= ix86_cost->sse_op + COSTS_N_INSNS (1);
if (GET_CODE (XEXP (src, 0)) != ABS)
{
igain += m * ix86_cost->add;
break;
}
/* FALLTHRU */
case ABS:
case SMAX:
case SMIN:
case UMAX:
case UMIN:
/* We do not have any conditional move cost, estimate it as a
reg-reg move. Comparisons are costed as adds. */
igain += m * (COSTS_N_INSNS (2) + ix86_cost->add);
/* Integer SSE ops are all costed the same. */
igain -= ix86_cost->sse_op;
break;
case COMPARE:
/* Assume comparison cost is the same. */
break;
case CONST_INT:
if (REG_P (dst))
{
if (optimize_insn_for_size_p ())
{
/* xor (2 bytes) vs. xorps (3 bytes). */
if (src == const0_rtx)
igain -= COSTS_N_BYTES (1);
/* movdi_internal vs. movv2di_internal. */
/* => mov (5 bytes) vs. movaps (7 bytes). */
else if (x86_64_immediate_operand (src, SImode))
igain -= COSTS_N_BYTES (2);
else
/* ??? Larger immediate constants are placed in the
constant pool, where the size benefit/impact of
STV conversion is affected by whether and how
often each constant pool entry is shared/reused.
The value below is empirically derived from the
CSiBE benchmark (and the optimal value may drift
over time). */
igain += COSTS_N_BYTES (0);
}
else
{
/* DImode can be immediate for TARGET_64BIT
and SImode always. */
igain += m * COSTS_N_INSNS (1);
igain -= vector_const_cost (src);
}
}
i386: Add integer nabs instructions [PR101044] The patch adds integer nabs "(NEG (ABS (...)))" instructions, adds STV conversion and adjusts STV cost calculations accordingly. When CMOV instruction is used to implement abs, the sign is determined from the preceeding operand negation, and CMOVS is used to select between negated and non-negated value. To implement nabs, just reverse the condition and emit CMOVNS instead. The STV costs are adjusted for inherent NOT of nabs insn. V2DI NOT is somehow costly operation, since it is implemented as a load of zero, followed by a SUB insn. OTOH, integer nabs with inherent NOT is relatively cheap, so some STV chains became less profitable for conversion. The patch rewrites operand scanner in compute_convert_gain to a switch and reorders case instances in general_scalar_to_vector_candidate_p to benefit from fallthroughs, and to remove special processing of andnot in the later case. gcc/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * config/i386/i386.md (*nabs<dwi>2_doubleword): New insn_and_split pattern. (*nabs<dwi>2_1): Ditto. * config/i386/i386-features.c (general_scalar_chain::compute_convert_gain): Handle (NEG (ABS (...))) RTX. Rewrite src code scanner as switch statement. (general_scalar_chain::convert_insn): Handle (NEG (ABS (...))) RTX. (general_scalar_to_vector_candidate_p): Detect (NEG (ABS (...))) RTX. Reorder case statements for (AND (NOT (...) ...)) fallthrough. gcc/testsuite/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * gcc.target/i386/pr101044.c: New test.
2021-07-01 10:56:32 +02:00
else if (MEM_P (dst))
{
igain += (m * ix86_cost->int_store[2]
- ix86_cost->sse_store[sse_cost_idx]);
igain -= vector_const_cost (src);
}
i386: Add integer nabs instructions [PR101044] The patch adds integer nabs "(NEG (ABS (...)))" instructions, adds STV conversion and adjusts STV cost calculations accordingly. When CMOV instruction is used to implement abs, the sign is determined from the preceeding operand negation, and CMOVS is used to select between negated and non-negated value. To implement nabs, just reverse the condition and emit CMOVNS instead. The STV costs are adjusted for inherent NOT of nabs insn. V2DI NOT is somehow costly operation, since it is implemented as a load of zero, followed by a SUB insn. OTOH, integer nabs with inherent NOT is relatively cheap, so some STV chains became less profitable for conversion. The patch rewrites operand scanner in compute_convert_gain to a switch and reorders case instances in general_scalar_to_vector_candidate_p to benefit from fallthroughs, and to remove special processing of andnot in the later case. gcc/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * config/i386/i386.md (*nabs<dwi>2_doubleword): New insn_and_split pattern. (*nabs<dwi>2_1): Ditto. * config/i386/i386-features.c (general_scalar_chain::compute_convert_gain): Handle (NEG (ABS (...))) RTX. Rewrite src code scanner as switch statement. (general_scalar_chain::convert_insn): Handle (NEG (ABS (...))) RTX. (general_scalar_to_vector_candidate_p): Detect (NEG (ABS (...))) RTX. Reorder case statements for (AND (NOT (...) ...)) fallthrough. gcc/testsuite/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * gcc.target/i386/pr101044.c: New test.
2021-07-01 10:56:32 +02:00
break;
default:
gcc_unreachable ();
}
if (igain != 0 && dump_file)
{
fprintf (dump_file, " Instruction gain %d for ", igain);
dump_insn_slim (dump_file, insn);
}
gain += igain;
}
if (dump_file)
fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
/* Cost the integer to sse and sse to integer moves. */
cost += n_sse_to_integer * ix86_cost->sse_to_integer;
/* ??? integer_to_sse but we only have that in the RA cost table.
Assume sse_to_integer/integer_to_sse are the same which they
are at the moment. */
cost += n_integer_to_sse * ix86_cost->sse_to_integer;
if (dump_file)
fprintf (dump_file, " Registers conversion cost: %d\n", cost);
gain -= cost;
if (dump_file)
fprintf (dump_file, " Total gain: %d\n", gain);
return gain;
}
/* Insert generated conversion instruction sequence INSNS
after instruction AFTER. New BB may be required in case
instruction has EH region attached. */
void
scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
{
if (!control_flow_insn_p (after))
{
emit_insn_after (insns, after);
return;
}
basic_block bb = BLOCK_FOR_INSN (after);
edge e = find_fallthru_edge (bb->succs);
gcc_assert (e);
basic_block new_bb = split_edge (e);
emit_insn_after (insns, BB_HEAD (new_bb));
}
} // anon namespace
/* Generate the canonical SET_SRC to move GPR to a VMODE vector register,
zeroing the upper parts. */
static rtx
gen_gpr_to_xmm_move_src (enum machine_mode vmode, rtx gpr)
{
switch (GET_MODE_NUNITS (vmode))
{
case 1:
PR target/70321: Split double word equality/inequality after STV on x86. This patch resolves the last piece of PR target/70321 a code quality (P2 regression) affecting mainline. Currently, for HJ's testcase: void foo (long long ixi) { if (ixi != 14348907) __builtin_abort (); } GCC with -m32 -O2 generates four instructions for the comparison: movl 16(%esp), %eax movl 20(%esp), %edx xorl $14348907, %eax orl %eax, %edx but with this patch it now requires only three, making better use of x86's addressing modes: movl 16(%esp), %eax xorl $14348907, %eax orl 20(%esp), %eax The solution is to expand "doubleword" equality/inequality expressions using flag setting COMPARE instructions for the early RTL passes, and then split them during split1, after STV and before reload. Hence on x86_64, we now see/allow things like: (insn 11 8 12 2 (set (reg:CCZ 17 flags) (compare:CCZ (reg/v:TI 84 [ x ]) (reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword} This allows the STV pass to decide whether it's preferrable to perform this comparison using vector operations, i.e. a pxor/ptest sequence, or as scalar integer operations, i.e. a xor/xor/or sequence. Alas this required tweaking of the STV pass to recognize the "new" form of these comparisons and split out the pxor operation itself. To confirm this still works as expected I've added a new STV test case: long long a[1024]; long long b[1024]; int foo() { for (int i=0; i<1024; i++) { long long t = (a[i]<<8) | (b[i]<<24); if (t == 0) return 1; } return 0; } where with -m32 -O2 -msse4.1 the above comparison with zero should look like: punpcklqdq %xmm0, %xmm0 ptest %xmm0, %xmm0 Although this patch includes one or two minor tweaks to provide all the necessary infrastructure to support conversion of TImode comparisons to V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement these transformations, but this is something that can be considered after stage 4. Indeed the new convert_compare functionality is split out into a method to simplify its potential reuse by the timode_scalar_chain class. 2022-05-30 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/70321 * config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose DI mode equality/inequality using XOR here. Instead generate a COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode). * config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode. (general_scalar_chain::convert_compare): New function to convert scalar equality/inequality comparison into vector operations. (general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call new convert_compare helper method. (convertible_comparion_p): Update to match doubleword COMPARE of two register, memory or integer constant operands. * config/i386/i386-features.h (general_scalar_chain::convert_compare): Prototype/declare member function here. * config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but only allow new doubleword modes for EQ and NE operators. (*cmp<dwi>_doubleword): New define_insn_and_split, to split a doubleword comparison into a pair of XORs followed by an IOR to set the (zero) flags register, optimizing the XORs if possible. * config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode iterator; V_AVX is (currently) only used by ptest. (sse4_1 mode attribute): Update to support V1TI and V2TI. gcc/testsuite/ChangeLog PR target/70321 * gcc.target/i386/pr70321.c: New test case. * gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
return gen_rtx_SUBREG (vmode, gpr, 0);
case 2:
return gen_rtx_VEC_CONCAT (vmode, gpr,
CONST0_RTX (GET_MODE_INNER (vmode)));
default:
return gen_rtx_VEC_MERGE (vmode, gen_rtx_VEC_DUPLICATE (vmode, gpr),
CONST0_RTX (vmode), GEN_INT (HOST_WIDE_INT_1U));
}
}
/* Make vector copies for all register REGNO definitions
and replace its uses in a chain. */
void
PR target/106303: Fix TImode STV related failures on x86. This patch resolves PR target/106303 (and the related PRs 106347, 106404, 106407) which are ICEs caused by my improvements to x86_64's 128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies for the breakage. The issue is that data flow analysis is used to partition usage of each TImode pseudo into "chains", where each chain is analyzed and if suitable converted to vector operations. The problems appears when some chains for a pseudo are converted, and others aren't as RTL sharing can result in some mode changes leaking into other instructions that aren't/shouldn't/can't be converted, which eventually leads to an ICE for mismatched modes. My first approach to a fix was to unify more of the STV infrastructure, reasoning that if TImode STV was exhibiting these problems, but DImode and SImode STV weren't, the issue was likely to be caused/resolved by these remaining differences. This appeared to fix some but not all of the reported PRs. A better solution was then proposed by H.J. Lu in Bugzilla, that we need to iterate the removal of candidates in the function timode_remove_non_convertible_regs until there are no further changes. As each chain is removed from consideration, it in turn may affect whether other insns/chains can safely be converted. 2022-07-24 Roger Sayle <roger@nextmovesoftware.com> H.J. Lu <hjl.tools@gmail.com> gcc/ChangeLog PR target/106303 PR target/106347 * config/i386/i386-features.cc (make_vector_copies): Move from general_scalar_chain to scalar_chain. (convert_reg): Likewise. (convert_insn_common): New scalar_chain method split out from general_scalar_chain convert_insn. (convert_registers): Move from general_scalar_chain to scalar_chain. (scalar_chain::convert): Call convert_insn_common before calling convert_insn. (timode_remove_non_convertible_regs): Iterate until there are no further changes to the candidates. * config/i386/i386-features.h (scalar_chain::hash_map): Move from general_scalar_chain. (scalar_chain::convert_reg): Likewise. (scalar_chain::convert_insn_common): New shared method. (scalar_chain::make_vector_copies): Move from general_scalar_chain. (scalar_chain::convert_registers): Likewise. No longer virtual. (general_scalar_chain::hash_map): Delete. Moved to scalar_chain. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::make_vector_copies): Likewise. (general_scalar_chain::convert_registers): Delete virtual method. (timode_scalar_chain::convert_registers): Likewise. gcc/testsuite/ChangeLog PR target/106303 PR target/106347 * gcc.target/i386/pr106303.c: New test case. * gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
scalar_chain::make_vector_copies (rtx_insn *insn, rtx reg)
{
rtx vreg = *defs_map.get (reg);
start_sequence ();
if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
{
rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
if (smode == DImode && !TARGET_64BIT)
{
emit_move_insn (adjust_address (tmp, SImode, 0),
gen_rtx_SUBREG (SImode, reg, 0));
emit_move_insn (adjust_address (tmp, SImode, 4),
gen_rtx_SUBREG (SImode, reg, 4));
}
else
emit_move_insn (copy_rtx (tmp), reg);
emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
gen_gpr_to_xmm_move_src (vmode, tmp)));
}
else if (!TARGET_64BIT && smode == DImode)
{
if (TARGET_SSE4_1)
{
emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
CONST0_RTX (V4SImode),
gen_rtx_SUBREG (SImode, reg, 0)));
emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
gen_rtx_SUBREG (V4SImode, vreg, 0),
gen_rtx_SUBREG (SImode, reg, 4),
GEN_INT (2)));
}
else
{
rtx tmp = gen_reg_rtx (DImode);
emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
CONST0_RTX (V4SImode),
gen_rtx_SUBREG (SImode, reg, 0)));
emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
CONST0_RTX (V4SImode),
gen_rtx_SUBREG (SImode, reg, 4)));
emit_insn (gen_vec_interleave_lowv4si
(gen_rtx_SUBREG (V4SImode, vreg, 0),
gen_rtx_SUBREG (V4SImode, vreg, 0),
gen_rtx_SUBREG (V4SImode, tmp, 0)));
}
}
else
emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
gen_gpr_to_xmm_move_src (vmode, reg)));
rtx_insn *seq = get_insns ();
end_sequence ();
emit_conversion_insns (seq, insn);
if (dump_file)
fprintf (dump_file,
" Copied r%d to a vector register r%d for insn %d\n",
REGNO (reg), REGNO (vreg), INSN_UID (insn));
}
/* Copy the definition SRC of INSN inside the chain to DST for
scalar uses outside of the chain. */
void
PR target/106303: Fix TImode STV related failures on x86. This patch resolves PR target/106303 (and the related PRs 106347, 106404, 106407) which are ICEs caused by my improvements to x86_64's 128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies for the breakage. The issue is that data flow analysis is used to partition usage of each TImode pseudo into "chains", where each chain is analyzed and if suitable converted to vector operations. The problems appears when some chains for a pseudo are converted, and others aren't as RTL sharing can result in some mode changes leaking into other instructions that aren't/shouldn't/can't be converted, which eventually leads to an ICE for mismatched modes. My first approach to a fix was to unify more of the STV infrastructure, reasoning that if TImode STV was exhibiting these problems, but DImode and SImode STV weren't, the issue was likely to be caused/resolved by these remaining differences. This appeared to fix some but not all of the reported PRs. A better solution was then proposed by H.J. Lu in Bugzilla, that we need to iterate the removal of candidates in the function timode_remove_non_convertible_regs until there are no further changes. As each chain is removed from consideration, it in turn may affect whether other insns/chains can safely be converted. 2022-07-24 Roger Sayle <roger@nextmovesoftware.com> H.J. Lu <hjl.tools@gmail.com> gcc/ChangeLog PR target/106303 PR target/106347 * config/i386/i386-features.cc (make_vector_copies): Move from general_scalar_chain to scalar_chain. (convert_reg): Likewise. (convert_insn_common): New scalar_chain method split out from general_scalar_chain convert_insn. (convert_registers): Move from general_scalar_chain to scalar_chain. (scalar_chain::convert): Call convert_insn_common before calling convert_insn. (timode_remove_non_convertible_regs): Iterate until there are no further changes to the candidates. * config/i386/i386-features.h (scalar_chain::hash_map): Move from general_scalar_chain. (scalar_chain::convert_reg): Likewise. (scalar_chain::convert_insn_common): New shared method. (scalar_chain::make_vector_copies): Move from general_scalar_chain. (scalar_chain::convert_registers): Likewise. No longer virtual. (general_scalar_chain::hash_map): Delete. Moved to scalar_chain. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::make_vector_copies): Likewise. (general_scalar_chain::convert_registers): Delete virtual method. (timode_scalar_chain::convert_registers): Likewise. gcc/testsuite/ChangeLog PR target/106303 PR target/106347 * gcc.target/i386/pr106303.c: New test case. * gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
scalar_chain::convert_reg (rtx_insn *insn, rtx dst, rtx src)
{
start_sequence ();
if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
{
rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
emit_move_insn (tmp, src);
if (!TARGET_64BIT && smode == DImode)
{
emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
adjust_address (tmp, SImode, 0));
emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
adjust_address (tmp, SImode, 4));
}
else
emit_move_insn (dst, copy_rtx (tmp));
}
else if (!TARGET_64BIT && smode == DImode)
{
if (TARGET_SSE4_1)
{
rtx tmp = gen_rtx_PARALLEL (VOIDmode,
gen_rtvec (1, const0_rtx));
emit_insn
(gen_rtx_SET
(gen_rtx_SUBREG (SImode, dst, 0),
gen_rtx_VEC_SELECT (SImode,
gen_rtx_SUBREG (V4SImode, src, 0),
tmp)));
tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
emit_insn
(gen_rtx_SET
(gen_rtx_SUBREG (SImode, dst, 4),
gen_rtx_VEC_SELECT (SImode,
gen_rtx_SUBREG (V4SImode, src, 0),
tmp)));
}
else
{
rtx vcopy = gen_reg_rtx (V2DImode);
emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, src, 0));
emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
gen_rtx_SUBREG (SImode, vcopy, 0));
emit_move_insn (vcopy,
gen_rtx_LSHIFTRT (V2DImode,
vcopy, GEN_INT (32)));
emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
gen_rtx_SUBREG (SImode, vcopy, 0));
}
}
else
emit_move_insn (dst, src);
rtx_insn *seq = get_insns ();
end_sequence ();
emit_conversion_insns (seq, insn);
if (dump_file)
fprintf (dump_file,
" Copied r%d to a scalar register r%d for insn %d\n",
REGNO (src), REGNO (dst), INSN_UID (insn));
}
/* Convert operand OP in INSN. We should handle
memory operands and uninitialized registers.
All other register uses are converted during
registers conversion. */
void
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
general_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
{
*op = copy_rtx_if_shared (*op);
if (GET_CODE (*op) == NOT)
{
convert_op (&XEXP (*op, 0), insn);
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
PUT_MODE (*op, vmode);
}
else if (MEM_P (*op))
{
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
rtx tmp = gen_reg_rtx (GET_MODE (*op));
/* Handle movabs. */
if (!memory_operand (*op, GET_MODE (*op)))
{
rtx tmp2 = gen_reg_rtx (GET_MODE (*op));
emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
*op = tmp2;
}
emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
gen_gpr_to_xmm_move_src (vmode, *op)),
insn);
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
*op = gen_rtx_SUBREG (vmode, tmp, 0);
if (dump_file)
fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
INSN_UID (insn), REGNO (tmp));
}
else if (REG_P (*op))
{
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
*op = gen_rtx_SUBREG (vmode, *op, 0);
}
else if (CONST_INT_P (*op))
{
rtx vec_cst;
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
rtx tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (smode), 0);
/* Prefer all ones vector in case of -1. */
if (constm1_operand (*op, GET_MODE (*op)))
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
vec_cst = CONSTM1_RTX (vmode);
else
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
{
unsigned n = GET_MODE_NUNITS (vmode);
rtx *v = XALLOCAVEC (rtx, n);
v[0] = *op;
for (unsigned i = 1; i < n; ++i)
v[i] = const0_rtx;
vec_cst = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
}
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
if (!standard_sse_constant_p (vec_cst, vmode))
{
start_sequence ();
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
vec_cst = validize_mem (force_const_mem (vmode, vec_cst));
rtx_insn *seq = get_insns ();
end_sequence ();
emit_insn_before (seq, insn);
}
emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
*op = tmp;
}
else
{
gcc_assert (SUBREG_P (*op));
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
gcc_assert (GET_MODE (*op) == vmode);
}
}
PR target/70321: Split double word equality/inequality after STV on x86. This patch resolves the last piece of PR target/70321 a code quality (P2 regression) affecting mainline. Currently, for HJ's testcase: void foo (long long ixi) { if (ixi != 14348907) __builtin_abort (); } GCC with -m32 -O2 generates four instructions for the comparison: movl 16(%esp), %eax movl 20(%esp), %edx xorl $14348907, %eax orl %eax, %edx but with this patch it now requires only three, making better use of x86's addressing modes: movl 16(%esp), %eax xorl $14348907, %eax orl 20(%esp), %eax The solution is to expand "doubleword" equality/inequality expressions using flag setting COMPARE instructions for the early RTL passes, and then split them during split1, after STV and before reload. Hence on x86_64, we now see/allow things like: (insn 11 8 12 2 (set (reg:CCZ 17 flags) (compare:CCZ (reg/v:TI 84 [ x ]) (reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword} This allows the STV pass to decide whether it's preferrable to perform this comparison using vector operations, i.e. a pxor/ptest sequence, or as scalar integer operations, i.e. a xor/xor/or sequence. Alas this required tweaking of the STV pass to recognize the "new" form of these comparisons and split out the pxor operation itself. To confirm this still works as expected I've added a new STV test case: long long a[1024]; long long b[1024]; int foo() { for (int i=0; i<1024; i++) { long long t = (a[i]<<8) | (b[i]<<24); if (t == 0) return 1; } return 0; } where with -m32 -O2 -msse4.1 the above comparison with zero should look like: punpcklqdq %xmm0, %xmm0 ptest %xmm0, %xmm0 Although this patch includes one or two minor tweaks to provide all the necessary infrastructure to support conversion of TImode comparisons to V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement these transformations, but this is something that can be considered after stage 4. Indeed the new convert_compare functionality is split out into a method to simplify its potential reuse by the timode_scalar_chain class. 2022-05-30 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/70321 * config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose DI mode equality/inequality using XOR here. Instead generate a COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode). * config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode. (general_scalar_chain::convert_compare): New function to convert scalar equality/inequality comparison into vector operations. (general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call new convert_compare helper method. (convertible_comparion_p): Update to match doubleword COMPARE of two register, memory or integer constant operands. * config/i386/i386-features.h (general_scalar_chain::convert_compare): Prototype/declare member function here. * config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but only allow new doubleword modes for EQ and NE operators. (*cmp<dwi>_doubleword): New define_insn_and_split, to split a doubleword comparison into a pair of XORs followed by an IOR to set the (zero) flags register, optimizing the XORs if possible. * config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode iterator; V_AVX is (currently) only used by ptest. (sse4_1 mode attribute): Update to support V1TI and V2TI. gcc/testsuite/ChangeLog PR target/70321 * gcc.target/i386/pr70321.c: New test case. * gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
/* Convert COMPARE to vector mode. */
rtx
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
scalar_chain::convert_compare (rtx op1, rtx op2, rtx_insn *insn)
PR target/70321: Split double word equality/inequality after STV on x86. This patch resolves the last piece of PR target/70321 a code quality (P2 regression) affecting mainline. Currently, for HJ's testcase: void foo (long long ixi) { if (ixi != 14348907) __builtin_abort (); } GCC with -m32 -O2 generates four instructions for the comparison: movl 16(%esp), %eax movl 20(%esp), %edx xorl $14348907, %eax orl %eax, %edx but with this patch it now requires only three, making better use of x86's addressing modes: movl 16(%esp), %eax xorl $14348907, %eax orl 20(%esp), %eax The solution is to expand "doubleword" equality/inequality expressions using flag setting COMPARE instructions for the early RTL passes, and then split them during split1, after STV and before reload. Hence on x86_64, we now see/allow things like: (insn 11 8 12 2 (set (reg:CCZ 17 flags) (compare:CCZ (reg/v:TI 84 [ x ]) (reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword} This allows the STV pass to decide whether it's preferrable to perform this comparison using vector operations, i.e. a pxor/ptest sequence, or as scalar integer operations, i.e. a xor/xor/or sequence. Alas this required tweaking of the STV pass to recognize the "new" form of these comparisons and split out the pxor operation itself. To confirm this still works as expected I've added a new STV test case: long long a[1024]; long long b[1024]; int foo() { for (int i=0; i<1024; i++) { long long t = (a[i]<<8) | (b[i]<<24); if (t == 0) return 1; } return 0; } where with -m32 -O2 -msse4.1 the above comparison with zero should look like: punpcklqdq %xmm0, %xmm0 ptest %xmm0, %xmm0 Although this patch includes one or two minor tweaks to provide all the necessary infrastructure to support conversion of TImode comparisons to V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement these transformations, but this is something that can be considered after stage 4. Indeed the new convert_compare functionality is split out into a method to simplify its potential reuse by the timode_scalar_chain class. 2022-05-30 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/70321 * config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose DI mode equality/inequality using XOR here. Instead generate a COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode). * config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode. (general_scalar_chain::convert_compare): New function to convert scalar equality/inequality comparison into vector operations. (general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call new convert_compare helper method. (convertible_comparion_p): Update to match doubleword COMPARE of two register, memory or integer constant operands. * config/i386/i386-features.h (general_scalar_chain::convert_compare): Prototype/declare member function here. * config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but only allow new doubleword modes for EQ and NE operators. (*cmp<dwi>_doubleword): New define_insn_and_split, to split a doubleword comparison into a pair of XORs followed by an IOR to set the (zero) flags register, optimizing the XORs if possible. * config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode iterator; V_AVX is (currently) only used by ptest. (sse4_1 mode attribute): Update to support V1TI and V2TI. gcc/testsuite/ChangeLog PR target/70321 * gcc.target/i386/pr70321.c: New test case. * gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
{
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64. This x86_64 backend patch allows TImode STV to take advantage of the fact that the PTEST instruction performs an AND operation. Previously PTEST was (mostly) used for comparison against zero, by using the same operands. The benefits are demonstrated by the new test case: __int128 a,b; int foo() { return (a & b) != 0; } Currently with -O2 -msse4 we generate: movdqa a(%rip), %xmm0 pand b(%rip), %xmm0 xorl %eax, %eax ptest %xmm0, %xmm0 setne %al ret with this patch we now generate: movdqa a(%rip), %xmm0 xorl %eax, %eax ptest b(%rip), %xmm0 setne %al ret Technically, the magic happens using new define_insn_and_split patterns. Using two patterns allows this transformation to performed independently of whether TImode STV is run before or after combine. The one tricky case is that immediate constant operands of the AND behave slightly differently between TImode and V1TImode: All V1TImode immediate operands becomes loads, but for TImode only values that are not hilo_operands need to be loaded. Hence the new *testti_doubleword accepts any general_operand, but internally during split calls force_reg whenever the second operand is not x86_64_hilo_general_operand. This required (benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P. 2022-08-09 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.cc (scalar_chain::convert_compare): Create new pseudos only when/if needed. Add support for TEST, i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST. When broadcasting V2DImode and V4SImode use new pseudo register. (timode_scalar_chain::convert_op): Do nothing if operand is already V1TImode. Avoid generating useless SUBREG conversions, i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT in addition to CONST_INT by using CONST_SCALAR_INT_P. (convertible_comparison_p): Use CONST_SCALAR_INT_P to match both CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword pattern as an STV candidate. (timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P operands in binary logic operations. * config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs for UNSPEC_PTEST; a PTEST that performs an AND has the same cost as regular PTEST, i.e. cost->sse_op. * config/i386/i386.md (*testti_doubleword): New pre-reload define_insn_and_split that recognizes comparison of TI mode AND against zero. * config/i386/sse.md (*ptest<mode>_and): New pre-reload define_insn_and_split that recognizes UNSPEC_PTEST of identical AND operands. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
rtx src, tmp;
PR target/70321: Split double word equality/inequality after STV on x86. This patch resolves the last piece of PR target/70321 a code quality (P2 regression) affecting mainline. Currently, for HJ's testcase: void foo (long long ixi) { if (ixi != 14348907) __builtin_abort (); } GCC with -m32 -O2 generates four instructions for the comparison: movl 16(%esp), %eax movl 20(%esp), %edx xorl $14348907, %eax orl %eax, %edx but with this patch it now requires only three, making better use of x86's addressing modes: movl 16(%esp), %eax xorl $14348907, %eax orl 20(%esp), %eax The solution is to expand "doubleword" equality/inequality expressions using flag setting COMPARE instructions for the early RTL passes, and then split them during split1, after STV and before reload. Hence on x86_64, we now see/allow things like: (insn 11 8 12 2 (set (reg:CCZ 17 flags) (compare:CCZ (reg/v:TI 84 [ x ]) (reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword} This allows the STV pass to decide whether it's preferrable to perform this comparison using vector operations, i.e. a pxor/ptest sequence, or as scalar integer operations, i.e. a xor/xor/or sequence. Alas this required tweaking of the STV pass to recognize the "new" form of these comparisons and split out the pxor operation itself. To confirm this still works as expected I've added a new STV test case: long long a[1024]; long long b[1024]; int foo() { for (int i=0; i<1024; i++) { long long t = (a[i]<<8) | (b[i]<<24); if (t == 0) return 1; } return 0; } where with -m32 -O2 -msse4.1 the above comparison with zero should look like: punpcklqdq %xmm0, %xmm0 ptest %xmm0, %xmm0 Although this patch includes one or two minor tweaks to provide all the necessary infrastructure to support conversion of TImode comparisons to V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement these transformations, but this is something that can be considered after stage 4. Indeed the new convert_compare functionality is split out into a method to simplify its potential reuse by the timode_scalar_chain class. 2022-05-30 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/70321 * config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose DI mode equality/inequality using XOR here. Instead generate a COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode). * config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode. (general_scalar_chain::convert_compare): New function to convert scalar equality/inequality comparison into vector operations. (general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call new convert_compare helper method. (convertible_comparion_p): Update to match doubleword COMPARE of two register, memory or integer constant operands. * config/i386/i386-features.h (general_scalar_chain::convert_compare): Prototype/declare member function here. * config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but only allow new doubleword modes for EQ and NE operators. (*cmp<dwi>_doubleword): New define_insn_and_split, to split a doubleword comparison into a pair of XORs followed by an IOR to set the (zero) flags register, optimizing the XORs if possible. * config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode iterator; V_AVX is (currently) only used by ptest. (sse4_1 mode attribute): Update to support V1TI and V2TI. gcc/testsuite/ChangeLog PR target/70321 * gcc.target/i386/pr70321.c: New test case. * gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
/* Comparison against anything other than zero, requires an XOR. */
if (op2 != const0_rtx)
{
convert_op (&op1, insn);
PR target/70321: Split double word equality/inequality after STV on x86. This patch resolves the last piece of PR target/70321 a code quality (P2 regression) affecting mainline. Currently, for HJ's testcase: void foo (long long ixi) { if (ixi != 14348907) __builtin_abort (); } GCC with -m32 -O2 generates four instructions for the comparison: movl 16(%esp), %eax movl 20(%esp), %edx xorl $14348907, %eax orl %eax, %edx but with this patch it now requires only three, making better use of x86's addressing modes: movl 16(%esp), %eax xorl $14348907, %eax orl 20(%esp), %eax The solution is to expand "doubleword" equality/inequality expressions using flag setting COMPARE instructions for the early RTL passes, and then split them during split1, after STV and before reload. Hence on x86_64, we now see/allow things like: (insn 11 8 12 2 (set (reg:CCZ 17 flags) (compare:CCZ (reg/v:TI 84 [ x ]) (reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword} This allows the STV pass to decide whether it's preferrable to perform this comparison using vector operations, i.e. a pxor/ptest sequence, or as scalar integer operations, i.e. a xor/xor/or sequence. Alas this required tweaking of the STV pass to recognize the "new" form of these comparisons and split out the pxor operation itself. To confirm this still works as expected I've added a new STV test case: long long a[1024]; long long b[1024]; int foo() { for (int i=0; i<1024; i++) { long long t = (a[i]<<8) | (b[i]<<24); if (t == 0) return 1; } return 0; } where with -m32 -O2 -msse4.1 the above comparison with zero should look like: punpcklqdq %xmm0, %xmm0 ptest %xmm0, %xmm0 Although this patch includes one or two minor tweaks to provide all the necessary infrastructure to support conversion of TImode comparisons to V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement these transformations, but this is something that can be considered after stage 4. Indeed the new convert_compare functionality is split out into a method to simplify its potential reuse by the timode_scalar_chain class. 2022-05-30 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/70321 * config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose DI mode equality/inequality using XOR here. Instead generate a COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode). * config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode. (general_scalar_chain::convert_compare): New function to convert scalar equality/inequality comparison into vector operations. (general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call new convert_compare helper method. (convertible_comparion_p): Update to match doubleword COMPARE of two register, memory or integer constant operands. * config/i386/i386-features.h (general_scalar_chain::convert_compare): Prototype/declare member function here. * config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but only allow new doubleword modes for EQ and NE operators. (*cmp<dwi>_doubleword): New define_insn_and_split, to split a doubleword comparison into a pair of XORs followed by an IOR to set the (zero) flags register, optimizing the XORs if possible. * config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode iterator; V_AVX is (currently) only used by ptest. (sse4_1 mode attribute): Update to support V1TI and V2TI. gcc/testsuite/ChangeLog PR target/70321 * gcc.target/i386/pr70321.c: New test case. * gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
convert_op (&op2, insn);
/* If both operands are MEMs, explicitly load the OP1 into TMP. */
if (MEM_P (op1) && MEM_P (op2))
{
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64. This x86_64 backend patch allows TImode STV to take advantage of the fact that the PTEST instruction performs an AND operation. Previously PTEST was (mostly) used for comparison against zero, by using the same operands. The benefits are demonstrated by the new test case: __int128 a,b; int foo() { return (a & b) != 0; } Currently with -O2 -msse4 we generate: movdqa a(%rip), %xmm0 pand b(%rip), %xmm0 xorl %eax, %eax ptest %xmm0, %xmm0 setne %al ret with this patch we now generate: movdqa a(%rip), %xmm0 xorl %eax, %eax ptest b(%rip), %xmm0 setne %al ret Technically, the magic happens using new define_insn_and_split patterns. Using two patterns allows this transformation to performed independently of whether TImode STV is run before or after combine. The one tricky case is that immediate constant operands of the AND behave slightly differently between TImode and V1TImode: All V1TImode immediate operands becomes loads, but for TImode only values that are not hilo_operands need to be loaded. Hence the new *testti_doubleword accepts any general_operand, but internally during split calls force_reg whenever the second operand is not x86_64_hilo_general_operand. This required (benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P. 2022-08-09 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.cc (scalar_chain::convert_compare): Create new pseudos only when/if needed. Add support for TEST, i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST. When broadcasting V2DImode and V4SImode use new pseudo register. (timode_scalar_chain::convert_op): Do nothing if operand is already V1TImode. Avoid generating useless SUBREG conversions, i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT in addition to CONST_INT by using CONST_SCALAR_INT_P. (convertible_comparison_p): Use CONST_SCALAR_INT_P to match both CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword pattern as an STV candidate. (timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P operands in binary logic operations. * config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs for UNSPEC_PTEST; a PTEST that performs an AND has the same cost as regular PTEST, i.e. cost->sse_op. * config/i386/i386.md (*testti_doubleword): New pre-reload define_insn_and_split that recognizes comparison of TI mode AND against zero. * config/i386/sse.md (*ptest<mode>_and): New pre-reload define_insn_and_split that recognizes UNSPEC_PTEST of identical AND operands. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
tmp = gen_reg_rtx (vmode);
PR target/70321: Split double word equality/inequality after STV on x86. This patch resolves the last piece of PR target/70321 a code quality (P2 regression) affecting mainline. Currently, for HJ's testcase: void foo (long long ixi) { if (ixi != 14348907) __builtin_abort (); } GCC with -m32 -O2 generates four instructions for the comparison: movl 16(%esp), %eax movl 20(%esp), %edx xorl $14348907, %eax orl %eax, %edx but with this patch it now requires only three, making better use of x86's addressing modes: movl 16(%esp), %eax xorl $14348907, %eax orl 20(%esp), %eax The solution is to expand "doubleword" equality/inequality expressions using flag setting COMPARE instructions for the early RTL passes, and then split them during split1, after STV and before reload. Hence on x86_64, we now see/allow things like: (insn 11 8 12 2 (set (reg:CCZ 17 flags) (compare:CCZ (reg/v:TI 84 [ x ]) (reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword} This allows the STV pass to decide whether it's preferrable to perform this comparison using vector operations, i.e. a pxor/ptest sequence, or as scalar integer operations, i.e. a xor/xor/or sequence. Alas this required tweaking of the STV pass to recognize the "new" form of these comparisons and split out the pxor operation itself. To confirm this still works as expected I've added a new STV test case: long long a[1024]; long long b[1024]; int foo() { for (int i=0; i<1024; i++) { long long t = (a[i]<<8) | (b[i]<<24); if (t == 0) return 1; } return 0; } where with -m32 -O2 -msse4.1 the above comparison with zero should look like: punpcklqdq %xmm0, %xmm0 ptest %xmm0, %xmm0 Although this patch includes one or two minor tweaks to provide all the necessary infrastructure to support conversion of TImode comparisons to V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement these transformations, but this is something that can be considered after stage 4. Indeed the new convert_compare functionality is split out into a method to simplify its potential reuse by the timode_scalar_chain class. 2022-05-30 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/70321 * config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose DI mode equality/inequality using XOR here. Instead generate a COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode). * config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode. (general_scalar_chain::convert_compare): New function to convert scalar equality/inequality comparison into vector operations. (general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call new convert_compare helper method. (convertible_comparion_p): Update to match doubleword COMPARE of two register, memory or integer constant operands. * config/i386/i386-features.h (general_scalar_chain::convert_compare): Prototype/declare member function here. * config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but only allow new doubleword modes for EQ and NE operators. (*cmp<dwi>_doubleword): New define_insn_and_split, to split a doubleword comparison into a pair of XORs followed by an IOR to set the (zero) flags register, optimizing the XORs if possible. * config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode iterator; V_AVX is (currently) only used by ptest. (sse4_1 mode attribute): Update to support V1TI and V2TI. gcc/testsuite/ChangeLog PR target/70321 * gcc.target/i386/pr70321.c: New test case. * gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
emit_insn_before (gen_rtx_SET (tmp, op1), insn);
src = tmp;
}
else
src = op1;
src = gen_rtx_XOR (vmode, src, op2);
}
else if (GET_CODE (op1) == AND
&& GET_CODE (XEXP (op1, 0)) == NOT)
{
rtx op11 = XEXP (XEXP (op1, 0), 0);
rtx op12 = XEXP (op1, 1);
convert_op (&op11, insn);
convert_op (&op12, insn);
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64. This x86_64 backend patch allows TImode STV to take advantage of the fact that the PTEST instruction performs an AND operation. Previously PTEST was (mostly) used for comparison against zero, by using the same operands. The benefits are demonstrated by the new test case: __int128 a,b; int foo() { return (a & b) != 0; } Currently with -O2 -msse4 we generate: movdqa a(%rip), %xmm0 pand b(%rip), %xmm0 xorl %eax, %eax ptest %xmm0, %xmm0 setne %al ret with this patch we now generate: movdqa a(%rip), %xmm0 xorl %eax, %eax ptest b(%rip), %xmm0 setne %al ret Technically, the magic happens using new define_insn_and_split patterns. Using two patterns allows this transformation to performed independently of whether TImode STV is run before or after combine. The one tricky case is that immediate constant operands of the AND behave slightly differently between TImode and V1TImode: All V1TImode immediate operands becomes loads, but for TImode only values that are not hilo_operands need to be loaded. Hence the new *testti_doubleword accepts any general_operand, but internally during split calls force_reg whenever the second operand is not x86_64_hilo_general_operand. This required (benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P. 2022-08-09 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.cc (scalar_chain::convert_compare): Create new pseudos only when/if needed. Add support for TEST, i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST. When broadcasting V2DImode and V4SImode use new pseudo register. (timode_scalar_chain::convert_op): Do nothing if operand is already V1TImode. Avoid generating useless SUBREG conversions, i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT in addition to CONST_INT by using CONST_SCALAR_INT_P. (convertible_comparison_p): Use CONST_SCALAR_INT_P to match both CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword pattern as an STV candidate. (timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P operands in binary logic operations. * config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs for UNSPEC_PTEST; a PTEST that performs an AND has the same cost as regular PTEST, i.e. cost->sse_op. * config/i386/i386.md (*testti_doubleword): New pre-reload define_insn_and_split that recognizes comparison of TI mode AND against zero. * config/i386/sse.md (*ptest<mode>_and): New pre-reload define_insn_and_split that recognizes UNSPEC_PTEST of identical AND operands. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
if (!REG_P (op11))
{
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64. This x86_64 backend patch allows TImode STV to take advantage of the fact that the PTEST instruction performs an AND operation. Previously PTEST was (mostly) used for comparison against zero, by using the same operands. The benefits are demonstrated by the new test case: __int128 a,b; int foo() { return (a & b) != 0; } Currently with -O2 -msse4 we generate: movdqa a(%rip), %xmm0 pand b(%rip), %xmm0 xorl %eax, %eax ptest %xmm0, %xmm0 setne %al ret with this patch we now generate: movdqa a(%rip), %xmm0 xorl %eax, %eax ptest b(%rip), %xmm0 setne %al ret Technically, the magic happens using new define_insn_and_split patterns. Using two patterns allows this transformation to performed independently of whether TImode STV is run before or after combine. The one tricky case is that immediate constant operands of the AND behave slightly differently between TImode and V1TImode: All V1TImode immediate operands becomes loads, but for TImode only values that are not hilo_operands need to be loaded. Hence the new *testti_doubleword accepts any general_operand, but internally during split calls force_reg whenever the second operand is not x86_64_hilo_general_operand. This required (benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P. 2022-08-09 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.cc (scalar_chain::convert_compare): Create new pseudos only when/if needed. Add support for TEST, i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST. When broadcasting V2DImode and V4SImode use new pseudo register. (timode_scalar_chain::convert_op): Do nothing if operand is already V1TImode. Avoid generating useless SUBREG conversions, i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT in addition to CONST_INT by using CONST_SCALAR_INT_P. (convertible_comparison_p): Use CONST_SCALAR_INT_P to match both CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword pattern as an STV candidate. (timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P operands in binary logic operations. * config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs for UNSPEC_PTEST; a PTEST that performs an AND has the same cost as regular PTEST, i.e. cost->sse_op. * config/i386/i386.md (*testti_doubleword): New pre-reload define_insn_and_split that recognizes comparison of TI mode AND against zero. * config/i386/sse.md (*ptest<mode>_and): New pre-reload define_insn_and_split that recognizes UNSPEC_PTEST of identical AND operands. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
tmp = gen_reg_rtx (vmode);
emit_insn_before (gen_rtx_SET (tmp, op11), insn);
op11 = tmp;
}
src = gen_rtx_AND (vmode, gen_rtx_NOT (vmode, op11), op12);
}
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64. This x86_64 backend patch allows TImode STV to take advantage of the fact that the PTEST instruction performs an AND operation. Previously PTEST was (mostly) used for comparison against zero, by using the same operands. The benefits are demonstrated by the new test case: __int128 a,b; int foo() { return (a & b) != 0; } Currently with -O2 -msse4 we generate: movdqa a(%rip), %xmm0 pand b(%rip), %xmm0 xorl %eax, %eax ptest %xmm0, %xmm0 setne %al ret with this patch we now generate: movdqa a(%rip), %xmm0 xorl %eax, %eax ptest b(%rip), %xmm0 setne %al ret Technically, the magic happens using new define_insn_and_split patterns. Using two patterns allows this transformation to performed independently of whether TImode STV is run before or after combine. The one tricky case is that immediate constant operands of the AND behave slightly differently between TImode and V1TImode: All V1TImode immediate operands becomes loads, but for TImode only values that are not hilo_operands need to be loaded. Hence the new *testti_doubleword accepts any general_operand, but internally during split calls force_reg whenever the second operand is not x86_64_hilo_general_operand. This required (benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P. 2022-08-09 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.cc (scalar_chain::convert_compare): Create new pseudos only when/if needed. Add support for TEST, i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST. When broadcasting V2DImode and V4SImode use new pseudo register. (timode_scalar_chain::convert_op): Do nothing if operand is already V1TImode. Avoid generating useless SUBREG conversions, i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT in addition to CONST_INT by using CONST_SCALAR_INT_P. (convertible_comparison_p): Use CONST_SCALAR_INT_P to match both CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword pattern as an STV candidate. (timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P operands in binary logic operations. * config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs for UNSPEC_PTEST; a PTEST that performs an AND has the same cost as regular PTEST, i.e. cost->sse_op. * config/i386/i386.md (*testti_doubleword): New pre-reload define_insn_and_split that recognizes comparison of TI mode AND against zero. * config/i386/sse.md (*ptest<mode>_and): New pre-reload define_insn_and_split that recognizes UNSPEC_PTEST of identical AND operands. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
else if (GET_CODE (op1) == AND)
{
rtx op11 = XEXP (op1, 0);
rtx op12 = XEXP (op1, 1);
convert_op (&op11, insn);
convert_op (&op12, insn);
if (!REG_P (op11))
{
tmp = gen_reg_rtx (vmode);
emit_insn_before (gen_rtx_SET (tmp, op11), insn);
op11 = tmp;
}
return gen_rtx_UNSPEC (CCmode, gen_rtvec (2, op11, op12),
UNSPEC_PTEST);
}
PR target/70321: Split double word equality/inequality after STV on x86. This patch resolves the last piece of PR target/70321 a code quality (P2 regression) affecting mainline. Currently, for HJ's testcase: void foo (long long ixi) { if (ixi != 14348907) __builtin_abort (); } GCC with -m32 -O2 generates four instructions for the comparison: movl 16(%esp), %eax movl 20(%esp), %edx xorl $14348907, %eax orl %eax, %edx but with this patch it now requires only three, making better use of x86's addressing modes: movl 16(%esp), %eax xorl $14348907, %eax orl 20(%esp), %eax The solution is to expand "doubleword" equality/inequality expressions using flag setting COMPARE instructions for the early RTL passes, and then split them during split1, after STV and before reload. Hence on x86_64, we now see/allow things like: (insn 11 8 12 2 (set (reg:CCZ 17 flags) (compare:CCZ (reg/v:TI 84 [ x ]) (reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword} This allows the STV pass to decide whether it's preferrable to perform this comparison using vector operations, i.e. a pxor/ptest sequence, or as scalar integer operations, i.e. a xor/xor/or sequence. Alas this required tweaking of the STV pass to recognize the "new" form of these comparisons and split out the pxor operation itself. To confirm this still works as expected I've added a new STV test case: long long a[1024]; long long b[1024]; int foo() { for (int i=0; i<1024; i++) { long long t = (a[i]<<8) | (b[i]<<24); if (t == 0) return 1; } return 0; } where with -m32 -O2 -msse4.1 the above comparison with zero should look like: punpcklqdq %xmm0, %xmm0 ptest %xmm0, %xmm0 Although this patch includes one or two minor tweaks to provide all the necessary infrastructure to support conversion of TImode comparisons to V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement these transformations, but this is something that can be considered after stage 4. Indeed the new convert_compare functionality is split out into a method to simplify its potential reuse by the timode_scalar_chain class. 2022-05-30 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/70321 * config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose DI mode equality/inequality using XOR here. Instead generate a COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode). * config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode. (general_scalar_chain::convert_compare): New function to convert scalar equality/inequality comparison into vector operations. (general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call new convert_compare helper method. (convertible_comparion_p): Update to match doubleword COMPARE of two register, memory or integer constant operands. * config/i386/i386-features.h (general_scalar_chain::convert_compare): Prototype/declare member function here. * config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but only allow new doubleword modes for EQ and NE operators. (*cmp<dwi>_doubleword): New define_insn_and_split, to split a doubleword comparison into a pair of XORs followed by an IOR to set the (zero) flags register, optimizing the XORs if possible. * config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode iterator; V_AVX is (currently) only used by ptest. (sse4_1 mode attribute): Update to support V1TI and V2TI. gcc/testsuite/ChangeLog PR target/70321 * gcc.target/i386/pr70321.c: New test case. * gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
else
{
convert_op (&op1, insn);
src = op1;
}
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64. This x86_64 backend patch allows TImode STV to take advantage of the fact that the PTEST instruction performs an AND operation. Previously PTEST was (mostly) used for comparison against zero, by using the same operands. The benefits are demonstrated by the new test case: __int128 a,b; int foo() { return (a & b) != 0; } Currently with -O2 -msse4 we generate: movdqa a(%rip), %xmm0 pand b(%rip), %xmm0 xorl %eax, %eax ptest %xmm0, %xmm0 setne %al ret with this patch we now generate: movdqa a(%rip), %xmm0 xorl %eax, %eax ptest b(%rip), %xmm0 setne %al ret Technically, the magic happens using new define_insn_and_split patterns. Using two patterns allows this transformation to performed independently of whether TImode STV is run before or after combine. The one tricky case is that immediate constant operands of the AND behave slightly differently between TImode and V1TImode: All V1TImode immediate operands becomes loads, but for TImode only values that are not hilo_operands need to be loaded. Hence the new *testti_doubleword accepts any general_operand, but internally during split calls force_reg whenever the second operand is not x86_64_hilo_general_operand. This required (benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P. 2022-08-09 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.cc (scalar_chain::convert_compare): Create new pseudos only when/if needed. Add support for TEST, i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST. When broadcasting V2DImode and V4SImode use new pseudo register. (timode_scalar_chain::convert_op): Do nothing if operand is already V1TImode. Avoid generating useless SUBREG conversions, i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT in addition to CONST_INT by using CONST_SCALAR_INT_P. (convertible_comparison_p): Use CONST_SCALAR_INT_P to match both CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword pattern as an STV candidate. (timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P operands in binary logic operations. * config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs for UNSPEC_PTEST; a PTEST that performs an AND has the same cost as regular PTEST, i.e. cost->sse_op. * config/i386/i386.md (*testti_doubleword): New pre-reload define_insn_and_split that recognizes comparison of TI mode AND against zero. * config/i386/sse.md (*ptest<mode>_and): New pre-reload define_insn_and_split that recognizes UNSPEC_PTEST of identical AND operands. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
if (!REG_P (src))
{
tmp = gen_reg_rtx (vmode);
emit_insn_before (gen_rtx_SET (tmp, src), insn);
src = tmp;
}
PR target/70321: Split double word equality/inequality after STV on x86. This patch resolves the last piece of PR target/70321 a code quality (P2 regression) affecting mainline. Currently, for HJ's testcase: void foo (long long ixi) { if (ixi != 14348907) __builtin_abort (); } GCC with -m32 -O2 generates four instructions for the comparison: movl 16(%esp), %eax movl 20(%esp), %edx xorl $14348907, %eax orl %eax, %edx but with this patch it now requires only three, making better use of x86's addressing modes: movl 16(%esp), %eax xorl $14348907, %eax orl 20(%esp), %eax The solution is to expand "doubleword" equality/inequality expressions using flag setting COMPARE instructions for the early RTL passes, and then split them during split1, after STV and before reload. Hence on x86_64, we now see/allow things like: (insn 11 8 12 2 (set (reg:CCZ 17 flags) (compare:CCZ (reg/v:TI 84 [ x ]) (reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword} This allows the STV pass to decide whether it's preferrable to perform this comparison using vector operations, i.e. a pxor/ptest sequence, or as scalar integer operations, i.e. a xor/xor/or sequence. Alas this required tweaking of the STV pass to recognize the "new" form of these comparisons and split out the pxor operation itself. To confirm this still works as expected I've added a new STV test case: long long a[1024]; long long b[1024]; int foo() { for (int i=0; i<1024; i++) { long long t = (a[i]<<8) | (b[i]<<24); if (t == 0) return 1; } return 0; } where with -m32 -O2 -msse4.1 the above comparison with zero should look like: punpcklqdq %xmm0, %xmm0 ptest %xmm0, %xmm0 Although this patch includes one or two minor tweaks to provide all the necessary infrastructure to support conversion of TImode comparisons to V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement these transformations, but this is something that can be considered after stage 4. Indeed the new convert_compare functionality is split out into a method to simplify its potential reuse by the timode_scalar_chain class. 2022-05-30 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/70321 * config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose DI mode equality/inequality using XOR here. Instead generate a COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode). * config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode. (general_scalar_chain::convert_compare): New function to convert scalar equality/inequality comparison into vector operations. (general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call new convert_compare helper method. (convertible_comparion_p): Update to match doubleword COMPARE of two register, memory or integer constant operands. * config/i386/i386-features.h (general_scalar_chain::convert_compare): Prototype/declare member function here. * config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but only allow new doubleword modes for EQ and NE operators. (*cmp<dwi>_doubleword): New define_insn_and_split, to split a doubleword comparison into a pair of XORs followed by an IOR to set the (zero) flags register, optimizing the XORs if possible. * config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode iterator; V_AVX is (currently) only used by ptest. (sse4_1 mode attribute): Update to support V1TI and V2TI. gcc/testsuite/ChangeLog PR target/70321 * gcc.target/i386/pr70321.c: New test case. * gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
if (vmode == V2DImode)
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64. This x86_64 backend patch allows TImode STV to take advantage of the fact that the PTEST instruction performs an AND operation. Previously PTEST was (mostly) used for comparison against zero, by using the same operands. The benefits are demonstrated by the new test case: __int128 a,b; int foo() { return (a & b) != 0; } Currently with -O2 -msse4 we generate: movdqa a(%rip), %xmm0 pand b(%rip), %xmm0 xorl %eax, %eax ptest %xmm0, %xmm0 setne %al ret with this patch we now generate: movdqa a(%rip), %xmm0 xorl %eax, %eax ptest b(%rip), %xmm0 setne %al ret Technically, the magic happens using new define_insn_and_split patterns. Using two patterns allows this transformation to performed independently of whether TImode STV is run before or after combine. The one tricky case is that immediate constant operands of the AND behave slightly differently between TImode and V1TImode: All V1TImode immediate operands becomes loads, but for TImode only values that are not hilo_operands need to be loaded. Hence the new *testti_doubleword accepts any general_operand, but internally during split calls force_reg whenever the second operand is not x86_64_hilo_general_operand. This required (benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P. 2022-08-09 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.cc (scalar_chain::convert_compare): Create new pseudos only when/if needed. Add support for TEST, i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST. When broadcasting V2DImode and V4SImode use new pseudo register. (timode_scalar_chain::convert_op): Do nothing if operand is already V1TImode. Avoid generating useless SUBREG conversions, i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT in addition to CONST_INT by using CONST_SCALAR_INT_P. (convertible_comparison_p): Use CONST_SCALAR_INT_P to match both CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword pattern as an STV candidate. (timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P operands in binary logic operations. * config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs for UNSPEC_PTEST; a PTEST that performs an AND has the same cost as regular PTEST, i.e. cost->sse_op. * config/i386/i386.md (*testti_doubleword): New pre-reload define_insn_and_split that recognizes comparison of TI mode AND against zero. * config/i386/sse.md (*ptest<mode>_and): New pre-reload define_insn_and_split that recognizes UNSPEC_PTEST of identical AND operands. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
{
tmp = gen_reg_rtx (vmode);
emit_insn_before (gen_vec_interleave_lowv2di (tmp, src, src), insn);
src = tmp;
}
PR target/70321: Split double word equality/inequality after STV on x86. This patch resolves the last piece of PR target/70321 a code quality (P2 regression) affecting mainline. Currently, for HJ's testcase: void foo (long long ixi) { if (ixi != 14348907) __builtin_abort (); } GCC with -m32 -O2 generates four instructions for the comparison: movl 16(%esp), %eax movl 20(%esp), %edx xorl $14348907, %eax orl %eax, %edx but with this patch it now requires only three, making better use of x86's addressing modes: movl 16(%esp), %eax xorl $14348907, %eax orl 20(%esp), %eax The solution is to expand "doubleword" equality/inequality expressions using flag setting COMPARE instructions for the early RTL passes, and then split them during split1, after STV and before reload. Hence on x86_64, we now see/allow things like: (insn 11 8 12 2 (set (reg:CCZ 17 flags) (compare:CCZ (reg/v:TI 84 [ x ]) (reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword} This allows the STV pass to decide whether it's preferrable to perform this comparison using vector operations, i.e. a pxor/ptest sequence, or as scalar integer operations, i.e. a xor/xor/or sequence. Alas this required tweaking of the STV pass to recognize the "new" form of these comparisons and split out the pxor operation itself. To confirm this still works as expected I've added a new STV test case: long long a[1024]; long long b[1024]; int foo() { for (int i=0; i<1024; i++) { long long t = (a[i]<<8) | (b[i]<<24); if (t == 0) return 1; } return 0; } where with -m32 -O2 -msse4.1 the above comparison with zero should look like: punpcklqdq %xmm0, %xmm0 ptest %xmm0, %xmm0 Although this patch includes one or two minor tweaks to provide all the necessary infrastructure to support conversion of TImode comparisons to V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement these transformations, but this is something that can be considered after stage 4. Indeed the new convert_compare functionality is split out into a method to simplify its potential reuse by the timode_scalar_chain class. 2022-05-30 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/70321 * config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose DI mode equality/inequality using XOR here. Instead generate a COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode). * config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode. (general_scalar_chain::convert_compare): New function to convert scalar equality/inequality comparison into vector operations. (general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call new convert_compare helper method. (convertible_comparion_p): Update to match doubleword COMPARE of two register, memory or integer constant operands. * config/i386/i386-features.h (general_scalar_chain::convert_compare): Prototype/declare member function here. * config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but only allow new doubleword modes for EQ and NE operators. (*cmp<dwi>_doubleword): New define_insn_and_split, to split a doubleword comparison into a pair of XORs followed by an IOR to set the (zero) flags register, optimizing the XORs if possible. * config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode iterator; V_AVX is (currently) only used by ptest. (sse4_1 mode attribute): Update to support V1TI and V2TI. gcc/testsuite/ChangeLog PR target/70321 * gcc.target/i386/pr70321.c: New test case. * gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
else if (vmode == V4SImode)
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64. This x86_64 backend patch allows TImode STV to take advantage of the fact that the PTEST instruction performs an AND operation. Previously PTEST was (mostly) used for comparison against zero, by using the same operands. The benefits are demonstrated by the new test case: __int128 a,b; int foo() { return (a & b) != 0; } Currently with -O2 -msse4 we generate: movdqa a(%rip), %xmm0 pand b(%rip), %xmm0 xorl %eax, %eax ptest %xmm0, %xmm0 setne %al ret with this patch we now generate: movdqa a(%rip), %xmm0 xorl %eax, %eax ptest b(%rip), %xmm0 setne %al ret Technically, the magic happens using new define_insn_and_split patterns. Using two patterns allows this transformation to performed independently of whether TImode STV is run before or after combine. The one tricky case is that immediate constant operands of the AND behave slightly differently between TImode and V1TImode: All V1TImode immediate operands becomes loads, but for TImode only values that are not hilo_operands need to be loaded. Hence the new *testti_doubleword accepts any general_operand, but internally during split calls force_reg whenever the second operand is not x86_64_hilo_general_operand. This required (benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P. 2022-08-09 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.cc (scalar_chain::convert_compare): Create new pseudos only when/if needed. Add support for TEST, i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST. When broadcasting V2DImode and V4SImode use new pseudo register. (timode_scalar_chain::convert_op): Do nothing if operand is already V1TImode. Avoid generating useless SUBREG conversions, i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT in addition to CONST_INT by using CONST_SCALAR_INT_P. (convertible_comparison_p): Use CONST_SCALAR_INT_P to match both CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword pattern as an STV candidate. (timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P operands in binary logic operations. * config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs for UNSPEC_PTEST; a PTEST that performs an AND has the same cost as regular PTEST, i.e. cost->sse_op. * config/i386/i386.md (*testti_doubleword): New pre-reload define_insn_and_split that recognizes comparison of TI mode AND against zero. * config/i386/sse.md (*ptest<mode>_and): New pre-reload define_insn_and_split that recognizes UNSPEC_PTEST of identical AND operands. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
{
tmp = gen_reg_rtx (vmode);
emit_insn_before (gen_sse2_pshufd (tmp, src, const0_rtx), insn);
src = tmp;
}
return gen_rtx_UNSPEC (CCmode, gen_rtvec (2, src, src), UNSPEC_PTEST);
PR target/70321: Split double word equality/inequality after STV on x86. This patch resolves the last piece of PR target/70321 a code quality (P2 regression) affecting mainline. Currently, for HJ's testcase: void foo (long long ixi) { if (ixi != 14348907) __builtin_abort (); } GCC with -m32 -O2 generates four instructions for the comparison: movl 16(%esp), %eax movl 20(%esp), %edx xorl $14348907, %eax orl %eax, %edx but with this patch it now requires only three, making better use of x86's addressing modes: movl 16(%esp), %eax xorl $14348907, %eax orl 20(%esp), %eax The solution is to expand "doubleword" equality/inequality expressions using flag setting COMPARE instructions for the early RTL passes, and then split them during split1, after STV and before reload. Hence on x86_64, we now see/allow things like: (insn 11 8 12 2 (set (reg:CCZ 17 flags) (compare:CCZ (reg/v:TI 84 [ x ]) (reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword} This allows the STV pass to decide whether it's preferrable to perform this comparison using vector operations, i.e. a pxor/ptest sequence, or as scalar integer operations, i.e. a xor/xor/or sequence. Alas this required tweaking of the STV pass to recognize the "new" form of these comparisons and split out the pxor operation itself. To confirm this still works as expected I've added a new STV test case: long long a[1024]; long long b[1024]; int foo() { for (int i=0; i<1024; i++) { long long t = (a[i]<<8) | (b[i]<<24); if (t == 0) return 1; } return 0; } where with -m32 -O2 -msse4.1 the above comparison with zero should look like: punpcklqdq %xmm0, %xmm0 ptest %xmm0, %xmm0 Although this patch includes one or two minor tweaks to provide all the necessary infrastructure to support conversion of TImode comparisons to V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement these transformations, but this is something that can be considered after stage 4. Indeed the new convert_compare functionality is split out into a method to simplify its potential reuse by the timode_scalar_chain class. 2022-05-30 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/70321 * config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose DI mode equality/inequality using XOR here. Instead generate a COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode). * config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode. (general_scalar_chain::convert_compare): New function to convert scalar equality/inequality comparison into vector operations. (general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call new convert_compare helper method. (convertible_comparion_p): Update to match doubleword COMPARE of two register, memory or integer constant operands. * config/i386/i386-features.h (general_scalar_chain::convert_compare): Prototype/declare member function here. * config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but only allow new doubleword modes for EQ and NE operators. (*cmp<dwi>_doubleword): New define_insn_and_split, to split a doubleword comparison into a pair of XORs followed by an IOR to set the (zero) flags register, optimizing the XORs if possible. * config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode iterator; V_AVX is (currently) only used by ptest. (sse4_1 mode attribute): Update to support V1TI and V2TI. gcc/testsuite/ChangeLog PR target/70321 * gcc.target/i386/pr70321.c: New test case. * gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
}
PR target/106303: Fix TImode STV related failures on x86. This patch resolves PR target/106303 (and the related PRs 106347, 106404, 106407) which are ICEs caused by my improvements to x86_64's 128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies for the breakage. The issue is that data flow analysis is used to partition usage of each TImode pseudo into "chains", where each chain is analyzed and if suitable converted to vector operations. The problems appears when some chains for a pseudo are converted, and others aren't as RTL sharing can result in some mode changes leaking into other instructions that aren't/shouldn't/can't be converted, which eventually leads to an ICE for mismatched modes. My first approach to a fix was to unify more of the STV infrastructure, reasoning that if TImode STV was exhibiting these problems, but DImode and SImode STV weren't, the issue was likely to be caused/resolved by these remaining differences. This appeared to fix some but not all of the reported PRs. A better solution was then proposed by H.J. Lu in Bugzilla, that we need to iterate the removal of candidates in the function timode_remove_non_convertible_regs until there are no further changes. As each chain is removed from consideration, it in turn may affect whether other insns/chains can safely be converted. 2022-07-24 Roger Sayle <roger@nextmovesoftware.com> H.J. Lu <hjl.tools@gmail.com> gcc/ChangeLog PR target/106303 PR target/106347 * config/i386/i386-features.cc (make_vector_copies): Move from general_scalar_chain to scalar_chain. (convert_reg): Likewise. (convert_insn_common): New scalar_chain method split out from general_scalar_chain convert_insn. (convert_registers): Move from general_scalar_chain to scalar_chain. (scalar_chain::convert): Call convert_insn_common before calling convert_insn. (timode_remove_non_convertible_regs): Iterate until there are no further changes to the candidates. * config/i386/i386-features.h (scalar_chain::hash_map): Move from general_scalar_chain. (scalar_chain::convert_reg): Likewise. (scalar_chain::convert_insn_common): New shared method. (scalar_chain::make_vector_copies): Move from general_scalar_chain. (scalar_chain::convert_registers): Likewise. No longer virtual. (general_scalar_chain::hash_map): Delete. Moved to scalar_chain. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::make_vector_copies): Likewise. (general_scalar_chain::convert_registers): Delete virtual method. (timode_scalar_chain::convert_registers): Likewise. gcc/testsuite/ChangeLog PR target/106303 PR target/106347 * gcc.target/i386/pr106303.c: New test case. * gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
/* Helper function for converting INSN to vector mode. */
void
PR target/106303: Fix TImode STV related failures on x86. This patch resolves PR target/106303 (and the related PRs 106347, 106404, 106407) which are ICEs caused by my improvements to x86_64's 128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies for the breakage. The issue is that data flow analysis is used to partition usage of each TImode pseudo into "chains", where each chain is analyzed and if suitable converted to vector operations. The problems appears when some chains for a pseudo are converted, and others aren't as RTL sharing can result in some mode changes leaking into other instructions that aren't/shouldn't/can't be converted, which eventually leads to an ICE for mismatched modes. My first approach to a fix was to unify more of the STV infrastructure, reasoning that if TImode STV was exhibiting these problems, but DImode and SImode STV weren't, the issue was likely to be caused/resolved by these remaining differences. This appeared to fix some but not all of the reported PRs. A better solution was then proposed by H.J. Lu in Bugzilla, that we need to iterate the removal of candidates in the function timode_remove_non_convertible_regs until there are no further changes. As each chain is removed from consideration, it in turn may affect whether other insns/chains can safely be converted. 2022-07-24 Roger Sayle <roger@nextmovesoftware.com> H.J. Lu <hjl.tools@gmail.com> gcc/ChangeLog PR target/106303 PR target/106347 * config/i386/i386-features.cc (make_vector_copies): Move from general_scalar_chain to scalar_chain. (convert_reg): Likewise. (convert_insn_common): New scalar_chain method split out from general_scalar_chain convert_insn. (convert_registers): Move from general_scalar_chain to scalar_chain. (scalar_chain::convert): Call convert_insn_common before calling convert_insn. (timode_remove_non_convertible_regs): Iterate until there are no further changes to the candidates. * config/i386/i386-features.h (scalar_chain::hash_map): Move from general_scalar_chain. (scalar_chain::convert_reg): Likewise. (scalar_chain::convert_insn_common): New shared method. (scalar_chain::make_vector_copies): Move from general_scalar_chain. (scalar_chain::convert_registers): Likewise. No longer virtual. (general_scalar_chain::hash_map): Delete. Moved to scalar_chain. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::make_vector_copies): Likewise. (general_scalar_chain::convert_registers): Delete virtual method. (timode_scalar_chain::convert_registers): Likewise. gcc/testsuite/ChangeLog PR target/106303 PR target/106347 * gcc.target/i386/pr106303.c: New test case. * gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
scalar_chain::convert_insn_common (rtx_insn *insn)
{
/* Generate copies for out-of-chain uses of defs and adjust debug uses. */
for (df_ref ref = DF_INSN_DEFS (insn); ref; ref = DF_REF_NEXT_LOC (ref))
if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
{
df_link *use;
for (use = DF_REF_CHAIN (ref); use; use = use->next)
if (NONDEBUG_INSN_P (DF_REF_INSN (use->ref))
&& (DF_REF_REG_MEM_P (use->ref)
|| !bitmap_bit_p (insns, DF_REF_INSN_UID (use->ref))))
break;
if (use)
convert_reg (insn, DF_REF_REG (ref),
*defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]));
else if (MAY_HAVE_DEBUG_BIND_INSNS)
{
/* If we generated a scalar copy we can leave debug-insns
as-is, if not, we have to adjust them. */
auto_vec<rtx_insn *, 5> to_reset_debug_insns;
for (use = DF_REF_CHAIN (ref); use; use = use->next)
if (DEBUG_INSN_P (DF_REF_INSN (use->ref)))
{
rtx_insn *debug_insn = DF_REF_INSN (use->ref);
/* If there's a reaching definition outside of the
chain we have to reset. */
df_link *def;
for (def = DF_REF_CHAIN (use->ref); def; def = def->next)
if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def->ref)))
break;
if (def)
to_reset_debug_insns.safe_push (debug_insn);
else
{
*DF_REF_REAL_LOC (use->ref)
= *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]);
df_insn_rescan (debug_insn);
}
}
/* Have to do the reset outside of the DF_CHAIN walk to not
disrupt it. */
while (!to_reset_debug_insns.is_empty ())
{
rtx_insn *debug_insn = to_reset_debug_insns.pop ();
INSN_VAR_LOCATION_LOC (debug_insn) = gen_rtx_UNKNOWN_VAR_LOC ();
df_insn_rescan_debug_internal (debug_insn);
}
}
}
/* Replace uses in this insn with the defs we use in the chain. */
for (df_ref ref = DF_INSN_USES (insn); ref; ref = DF_REF_NEXT_LOC (ref))
if (!DF_REF_REG_MEM_P (ref))
if (rtx *vreg = defs_map.get (regno_reg_rtx[DF_REF_REGNO (ref)]))
{
/* Also update a corresponding REG_DEAD note. */
rtx note = find_reg_note (insn, REG_DEAD, DF_REF_REG (ref));
if (note)
XEXP (note, 0) = *vreg;
*DF_REF_REAL_LOC (ref) = *vreg;
}
PR target/106303: Fix TImode STV related failures on x86. This patch resolves PR target/106303 (and the related PRs 106347, 106404, 106407) which are ICEs caused by my improvements to x86_64's 128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies for the breakage. The issue is that data flow analysis is used to partition usage of each TImode pseudo into "chains", where each chain is analyzed and if suitable converted to vector operations. The problems appears when some chains for a pseudo are converted, and others aren't as RTL sharing can result in some mode changes leaking into other instructions that aren't/shouldn't/can't be converted, which eventually leads to an ICE for mismatched modes. My first approach to a fix was to unify more of the STV infrastructure, reasoning that if TImode STV was exhibiting these problems, but DImode and SImode STV weren't, the issue was likely to be caused/resolved by these remaining differences. This appeared to fix some but not all of the reported PRs. A better solution was then proposed by H.J. Lu in Bugzilla, that we need to iterate the removal of candidates in the function timode_remove_non_convertible_regs until there are no further changes. As each chain is removed from consideration, it in turn may affect whether other insns/chains can safely be converted. 2022-07-24 Roger Sayle <roger@nextmovesoftware.com> H.J. Lu <hjl.tools@gmail.com> gcc/ChangeLog PR target/106303 PR target/106347 * config/i386/i386-features.cc (make_vector_copies): Move from general_scalar_chain to scalar_chain. (convert_reg): Likewise. (convert_insn_common): New scalar_chain method split out from general_scalar_chain convert_insn. (convert_registers): Move from general_scalar_chain to scalar_chain. (scalar_chain::convert): Call convert_insn_common before calling convert_insn. (timode_remove_non_convertible_regs): Iterate until there are no further changes to the candidates. * config/i386/i386-features.h (scalar_chain::hash_map): Move from general_scalar_chain. (scalar_chain::convert_reg): Likewise. (scalar_chain::convert_insn_common): New shared method. (scalar_chain::make_vector_copies): Move from general_scalar_chain. (scalar_chain::convert_registers): Likewise. No longer virtual. (general_scalar_chain::hash_map): Delete. Moved to scalar_chain. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::make_vector_copies): Likewise. (general_scalar_chain::convert_registers): Delete virtual method. (timode_scalar_chain::convert_registers): Likewise. gcc/testsuite/ChangeLog PR target/106303 PR target/106347 * gcc.target/i386/pr106303.c: New test case. * gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
}
/* Convert INSN to vector mode. */
PR target/106303: Fix TImode STV related failures on x86. This patch resolves PR target/106303 (and the related PRs 106347, 106404, 106407) which are ICEs caused by my improvements to x86_64's 128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies for the breakage. The issue is that data flow analysis is used to partition usage of each TImode pseudo into "chains", where each chain is analyzed and if suitable converted to vector operations. The problems appears when some chains for a pseudo are converted, and others aren't as RTL sharing can result in some mode changes leaking into other instructions that aren't/shouldn't/can't be converted, which eventually leads to an ICE for mismatched modes. My first approach to a fix was to unify more of the STV infrastructure, reasoning that if TImode STV was exhibiting these problems, but DImode and SImode STV weren't, the issue was likely to be caused/resolved by these remaining differences. This appeared to fix some but not all of the reported PRs. A better solution was then proposed by H.J. Lu in Bugzilla, that we need to iterate the removal of candidates in the function timode_remove_non_convertible_regs until there are no further changes. As each chain is removed from consideration, it in turn may affect whether other insns/chains can safely be converted. 2022-07-24 Roger Sayle <roger@nextmovesoftware.com> H.J. Lu <hjl.tools@gmail.com> gcc/ChangeLog PR target/106303 PR target/106347 * config/i386/i386-features.cc (make_vector_copies): Move from general_scalar_chain to scalar_chain. (convert_reg): Likewise. (convert_insn_common): New scalar_chain method split out from general_scalar_chain convert_insn. (convert_registers): Move from general_scalar_chain to scalar_chain. (scalar_chain::convert): Call convert_insn_common before calling convert_insn. (timode_remove_non_convertible_regs): Iterate until there are no further changes to the candidates. * config/i386/i386-features.h (scalar_chain::hash_map): Move from general_scalar_chain. (scalar_chain::convert_reg): Likewise. (scalar_chain::convert_insn_common): New shared method. (scalar_chain::make_vector_copies): Move from general_scalar_chain. (scalar_chain::convert_registers): Likewise. No longer virtual. (general_scalar_chain::hash_map): Delete. Moved to scalar_chain. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::make_vector_copies): Likewise. (general_scalar_chain::convert_registers): Delete virtual method. (timode_scalar_chain::convert_registers): Likewise. gcc/testsuite/ChangeLog PR target/106303 PR target/106347 * gcc.target/i386/pr106303.c: New test case. * gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
void
general_scalar_chain::convert_insn (rtx_insn *insn)
{
rtx def_set = single_set (insn);
rtx src = SET_SRC (def_set);
rtx dst = SET_DEST (def_set);
rtx subreg;
if (MEM_P (dst) && !REG_P (src))
{
/* There are no scalar integer instructions and therefore
temporary register usage is required. */
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
rtx tmp = gen_reg_rtx (smode);
emit_conversion_insns (gen_move_insn (dst, tmp), insn);
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
dst = gen_rtx_SUBREG (vmode, tmp, 0);
}
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
else if (REG_P (dst) && GET_MODE (dst) == smode)
{
/* Replace the definition with a SUBREG to the definition we
PR target/106278: Keep REG_EQUAL notes consistent during TImode STV on x86_64. This patch resolves PR target/106278 a regression on x86_64 caused by my recent TImode STV improvements. Now that TImode STV can handle comparisons such as "(set (regs:CC) (compare:CC (reg:TI) ...))" the convert_insn method sensibly checks that the mode of the SET_DEST is TImode before setting it to V1TImode [to avoid V1TImode appearing on the hard reg CC_FLAGS. Hence the current code looks like: if (GET_MODE (dst) == TImode) { tmp = find_reg_equal_equiv_note (insn); if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode) PUT_MODE (XEXP (tmp, 0), V1TImode); PUT_MODE (dst, V1TImode); fix_debug_reg_uses (dst); } break; which checks GET_MODE (dst) before calling PUT_MODE, and when a change is made updating the REG_EQUAL_NOTE tmp if it exists. The logical flaw (oversight) is that due to RTL sharing, the destination of this set may already have been updated to V1TImode, as this chain is being converted, but we still need to update any REG_EQUAL_NOTE that still has TImode. Hence the correct code is actually: if (GET_MODE (dst) == TImode) { PUT_MODE (dst, V1TImode); fix_debug_reg_uses (dst); } if (GET_MODE (dst) == V1TImode) { tmp = find_reg_equal_equiv_note (insn); if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode) PUT_MODE (XEXP (tmp, 0), V1TImode); } break; While fixing this behavior, I noticed I had some indentation whitespace issues and some vestigial dead code in this function/method that I've taken the liberty of cleaning up (as obvious) in this patch. 2022-07-15 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/106278 * config/i386/i386-features.cc (general_scalar_chain::convert_insn): Fix indentation whitespace. (timode_scalar_chain::fix_debug_reg_uses): Likewise. (timode_scalar_chain::convert_insn): Delete dead code. Update TImode REG_EQUAL_NOTE even if the SET_DEST is already V1TI. Fix indentation whitespace. (convertible_comparison_p): Likewise. (timode_scalar_to_vector_candidate_p): Likewise. gcc/testsuite/ChangeLog * gcc.dg/pr106278.c: New test case.
2022-07-15 15:39:28 +02:00
use inside the chain. */
rtx *vdef = defs_map.get (dst);
if (vdef)
dst = *vdef;
dst = gen_rtx_SUBREG (vmode, dst, 0);
/* IRA doesn't like to have REG_EQUAL/EQUIV notes when the SET_DEST
PR target/106278: Keep REG_EQUAL notes consistent during TImode STV on x86_64. This patch resolves PR target/106278 a regression on x86_64 caused by my recent TImode STV improvements. Now that TImode STV can handle comparisons such as "(set (regs:CC) (compare:CC (reg:TI) ...))" the convert_insn method sensibly checks that the mode of the SET_DEST is TImode before setting it to V1TImode [to avoid V1TImode appearing on the hard reg CC_FLAGS. Hence the current code looks like: if (GET_MODE (dst) == TImode) { tmp = find_reg_equal_equiv_note (insn); if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode) PUT_MODE (XEXP (tmp, 0), V1TImode); PUT_MODE (dst, V1TImode); fix_debug_reg_uses (dst); } break; which checks GET_MODE (dst) before calling PUT_MODE, and when a change is made updating the REG_EQUAL_NOTE tmp if it exists. The logical flaw (oversight) is that due to RTL sharing, the destination of this set may already have been updated to V1TImode, as this chain is being converted, but we still need to update any REG_EQUAL_NOTE that still has TImode. Hence the correct code is actually: if (GET_MODE (dst) == TImode) { PUT_MODE (dst, V1TImode); fix_debug_reg_uses (dst); } if (GET_MODE (dst) == V1TImode) { tmp = find_reg_equal_equiv_note (insn); if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode) PUT_MODE (XEXP (tmp, 0), V1TImode); } break; While fixing this behavior, I noticed I had some indentation whitespace issues and some vestigial dead code in this function/method that I've taken the liberty of cleaning up (as obvious) in this patch. 2022-07-15 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/106278 * config/i386/i386-features.cc (general_scalar_chain::convert_insn): Fix indentation whitespace. (timode_scalar_chain::fix_debug_reg_uses): Likewise. (timode_scalar_chain::convert_insn): Delete dead code. Update TImode REG_EQUAL_NOTE even if the SET_DEST is already V1TI. Fix indentation whitespace. (convertible_comparison_p): Likewise. (timode_scalar_to_vector_candidate_p): Likewise. gcc/testsuite/ChangeLog * gcc.dg/pr106278.c: New test case.
2022-07-15 15:39:28 +02:00
is a non-REG_P. So kill those off. */
rtx note = find_reg_equal_equiv_note (insn);
if (note)
remove_note (insn, note);
}
switch (GET_CODE (src))
{
case PLUS:
case MINUS:
case IOR:
case XOR:
case AND:
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
case SMAX:
case SMIN:
case UMAX:
case UMIN:
convert_op (&XEXP (src, 1), insn);
i386: Optimize abs expansion [PR97873] The patch introduces absM named pattern to generate optimal insn sequence for CMOVE_TARGET targets. Currently, the expansion goes through neg+max optabs, and the following code is generated: movl %edi, %eax negl %eax cmpl %edi, %eax cmovl %edi, %eax This sequence is unoptimal in two ways. a) The compare instruction is not needed, since NEG insn sets the sign flag based on the result. The CMOV can use sign flag to select between negated and original value: movl %edi, %eax negl %eax cmovs %edi, %eax b) On some targets, CMOV is undesirable due to its performance issues. In addition to TARGET_EXPAND_ABS bypass, the patch introduces STV conversion of abs RTX to use PABS SSE insn: vmovd %edi, %xmm0 vpabsd %xmm0, %xmm0 vmovd %xmm0, %eax The patch changes compare mode of NEG instruction to CCGOCmode, which is the same mode as the mode of SUB instruction. IOW, sign bit becomes usable. Also, the mode iterator of <maxmin:code><mode>3 pattern is changed to SWI48x instead of SWI248. The purpose of maxmin expander is to prepare max/min RTX for STV to eventually convert them to SSE PMAX/PMIN instructions, in order to *avoid* CMOV insns with general registers. 2020-11-20 Uroš Bizjak <ubizjak@gmail.com> gcc/ PR target/97873 * config/i386/i386.md (*neg<mode>2_2): Rename from "*neg<mode>2_cmpz". Use CCGOCmode instead of CCZmode. (*negsi2_zext): Rename from *negsi2_cmpz_zext. Use CCGOCmode instead of CCZmode. (*neg<mode>_ccc_1): New insn pattern. (*neg<dwi>2_doubleword): Use *neg<mode>_ccc_1. (abs<mode>2): Add FLAGS_REG clobber. Use TARGET_CMOVE insn predicate. (*abs<mode>2_1): New insn_and_split pattern. (*absdi2_doubleword): Ditto. (<maxmin:code><mode>3): Use SWI48x mode iterator. (*<maxmin:code><mode>3): Use SWI48 mode iterator. * config/i386/i386-features.c (general_scalar_chain::compute_convert_gain): Handle ABS code. (general_scalar_chain::convert_insn): Ditto. (general_scalar_to_vector_candidate_p): Ditto. gcc/testsuite/ PR target/97873 * gcc.target/i386/pr97873.c: New test. * gcc.target/i386/pr97873-1.c: New test.
2020-11-20 10:26:34 +01:00
/* FALLTHRU */
case ABS:
case ASHIFT:
case ASHIFTRT:
case LSHIFTRT:
convert_op (&XEXP (src, 0), insn);
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
PUT_MODE (src, vmode);
break;
case NEG:
src = XEXP (src, 0);
i386: Add integer nabs instructions [PR101044] The patch adds integer nabs "(NEG (ABS (...)))" instructions, adds STV conversion and adjusts STV cost calculations accordingly. When CMOV instruction is used to implement abs, the sign is determined from the preceeding operand negation, and CMOVS is used to select between negated and non-negated value. To implement nabs, just reverse the condition and emit CMOVNS instead. The STV costs are adjusted for inherent NOT of nabs insn. V2DI NOT is somehow costly operation, since it is implemented as a load of zero, followed by a SUB insn. OTOH, integer nabs with inherent NOT is relatively cheap, so some STV chains became less profitable for conversion. The patch rewrites operand scanner in compute_convert_gain to a switch and reorders case instances in general_scalar_to_vector_candidate_p to benefit from fallthroughs, and to remove special processing of andnot in the later case. gcc/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * config/i386/i386.md (*nabs<dwi>2_doubleword): New insn_and_split pattern. (*nabs<dwi>2_1): Ditto. * config/i386/i386-features.c (general_scalar_chain::compute_convert_gain): Handle (NEG (ABS (...))) RTX. Rewrite src code scanner as switch statement. (general_scalar_chain::convert_insn): Handle (NEG (ABS (...))) RTX. (general_scalar_to_vector_candidate_p): Detect (NEG (ABS (...))) RTX. Reorder case statements for (AND (NOT (...) ...)) fallthrough. gcc/testsuite/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * gcc.target/i386/pr101044.c: New test.
2021-07-01 10:56:32 +02:00
if (GET_CODE (src) == ABS)
{
src = XEXP (src, 0);
convert_op (&src, insn);
subreg = gen_reg_rtx (vmode);
emit_insn_before (gen_rtx_SET (subreg,
gen_rtx_ABS (vmode, src)), insn);
src = subreg;
}
else
convert_op (&src, insn);
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
subreg = gen_reg_rtx (vmode);
emit_insn_before (gen_move_insn (subreg, CONST0_RTX (vmode)), insn);
src = gen_rtx_MINUS (vmode, subreg, src);
break;
case NOT:
src = XEXP (src, 0);
convert_op (&src, insn);
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
subreg = gen_reg_rtx (vmode);
emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (vmode)), insn);
src = gen_rtx_XOR (vmode, src, subreg);
break;
case MEM:
if (!REG_P (dst))
convert_op (&src, insn);
break;
case REG:
if (!MEM_P (dst))
convert_op (&src, insn);
break;
case SUBREG:
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
gcc_assert (GET_MODE (src) == vmode);
break;
case COMPARE:
dst = gen_rtx_REG (CCmode, FLAGS_REG);
PR target/70321: Split double word equality/inequality after STV on x86. This patch resolves the last piece of PR target/70321 a code quality (P2 regression) affecting mainline. Currently, for HJ's testcase: void foo (long long ixi) { if (ixi != 14348907) __builtin_abort (); } GCC with -m32 -O2 generates four instructions for the comparison: movl 16(%esp), %eax movl 20(%esp), %edx xorl $14348907, %eax orl %eax, %edx but with this patch it now requires only three, making better use of x86's addressing modes: movl 16(%esp), %eax xorl $14348907, %eax orl 20(%esp), %eax The solution is to expand "doubleword" equality/inequality expressions using flag setting COMPARE instructions for the early RTL passes, and then split them during split1, after STV and before reload. Hence on x86_64, we now see/allow things like: (insn 11 8 12 2 (set (reg:CCZ 17 flags) (compare:CCZ (reg/v:TI 84 [ x ]) (reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword} This allows the STV pass to decide whether it's preferrable to perform this comparison using vector operations, i.e. a pxor/ptest sequence, or as scalar integer operations, i.e. a xor/xor/or sequence. Alas this required tweaking of the STV pass to recognize the "new" form of these comparisons and split out the pxor operation itself. To confirm this still works as expected I've added a new STV test case: long long a[1024]; long long b[1024]; int foo() { for (int i=0; i<1024; i++) { long long t = (a[i]<<8) | (b[i]<<24); if (t == 0) return 1; } return 0; } where with -m32 -O2 -msse4.1 the above comparison with zero should look like: punpcklqdq %xmm0, %xmm0 ptest %xmm0, %xmm0 Although this patch includes one or two minor tweaks to provide all the necessary infrastructure to support conversion of TImode comparisons to V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement these transformations, but this is something that can be considered after stage 4. Indeed the new convert_compare functionality is split out into a method to simplify its potential reuse by the timode_scalar_chain class. 2022-05-30 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/70321 * config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose DI mode equality/inequality using XOR here. Instead generate a COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode). * config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode. (general_scalar_chain::convert_compare): New function to convert scalar equality/inequality comparison into vector operations. (general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call new convert_compare helper method. (convertible_comparion_p): Update to match doubleword COMPARE of two register, memory or integer constant operands. * config/i386/i386-features.h (general_scalar_chain::convert_compare): Prototype/declare member function here. * config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but only allow new doubleword modes for EQ and NE operators. (*cmp<dwi>_doubleword): New define_insn_and_split, to split a doubleword comparison into a pair of XORs followed by an IOR to set the (zero) flags register, optimizing the XORs if possible. * config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode iterator; V_AVX is (currently) only used by ptest. (sse4_1 mode attribute): Update to support V1TI and V2TI. gcc/testsuite/ChangeLog PR target/70321 * gcc.target/i386/pr70321.c: New test case. * gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
break;
case CONST_INT:
convert_op (&src, insn);
break;
default:
gcc_unreachable ();
}
SET_SRC (def_set) = src;
SET_DEST (def_set) = dst;
/* Drop possible dead definitions. */
PATTERN (insn) = def_set;
INSN_CODE (insn) = -1;
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
int patt = recog_memoized (insn);
if (patt == -1)
fatal_insn_not_found (insn);
df_insn_rescan (insn);
}
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
/* Compute a gain for chain conversion. */
int
timode_scalar_chain::compute_convert_gain ()
{
/* Assume that if we have to move TImode values between units,
then transforming this chain isn't worth it. */
if (n_sse_to_integer || n_integer_to_sse)
return -1;
bitmap_iterator bi;
unsigned insn_uid;
/* Split ties to prefer V1TImode when not optimizing for size. */
int gain = optimize_size ? 0 : 1;
if (dump_file)
fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
{
rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
rtx def_set = single_set (insn);
rtx src = SET_SRC (def_set);
rtx dst = SET_DEST (def_set);
int igain = 0;
switch (GET_CODE (src))
{
case REG:
if (optimize_insn_for_size_p ())
igain = MEM_P (dst) ? COSTS_N_BYTES (6) : COSTS_N_BYTES (3);
else
igain = COSTS_N_INSNS (1);
break;
case MEM:
igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (7)
: COSTS_N_INSNS (1);
break;
case CONST_INT:
if (MEM_P (dst)
&& standard_sse_constant_p (src, V1TImode))
igain = optimize_insn_for_size_p() ? COSTS_N_BYTES (11) : 1;
break;
case NOT:
if (MEM_P (dst))
igain = -COSTS_N_INSNS (1);
break;
case AND:
case XOR:
case IOR:
if (!MEM_P (dst))
igain = COSTS_N_INSNS (1);
break;
case ASHIFT:
case LSHIFTRT:
/* For logical shifts by constant multiples of 8. */
igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (4)
: COSTS_N_INSNS (1);
break;
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
default:
break;
}
if (igain != 0 && dump_file)
{
fprintf (dump_file, " Instruction gain %d for ", igain);
dump_insn_slim (dump_file, insn);
}
gain += igain;
}
if (dump_file)
fprintf (dump_file, " Total gain: %d\n", gain);
return gain;
}
/* Fix uses of converted REG in debug insns. */
void
timode_scalar_chain::fix_debug_reg_uses (rtx reg)
{
if (!flag_var_tracking)
return;
df_ref ref, next;
for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
{
rtx_insn *insn = DF_REF_INSN (ref);
/* Make sure the next ref is for a different instruction,
PR target/106278: Keep REG_EQUAL notes consistent during TImode STV on x86_64. This patch resolves PR target/106278 a regression on x86_64 caused by my recent TImode STV improvements. Now that TImode STV can handle comparisons such as "(set (regs:CC) (compare:CC (reg:TI) ...))" the convert_insn method sensibly checks that the mode of the SET_DEST is TImode before setting it to V1TImode [to avoid V1TImode appearing on the hard reg CC_FLAGS. Hence the current code looks like: if (GET_MODE (dst) == TImode) { tmp = find_reg_equal_equiv_note (insn); if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode) PUT_MODE (XEXP (tmp, 0), V1TImode); PUT_MODE (dst, V1TImode); fix_debug_reg_uses (dst); } break; which checks GET_MODE (dst) before calling PUT_MODE, and when a change is made updating the REG_EQUAL_NOTE tmp if it exists. The logical flaw (oversight) is that due to RTL sharing, the destination of this set may already have been updated to V1TImode, as this chain is being converted, but we still need to update any REG_EQUAL_NOTE that still has TImode. Hence the correct code is actually: if (GET_MODE (dst) == TImode) { PUT_MODE (dst, V1TImode); fix_debug_reg_uses (dst); } if (GET_MODE (dst) == V1TImode) { tmp = find_reg_equal_equiv_note (insn); if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode) PUT_MODE (XEXP (tmp, 0), V1TImode); } break; While fixing this behavior, I noticed I had some indentation whitespace issues and some vestigial dead code in this function/method that I've taken the liberty of cleaning up (as obvious) in this patch. 2022-07-15 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/106278 * config/i386/i386-features.cc (general_scalar_chain::convert_insn): Fix indentation whitespace. (timode_scalar_chain::fix_debug_reg_uses): Likewise. (timode_scalar_chain::convert_insn): Delete dead code. Update TImode REG_EQUAL_NOTE even if the SET_DEST is already V1TI. Fix indentation whitespace. (convertible_comparison_p): Likewise. (timode_scalar_to_vector_candidate_p): Likewise. gcc/testsuite/ChangeLog * gcc.dg/pr106278.c: New test case.
2022-07-15 15:39:28 +02:00
so that we're not affected by the rescan. */
next = DF_REF_NEXT_REG (ref);
while (next && DF_REF_INSN (next) == insn)
next = DF_REF_NEXT_REG (next);
if (DEBUG_INSN_P (insn))
{
/* It may be a debug insn with a TImode variable in
register. */
bool changed = false;
for (; ref != next; ref = DF_REF_NEXT_REG (ref))
{
rtx *loc = DF_REF_LOC (ref);
if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
{
*loc = gen_rtx_SUBREG (TImode, *loc, 0);
changed = true;
}
}
if (changed)
df_insn_rescan (insn);
}
}
}
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
/* Convert operand OP in INSN from TImode to V1TImode. */
void
timode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
{
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64. This x86_64 backend patch allows TImode STV to take advantage of the fact that the PTEST instruction performs an AND operation. Previously PTEST was (mostly) used for comparison against zero, by using the same operands. The benefits are demonstrated by the new test case: __int128 a,b; int foo() { return (a & b) != 0; } Currently with -O2 -msse4 we generate: movdqa a(%rip), %xmm0 pand b(%rip), %xmm0 xorl %eax, %eax ptest %xmm0, %xmm0 setne %al ret with this patch we now generate: movdqa a(%rip), %xmm0 xorl %eax, %eax ptest b(%rip), %xmm0 setne %al ret Technically, the magic happens using new define_insn_and_split patterns. Using two patterns allows this transformation to performed independently of whether TImode STV is run before or after combine. The one tricky case is that immediate constant operands of the AND behave slightly differently between TImode and V1TImode: All V1TImode immediate operands becomes loads, but for TImode only values that are not hilo_operands need to be loaded. Hence the new *testti_doubleword accepts any general_operand, but internally during split calls force_reg whenever the second operand is not x86_64_hilo_general_operand. This required (benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P. 2022-08-09 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.cc (scalar_chain::convert_compare): Create new pseudos only when/if needed. Add support for TEST, i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST. When broadcasting V2DImode and V4SImode use new pseudo register. (timode_scalar_chain::convert_op): Do nothing if operand is already V1TImode. Avoid generating useless SUBREG conversions, i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT in addition to CONST_INT by using CONST_SCALAR_INT_P. (convertible_comparison_p): Use CONST_SCALAR_INT_P to match both CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword pattern as an STV candidate. (timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P operands in binary logic operations. * config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs for UNSPEC_PTEST; a PTEST that performs an AND has the same cost as regular PTEST, i.e. cost->sse_op. * config/i386/i386.md (*testti_doubleword): New pre-reload define_insn_and_split that recognizes comparison of TI mode AND against zero. * config/i386/sse.md (*ptest<mode>_and): New pre-reload define_insn_and_split that recognizes UNSPEC_PTEST of identical AND operands. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
if (GET_MODE (*op) == V1TImode)
return;
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
*op = copy_rtx_if_shared (*op);
if (REG_P (*op))
*op = gen_rtx_SUBREG (V1TImode, *op, 0);
else if (MEM_P (*op))
{
rtx tmp = gen_reg_rtx (V1TImode);
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64. This x86_64 backend patch allows TImode STV to take advantage of the fact that the PTEST instruction performs an AND operation. Previously PTEST was (mostly) used for comparison against zero, by using the same operands. The benefits are demonstrated by the new test case: __int128 a,b; int foo() { return (a & b) != 0; } Currently with -O2 -msse4 we generate: movdqa a(%rip), %xmm0 pand b(%rip), %xmm0 xorl %eax, %eax ptest %xmm0, %xmm0 setne %al ret with this patch we now generate: movdqa a(%rip), %xmm0 xorl %eax, %eax ptest b(%rip), %xmm0 setne %al ret Technically, the magic happens using new define_insn_and_split patterns. Using two patterns allows this transformation to performed independently of whether TImode STV is run before or after combine. The one tricky case is that immediate constant operands of the AND behave slightly differently between TImode and V1TImode: All V1TImode immediate operands becomes loads, but for TImode only values that are not hilo_operands need to be loaded. Hence the new *testti_doubleword accepts any general_operand, but internally during split calls force_reg whenever the second operand is not x86_64_hilo_general_operand. This required (benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P. 2022-08-09 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.cc (scalar_chain::convert_compare): Create new pseudos only when/if needed. Add support for TEST, i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST. When broadcasting V2DImode and V4SImode use new pseudo register. (timode_scalar_chain::convert_op): Do nothing if operand is already V1TImode. Avoid generating useless SUBREG conversions, i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT in addition to CONST_INT by using CONST_SCALAR_INT_P. (convertible_comparison_p): Use CONST_SCALAR_INT_P to match both CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword pattern as an STV candidate. (timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P operands in binary logic operations. * config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs for UNSPEC_PTEST; a PTEST that performs an AND has the same cost as regular PTEST, i.e. cost->sse_op. * config/i386/i386.md (*testti_doubleword): New pre-reload define_insn_and_split that recognizes comparison of TI mode AND against zero. * config/i386/sse.md (*ptest<mode>_and): New pre-reload define_insn_and_split that recognizes UNSPEC_PTEST of identical AND operands. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
emit_insn_before (gen_rtx_SET (tmp,
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
gen_gpr_to_xmm_move_src (V1TImode, *op)),
insn);
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64. This x86_64 backend patch allows TImode STV to take advantage of the fact that the PTEST instruction performs an AND operation. Previously PTEST was (mostly) used for comparison against zero, by using the same operands. The benefits are demonstrated by the new test case: __int128 a,b; int foo() { return (a & b) != 0; } Currently with -O2 -msse4 we generate: movdqa a(%rip), %xmm0 pand b(%rip), %xmm0 xorl %eax, %eax ptest %xmm0, %xmm0 setne %al ret with this patch we now generate: movdqa a(%rip), %xmm0 xorl %eax, %eax ptest b(%rip), %xmm0 setne %al ret Technically, the magic happens using new define_insn_and_split patterns. Using two patterns allows this transformation to performed independently of whether TImode STV is run before or after combine. The one tricky case is that immediate constant operands of the AND behave slightly differently between TImode and V1TImode: All V1TImode immediate operands becomes loads, but for TImode only values that are not hilo_operands need to be loaded. Hence the new *testti_doubleword accepts any general_operand, but internally during split calls force_reg whenever the second operand is not x86_64_hilo_general_operand. This required (benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P. 2022-08-09 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.cc (scalar_chain::convert_compare): Create new pseudos only when/if needed. Add support for TEST, i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST. When broadcasting V2DImode and V4SImode use new pseudo register. (timode_scalar_chain::convert_op): Do nothing if operand is already V1TImode. Avoid generating useless SUBREG conversions, i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT in addition to CONST_INT by using CONST_SCALAR_INT_P. (convertible_comparison_p): Use CONST_SCALAR_INT_P to match both CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword pattern as an STV candidate. (timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P operands in binary logic operations. * config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs for UNSPEC_PTEST; a PTEST that performs an AND has the same cost as regular PTEST, i.e. cost->sse_op. * config/i386/i386.md (*testti_doubleword): New pre-reload define_insn_and_split that recognizes comparison of TI mode AND against zero. * config/i386/sse.md (*ptest<mode>_and): New pre-reload define_insn_and_split that recognizes UNSPEC_PTEST of identical AND operands. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
*op = tmp;
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
if (dump_file)
fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
INSN_UID (insn), REGNO (tmp));
}
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64. This x86_64 backend patch allows TImode STV to take advantage of the fact that the PTEST instruction performs an AND operation. Previously PTEST was (mostly) used for comparison against zero, by using the same operands. The benefits are demonstrated by the new test case: __int128 a,b; int foo() { return (a & b) != 0; } Currently with -O2 -msse4 we generate: movdqa a(%rip), %xmm0 pand b(%rip), %xmm0 xorl %eax, %eax ptest %xmm0, %xmm0 setne %al ret with this patch we now generate: movdqa a(%rip), %xmm0 xorl %eax, %eax ptest b(%rip), %xmm0 setne %al ret Technically, the magic happens using new define_insn_and_split patterns. Using two patterns allows this transformation to performed independently of whether TImode STV is run before or after combine. The one tricky case is that immediate constant operands of the AND behave slightly differently between TImode and V1TImode: All V1TImode immediate operands becomes loads, but for TImode only values that are not hilo_operands need to be loaded. Hence the new *testti_doubleword accepts any general_operand, but internally during split calls force_reg whenever the second operand is not x86_64_hilo_general_operand. This required (benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P. 2022-08-09 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.cc (scalar_chain::convert_compare): Create new pseudos only when/if needed. Add support for TEST, i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST. When broadcasting V2DImode and V4SImode use new pseudo register. (timode_scalar_chain::convert_op): Do nothing if operand is already V1TImode. Avoid generating useless SUBREG conversions, i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT in addition to CONST_INT by using CONST_SCALAR_INT_P. (convertible_comparison_p): Use CONST_SCALAR_INT_P to match both CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword pattern as an STV candidate. (timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P operands in binary logic operations. * config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs for UNSPEC_PTEST; a PTEST that performs an AND has the same cost as regular PTEST, i.e. cost->sse_op. * config/i386/i386.md (*testti_doubleword): New pre-reload define_insn_and_split that recognizes comparison of TI mode AND against zero. * config/i386/sse.md (*ptest<mode>_and): New pre-reload define_insn_and_split that recognizes UNSPEC_PTEST of identical AND operands. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
else if (CONST_SCALAR_INT_P (*op))
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
{
rtx vec_cst;
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64. This x86_64 backend patch allows TImode STV to take advantage of the fact that the PTEST instruction performs an AND operation. Previously PTEST was (mostly) used for comparison against zero, by using the same operands. The benefits are demonstrated by the new test case: __int128 a,b; int foo() { return (a & b) != 0; } Currently with -O2 -msse4 we generate: movdqa a(%rip), %xmm0 pand b(%rip), %xmm0 xorl %eax, %eax ptest %xmm0, %xmm0 setne %al ret with this patch we now generate: movdqa a(%rip), %xmm0 xorl %eax, %eax ptest b(%rip), %xmm0 setne %al ret Technically, the magic happens using new define_insn_and_split patterns. Using two patterns allows this transformation to performed independently of whether TImode STV is run before or after combine. The one tricky case is that immediate constant operands of the AND behave slightly differently between TImode and V1TImode: All V1TImode immediate operands becomes loads, but for TImode only values that are not hilo_operands need to be loaded. Hence the new *testti_doubleword accepts any general_operand, but internally during split calls force_reg whenever the second operand is not x86_64_hilo_general_operand. This required (benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P. 2022-08-09 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.cc (scalar_chain::convert_compare): Create new pseudos only when/if needed. Add support for TEST, i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST. When broadcasting V2DImode and V4SImode use new pseudo register. (timode_scalar_chain::convert_op): Do nothing if operand is already V1TImode. Avoid generating useless SUBREG conversions, i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT in addition to CONST_INT by using CONST_SCALAR_INT_P. (convertible_comparison_p): Use CONST_SCALAR_INT_P to match both CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword pattern as an STV candidate. (timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P operands in binary logic operations. * config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs for UNSPEC_PTEST; a PTEST that performs an AND has the same cost as regular PTEST, i.e. cost->sse_op. * config/i386/i386.md (*testti_doubleword): New pre-reload define_insn_and_split that recognizes comparison of TI mode AND against zero. * config/i386/sse.md (*ptest<mode>_and): New pre-reload define_insn_and_split that recognizes UNSPEC_PTEST of identical AND operands. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
rtx tmp = gen_reg_rtx (V1TImode);
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
/* Prefer all ones vector in case of -1. */
if (constm1_operand (*op, TImode))
vec_cst = CONSTM1_RTX (V1TImode);
else
{
rtx *v = XALLOCAVEC (rtx, 1);
v[0] = *op;
vec_cst = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec_v (1, v));
}
if (!standard_sse_constant_p (vec_cst, V1TImode))
{
start_sequence ();
vec_cst = validize_mem (force_const_mem (V1TImode, vec_cst));
rtx_insn *seq = get_insns ();
end_sequence ();
emit_insn_before (seq, insn);
}
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64. This x86_64 backend patch allows TImode STV to take advantage of the fact that the PTEST instruction performs an AND operation. Previously PTEST was (mostly) used for comparison against zero, by using the same operands. The benefits are demonstrated by the new test case: __int128 a,b; int foo() { return (a & b) != 0; } Currently with -O2 -msse4 we generate: movdqa a(%rip), %xmm0 pand b(%rip), %xmm0 xorl %eax, %eax ptest %xmm0, %xmm0 setne %al ret with this patch we now generate: movdqa a(%rip), %xmm0 xorl %eax, %eax ptest b(%rip), %xmm0 setne %al ret Technically, the magic happens using new define_insn_and_split patterns. Using two patterns allows this transformation to performed independently of whether TImode STV is run before or after combine. The one tricky case is that immediate constant operands of the AND behave slightly differently between TImode and V1TImode: All V1TImode immediate operands becomes loads, but for TImode only values that are not hilo_operands need to be loaded. Hence the new *testti_doubleword accepts any general_operand, but internally during split calls force_reg whenever the second operand is not x86_64_hilo_general_operand. This required (benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P. 2022-08-09 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.cc (scalar_chain::convert_compare): Create new pseudos only when/if needed. Add support for TEST, i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST. When broadcasting V2DImode and V4SImode use new pseudo register. (timode_scalar_chain::convert_op): Do nothing if operand is already V1TImode. Avoid generating useless SUBREG conversions, i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT in addition to CONST_INT by using CONST_SCALAR_INT_P. (convertible_comparison_p): Use CONST_SCALAR_INT_P to match both CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword pattern as an STV candidate. (timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P operands in binary logic operations. * config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs for UNSPEC_PTEST; a PTEST that performs an AND has the same cost as regular PTEST, i.e. cost->sse_op. * config/i386/i386.md (*testti_doubleword): New pre-reload define_insn_and_split that recognizes comparison of TI mode AND against zero. * config/i386/sse.md (*ptest<mode>_and): New pre-reload define_insn_and_split that recognizes UNSPEC_PTEST of identical AND operands. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
emit_insn_before (gen_move_insn (tmp, vec_cst), insn);
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
*op = tmp;
}
else
{
gcc_assert (SUBREG_P (*op));
gcc_assert (GET_MODE (*op) == vmode);
}
}
/* Convert INSN from TImode to V1T1mode. */
void
timode_scalar_chain::convert_insn (rtx_insn *insn)
{
rtx def_set = single_set (insn);
rtx src = SET_SRC (def_set);
rtx dst = SET_DEST (def_set);
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
rtx tmp;
switch (GET_CODE (dst))
{
case REG:
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
if (GET_MODE (dst) == TImode)
PR target/106278: Keep REG_EQUAL notes consistent during TImode STV on x86_64. This patch resolves PR target/106278 a regression on x86_64 caused by my recent TImode STV improvements. Now that TImode STV can handle comparisons such as "(set (regs:CC) (compare:CC (reg:TI) ...))" the convert_insn method sensibly checks that the mode of the SET_DEST is TImode before setting it to V1TImode [to avoid V1TImode appearing on the hard reg CC_FLAGS. Hence the current code looks like: if (GET_MODE (dst) == TImode) { tmp = find_reg_equal_equiv_note (insn); if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode) PUT_MODE (XEXP (tmp, 0), V1TImode); PUT_MODE (dst, V1TImode); fix_debug_reg_uses (dst); } break; which checks GET_MODE (dst) before calling PUT_MODE, and when a change is made updating the REG_EQUAL_NOTE tmp if it exists. The logical flaw (oversight) is that due to RTL sharing, the destination of this set may already have been updated to V1TImode, as this chain is being converted, but we still need to update any REG_EQUAL_NOTE that still has TImode. Hence the correct code is actually: if (GET_MODE (dst) == TImode) { PUT_MODE (dst, V1TImode); fix_debug_reg_uses (dst); } if (GET_MODE (dst) == V1TImode) { tmp = find_reg_equal_equiv_note (insn); if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode) PUT_MODE (XEXP (tmp, 0), V1TImode); } break; While fixing this behavior, I noticed I had some indentation whitespace issues and some vestigial dead code in this function/method that I've taken the liberty of cleaning up (as obvious) in this patch. 2022-07-15 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/106278 * config/i386/i386-features.cc (general_scalar_chain::convert_insn): Fix indentation whitespace. (timode_scalar_chain::fix_debug_reg_uses): Likewise. (timode_scalar_chain::convert_insn): Delete dead code. Update TImode REG_EQUAL_NOTE even if the SET_DEST is already V1TI. Fix indentation whitespace. (convertible_comparison_p): Likewise. (timode_scalar_to_vector_candidate_p): Likewise. gcc/testsuite/ChangeLog * gcc.dg/pr106278.c: New test case.
2022-07-15 15:39:28 +02:00
{
PUT_MODE (dst, V1TImode);
fix_debug_reg_uses (dst);
}
if (GET_MODE (dst) == V1TImode)
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
{
tmp = find_reg_equal_equiv_note (insn);
if (tmp)
{
if (GET_MODE (XEXP (tmp, 0)) == TImode)
PUT_MODE (XEXP (tmp, 0), V1TImode);
else if (CONST_SCALAR_INT_P (XEXP (tmp, 0)))
XEXP (tmp, 0)
= gen_rtx_CONST_VECTOR (V1TImode,
gen_rtvec (1, XEXP (tmp, 0)));
}
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
}
break;
case MEM:
PUT_MODE (dst, V1TImode);
break;
default:
gcc_unreachable ();
}
switch (GET_CODE (src))
{
case REG:
PUT_MODE (src, V1TImode);
/* Call fix_debug_reg_uses only if SRC is never defined. */
if (!DF_REG_DEF_CHAIN (REGNO (src)))
fix_debug_reg_uses (src);
break;
case MEM:
PUT_MODE (src, V1TImode);
break;
case CONST_WIDE_INT:
if (NONDEBUG_INSN_P (insn))
{
/* Since there are no instructions to store 128-bit constant,
temporary register usage is required. */
start_sequence ();
src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
src = validize_mem (force_const_mem (V1TImode, src));
rtx_insn *seq = get_insns ();
end_sequence ();
if (seq)
emit_insn_before (seq, insn);
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
if (MEM_P (dst))
{
tmp = gen_reg_rtx (V1TImode);
emit_insn_before (gen_rtx_SET (tmp, src), insn);
src = tmp;
}
}
break;
case CONST_INT:
switch (standard_sse_constant_p (src, TImode))
{
case 1:
src = CONST0_RTX (GET_MODE (dst));
break;
case 2:
src = CONSTM1_RTX (GET_MODE (dst));
break;
default:
gcc_unreachable ();
}
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
if (MEM_P (dst))
{
tmp = gen_reg_rtx (V1TImode);
PR target/106278: Keep REG_EQUAL notes consistent during TImode STV on x86_64. This patch resolves PR target/106278 a regression on x86_64 caused by my recent TImode STV improvements. Now that TImode STV can handle comparisons such as "(set (regs:CC) (compare:CC (reg:TI) ...))" the convert_insn method sensibly checks that the mode of the SET_DEST is TImode before setting it to V1TImode [to avoid V1TImode appearing on the hard reg CC_FLAGS. Hence the current code looks like: if (GET_MODE (dst) == TImode) { tmp = find_reg_equal_equiv_note (insn); if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode) PUT_MODE (XEXP (tmp, 0), V1TImode); PUT_MODE (dst, V1TImode); fix_debug_reg_uses (dst); } break; which checks GET_MODE (dst) before calling PUT_MODE, and when a change is made updating the REG_EQUAL_NOTE tmp if it exists. The logical flaw (oversight) is that due to RTL sharing, the destination of this set may already have been updated to V1TImode, as this chain is being converted, but we still need to update any REG_EQUAL_NOTE that still has TImode. Hence the correct code is actually: if (GET_MODE (dst) == TImode) { PUT_MODE (dst, V1TImode); fix_debug_reg_uses (dst); } if (GET_MODE (dst) == V1TImode) { tmp = find_reg_equal_equiv_note (insn); if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode) PUT_MODE (XEXP (tmp, 0), V1TImode); } break; While fixing this behavior, I noticed I had some indentation whitespace issues and some vestigial dead code in this function/method that I've taken the liberty of cleaning up (as obvious) in this patch. 2022-07-15 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/106278 * config/i386/i386-features.cc (general_scalar_chain::convert_insn): Fix indentation whitespace. (timode_scalar_chain::fix_debug_reg_uses): Likewise. (timode_scalar_chain::convert_insn): Delete dead code. Update TImode REG_EQUAL_NOTE even if the SET_DEST is already V1TI. Fix indentation whitespace. (convertible_comparison_p): Likewise. (timode_scalar_to_vector_candidate_p): Likewise. gcc/testsuite/ChangeLog * gcc.dg/pr106278.c: New test case.
2022-07-15 15:39:28 +02:00
emit_insn_before (gen_rtx_SET (tmp, src), insn);
src = tmp;
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
}
break;
case AND:
if (GET_CODE (XEXP (src, 0)) == NOT)
{
convert_op (&XEXP (XEXP (src, 0), 0), insn);
convert_op (&XEXP (src, 1), insn);
PUT_MODE (XEXP (src, 0), V1TImode);
PUT_MODE (src, V1TImode);
break;
}
/* FALLTHRU */
case XOR:
case IOR:
convert_op (&XEXP (src, 0), insn);
convert_op (&XEXP (src, 1), insn);
PUT_MODE (src, V1TImode);
if (MEM_P (dst))
{
tmp = gen_reg_rtx (V1TImode);
PR target/106278: Keep REG_EQUAL notes consistent during TImode STV on x86_64. This patch resolves PR target/106278 a regression on x86_64 caused by my recent TImode STV improvements. Now that TImode STV can handle comparisons such as "(set (regs:CC) (compare:CC (reg:TI) ...))" the convert_insn method sensibly checks that the mode of the SET_DEST is TImode before setting it to V1TImode [to avoid V1TImode appearing on the hard reg CC_FLAGS. Hence the current code looks like: if (GET_MODE (dst) == TImode) { tmp = find_reg_equal_equiv_note (insn); if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode) PUT_MODE (XEXP (tmp, 0), V1TImode); PUT_MODE (dst, V1TImode); fix_debug_reg_uses (dst); } break; which checks GET_MODE (dst) before calling PUT_MODE, and when a change is made updating the REG_EQUAL_NOTE tmp if it exists. The logical flaw (oversight) is that due to RTL sharing, the destination of this set may already have been updated to V1TImode, as this chain is being converted, but we still need to update any REG_EQUAL_NOTE that still has TImode. Hence the correct code is actually: if (GET_MODE (dst) == TImode) { PUT_MODE (dst, V1TImode); fix_debug_reg_uses (dst); } if (GET_MODE (dst) == V1TImode) { tmp = find_reg_equal_equiv_note (insn); if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode) PUT_MODE (XEXP (tmp, 0), V1TImode); } break; While fixing this behavior, I noticed I had some indentation whitespace issues and some vestigial dead code in this function/method that I've taken the liberty of cleaning up (as obvious) in this patch. 2022-07-15 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/106278 * config/i386/i386-features.cc (general_scalar_chain::convert_insn): Fix indentation whitespace. (timode_scalar_chain::fix_debug_reg_uses): Likewise. (timode_scalar_chain::convert_insn): Delete dead code. Update TImode REG_EQUAL_NOTE even if the SET_DEST is already V1TI. Fix indentation whitespace. (convertible_comparison_p): Likewise. (timode_scalar_to_vector_candidate_p): Likewise. gcc/testsuite/ChangeLog * gcc.dg/pr106278.c: New test case.
2022-07-15 15:39:28 +02:00
emit_insn_before (gen_rtx_SET (tmp, src), insn);
src = tmp;
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
}
break;
case NOT:
src = XEXP (src, 0);
convert_op (&src, insn);
tmp = gen_reg_rtx (V1TImode);
emit_insn_before (gen_move_insn (tmp, CONSTM1_RTX (V1TImode)), insn);
src = gen_rtx_XOR (V1TImode, src, tmp);
if (MEM_P (dst))
{
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
tmp = gen_reg_rtx (V1TImode);
PR target/106278: Keep REG_EQUAL notes consistent during TImode STV on x86_64. This patch resolves PR target/106278 a regression on x86_64 caused by my recent TImode STV improvements. Now that TImode STV can handle comparisons such as "(set (regs:CC) (compare:CC (reg:TI) ...))" the convert_insn method sensibly checks that the mode of the SET_DEST is TImode before setting it to V1TImode [to avoid V1TImode appearing on the hard reg CC_FLAGS. Hence the current code looks like: if (GET_MODE (dst) == TImode) { tmp = find_reg_equal_equiv_note (insn); if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode) PUT_MODE (XEXP (tmp, 0), V1TImode); PUT_MODE (dst, V1TImode); fix_debug_reg_uses (dst); } break; which checks GET_MODE (dst) before calling PUT_MODE, and when a change is made updating the REG_EQUAL_NOTE tmp if it exists. The logical flaw (oversight) is that due to RTL sharing, the destination of this set may already have been updated to V1TImode, as this chain is being converted, but we still need to update any REG_EQUAL_NOTE that still has TImode. Hence the correct code is actually: if (GET_MODE (dst) == TImode) { PUT_MODE (dst, V1TImode); fix_debug_reg_uses (dst); } if (GET_MODE (dst) == V1TImode) { tmp = find_reg_equal_equiv_note (insn); if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode) PUT_MODE (XEXP (tmp, 0), V1TImode); } break; While fixing this behavior, I noticed I had some indentation whitespace issues and some vestigial dead code in this function/method that I've taken the liberty of cleaning up (as obvious) in this patch. 2022-07-15 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/106278 * config/i386/i386-features.cc (general_scalar_chain::convert_insn): Fix indentation whitespace. (timode_scalar_chain::fix_debug_reg_uses): Likewise. (timode_scalar_chain::convert_insn): Delete dead code. Update TImode REG_EQUAL_NOTE even if the SET_DEST is already V1TI. Fix indentation whitespace. (convertible_comparison_p): Likewise. (timode_scalar_to_vector_candidate_p): Likewise. gcc/testsuite/ChangeLog * gcc.dg/pr106278.c: New test case.
2022-07-15 15:39:28 +02:00
emit_insn_before (gen_rtx_SET (tmp, src), insn);
src = tmp;
}
break;
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
case COMPARE:
dst = gen_rtx_REG (CCmode, FLAGS_REG);
src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
break;
case ASHIFT:
case LSHIFTRT:
convert_op (&XEXP (src, 0), insn);
PUT_MODE (src, V1TImode);
break;
default:
gcc_unreachable ();
}
SET_SRC (def_set) = src;
SET_DEST (def_set) = dst;
/* Drop possible dead definitions. */
PATTERN (insn) = def_set;
INSN_CODE (insn) = -1;
recog_memoized (insn);
df_insn_rescan (insn);
}
/* Generate copies from defs used by the chain but not defined therein.
Also populates defs_map which is used later by convert_insn. */
void
PR target/106303: Fix TImode STV related failures on x86. This patch resolves PR target/106303 (and the related PRs 106347, 106404, 106407) which are ICEs caused by my improvements to x86_64's 128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies for the breakage. The issue is that data flow analysis is used to partition usage of each TImode pseudo into "chains", where each chain is analyzed and if suitable converted to vector operations. The problems appears when some chains for a pseudo are converted, and others aren't as RTL sharing can result in some mode changes leaking into other instructions that aren't/shouldn't/can't be converted, which eventually leads to an ICE for mismatched modes. My first approach to a fix was to unify more of the STV infrastructure, reasoning that if TImode STV was exhibiting these problems, but DImode and SImode STV weren't, the issue was likely to be caused/resolved by these remaining differences. This appeared to fix some but not all of the reported PRs. A better solution was then proposed by H.J. Lu in Bugzilla, that we need to iterate the removal of candidates in the function timode_remove_non_convertible_regs until there are no further changes. As each chain is removed from consideration, it in turn may affect whether other insns/chains can safely be converted. 2022-07-24 Roger Sayle <roger@nextmovesoftware.com> H.J. Lu <hjl.tools@gmail.com> gcc/ChangeLog PR target/106303 PR target/106347 * config/i386/i386-features.cc (make_vector_copies): Move from general_scalar_chain to scalar_chain. (convert_reg): Likewise. (convert_insn_common): New scalar_chain method split out from general_scalar_chain convert_insn. (convert_registers): Move from general_scalar_chain to scalar_chain. (scalar_chain::convert): Call convert_insn_common before calling convert_insn. (timode_remove_non_convertible_regs): Iterate until there are no further changes to the candidates. * config/i386/i386-features.h (scalar_chain::hash_map): Move from general_scalar_chain. (scalar_chain::convert_reg): Likewise. (scalar_chain::convert_insn_common): New shared method. (scalar_chain::make_vector_copies): Move from general_scalar_chain. (scalar_chain::convert_registers): Likewise. No longer virtual. (general_scalar_chain::hash_map): Delete. Moved to scalar_chain. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::make_vector_copies): Likewise. (general_scalar_chain::convert_registers): Delete virtual method. (timode_scalar_chain::convert_registers): Likewise. gcc/testsuite/ChangeLog PR target/106303 PR target/106347 * gcc.target/i386/pr106303.c: New test case. * gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
scalar_chain::convert_registers ()
{
bitmap_iterator bi;
unsigned id;
EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
{
rtx chain_reg = gen_reg_rtx (smode);
defs_map.put (regno_reg_rtx[id], chain_reg);
}
EXECUTE_IF_SET_IN_BITMAP (insns_conv, 0, id, bi)
for (df_ref ref = DF_INSN_UID_DEFS (id); ref; ref = DF_REF_NEXT_LOC (ref))
if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
make_vector_copies (DF_REF_INSN (ref), DF_REF_REAL_REG (ref));
}
/* Convert whole chain creating required register
conversions and copies. */
int
scalar_chain::convert ()
{
bitmap_iterator bi;
unsigned id;
int converted_insns = 0;
if (!dbg_cnt (stv_conversion))
return 0;
if (dump_file)
fprintf (dump_file, "Converting chain #%d...\n", chain_id);
convert_registers ();
EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
{
PR target/106303: Fix TImode STV related failures on x86. This patch resolves PR target/106303 (and the related PRs 106347, 106404, 106407) which are ICEs caused by my improvements to x86_64's 128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies for the breakage. The issue is that data flow analysis is used to partition usage of each TImode pseudo into "chains", where each chain is analyzed and if suitable converted to vector operations. The problems appears when some chains for a pseudo are converted, and others aren't as RTL sharing can result in some mode changes leaking into other instructions that aren't/shouldn't/can't be converted, which eventually leads to an ICE for mismatched modes. My first approach to a fix was to unify more of the STV infrastructure, reasoning that if TImode STV was exhibiting these problems, but DImode and SImode STV weren't, the issue was likely to be caused/resolved by these remaining differences. This appeared to fix some but not all of the reported PRs. A better solution was then proposed by H.J. Lu in Bugzilla, that we need to iterate the removal of candidates in the function timode_remove_non_convertible_regs until there are no further changes. As each chain is removed from consideration, it in turn may affect whether other insns/chains can safely be converted. 2022-07-24 Roger Sayle <roger@nextmovesoftware.com> H.J. Lu <hjl.tools@gmail.com> gcc/ChangeLog PR target/106303 PR target/106347 * config/i386/i386-features.cc (make_vector_copies): Move from general_scalar_chain to scalar_chain. (convert_reg): Likewise. (convert_insn_common): New scalar_chain method split out from general_scalar_chain convert_insn. (convert_registers): Move from general_scalar_chain to scalar_chain. (scalar_chain::convert): Call convert_insn_common before calling convert_insn. (timode_remove_non_convertible_regs): Iterate until there are no further changes to the candidates. * config/i386/i386-features.h (scalar_chain::hash_map): Move from general_scalar_chain. (scalar_chain::convert_reg): Likewise. (scalar_chain::convert_insn_common): New shared method. (scalar_chain::make_vector_copies): Move from general_scalar_chain. (scalar_chain::convert_registers): Likewise. No longer virtual. (general_scalar_chain::hash_map): Delete. Moved to scalar_chain. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::make_vector_copies): Likewise. (general_scalar_chain::convert_registers): Delete virtual method. (timode_scalar_chain::convert_registers): Likewise. gcc/testsuite/ChangeLog PR target/106303 PR target/106347 * gcc.target/i386/pr106303.c: New test case. * gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
convert_insn_common (insn);
convert_insn (insn);
converted_insns++;
}
return converted_insns;
}
x86: Allow V1TI vector register pushes Add V1TI vector register push and split it after reload to a sequence of: (set (reg:P SP_REG) (plus:P SP_REG) (const_int -8))) (set (match_dup 0) (match_dup 1)) so that STV pass can convert TI mode integer push to V1TI vector register push. Rename has_non_address_hard_reg to pseudo_reg_set, combine calls of single_set and has_non_address_hard_reg to pseudo_reg_set, to ignore pseudo register push. Remove c-c++-common/dfp/func-vararg-mixed-2.c since it is compiled with -mpreferred-stack-boundary=2 and leads to segfault: Dump of assembler code for function __bid_nesd2: 0x08049210 <+0>: endbr32 0x08049214 <+4>: push %esi 0x08049215 <+5>: push %ebx 0x08049216 <+6>: call 0x8049130 <__x86.get_pc_thunk.bx> 0x0804921b <+11>: add $0x8de5,%ebx 0x08049221 <+17>: sub $0x20,%esp 0x08049224 <+20>: mov 0x30(%esp),%esi 0x08049228 <+24>: pushl 0x2c(%esp) 0x0804922c <+28>: call 0x804e600 <__bid32_to_bid64> 0x08049231 <+33>: mov %esi,(%esp) 0x08049234 <+36>: movd %edx,%xmm1 0x08049238 <+40>: movd %eax,%xmm0 0x0804923c <+44>: punpckldq %xmm1,%xmm0 => 0x08049240 <+48>: movaps %xmm0,0x10(%esp) 0x08049245 <+53>: call 0x804e600 <__bid32_to_bid64> 0x0804924a <+58>: push %edx 0x0804924b <+59>: push %eax 0x0804924c <+60>: pushl 0x1c(%esp) 0x08049250 <+64>: pushl 0x1c(%esp) 0x08049254 <+68>: call 0x804b260 <__bid64_quiet_not_equal> 0x08049259 <+73>: add $0x34,%esp 0x0804925c <+76>: pop %ebx 0x0804925d <+77>: pop %esi 0x0804925e <+78>: ret when libgcc is compiled with -msse2. According to GCC manual: '-mpreferred-stack-boundary=NUM' Attempt to keep the stack boundary aligned to a 2 raised to NUM byte boundary. If '-mpreferred-stack-boundary' is not specified, the default is 4 (16 bytes or 128-bits). *Warning:* If you use this switch, then you must build all modules with the same value, including any libraries. This includes the system libraries and startup modules. c-c++-common/dfp/func-vararg-mixed-2.c, which was added by commit 3b2488ca6ece182f2136a20ee5fa0bb92f935b0f Author: H.J. Lu <hongjiu.lu@intel.com> Date: Wed Jul 30 19:24:02 2008 +0000 func-vararg-alternate-d128-2.c: New. 2008-07-30 H.J. Lu <hongjiu.lu@intel.com> Joey Ye <joey.ye@intel.com> * gcc.dg/dfp/func-vararg-alternate-d128-2.c: New. * gcc.dg/dfp/func-vararg-mixed-2.c: Likewise. isn't expected to work with libgcc. gcc/ PR target/95021 * config/i386/i386-features.c (has_non_address_hard_reg): Renamed to ... (pseudo_reg_set): This. Return the SET expression. Ignore pseudo register push. (general_scalar_to_vector_candidate_p): Combine single_set and has_non_address_hard_reg calls to pseudo_reg_set. (timode_scalar_to_vector_candidate_p): Likewise. * config/i386/i386.md (*pushv1ti2): New pattern. gcc/testsuite/ PR target/95021 * c-c++-common/dfp/func-vararg-mixed-2.c: Removed. * gcc.target/i386/pr95021-1.c: New test. * gcc.target/i386/pr95021-2.c: Likewise. * gcc.target/i386/pr95021-3.c: Likewise. * gcc.target/i386/pr95021-4.c: Likewise. * gcc.target/i386/pr95021-5.c: Likewise.
2020-05-17 19:10:34 +02:00
/* Return the SET expression if INSN doesn't reference hard register.
Return NULL if INSN uses or defines a hard register, excluding
pseudo register pushes, hard register uses in a memory address,
clobbers and flags definitions. */
x86: Allow V1TI vector register pushes Add V1TI vector register push and split it after reload to a sequence of: (set (reg:P SP_REG) (plus:P SP_REG) (const_int -8))) (set (match_dup 0) (match_dup 1)) so that STV pass can convert TI mode integer push to V1TI vector register push. Rename has_non_address_hard_reg to pseudo_reg_set, combine calls of single_set and has_non_address_hard_reg to pseudo_reg_set, to ignore pseudo register push. Remove c-c++-common/dfp/func-vararg-mixed-2.c since it is compiled with -mpreferred-stack-boundary=2 and leads to segfault: Dump of assembler code for function __bid_nesd2: 0x08049210 <+0>: endbr32 0x08049214 <+4>: push %esi 0x08049215 <+5>: push %ebx 0x08049216 <+6>: call 0x8049130 <__x86.get_pc_thunk.bx> 0x0804921b <+11>: add $0x8de5,%ebx 0x08049221 <+17>: sub $0x20,%esp 0x08049224 <+20>: mov 0x30(%esp),%esi 0x08049228 <+24>: pushl 0x2c(%esp) 0x0804922c <+28>: call 0x804e600 <__bid32_to_bid64> 0x08049231 <+33>: mov %esi,(%esp) 0x08049234 <+36>: movd %edx,%xmm1 0x08049238 <+40>: movd %eax,%xmm0 0x0804923c <+44>: punpckldq %xmm1,%xmm0 => 0x08049240 <+48>: movaps %xmm0,0x10(%esp) 0x08049245 <+53>: call 0x804e600 <__bid32_to_bid64> 0x0804924a <+58>: push %edx 0x0804924b <+59>: push %eax 0x0804924c <+60>: pushl 0x1c(%esp) 0x08049250 <+64>: pushl 0x1c(%esp) 0x08049254 <+68>: call 0x804b260 <__bid64_quiet_not_equal> 0x08049259 <+73>: add $0x34,%esp 0x0804925c <+76>: pop %ebx 0x0804925d <+77>: pop %esi 0x0804925e <+78>: ret when libgcc is compiled with -msse2. According to GCC manual: '-mpreferred-stack-boundary=NUM' Attempt to keep the stack boundary aligned to a 2 raised to NUM byte boundary. If '-mpreferred-stack-boundary' is not specified, the default is 4 (16 bytes or 128-bits). *Warning:* If you use this switch, then you must build all modules with the same value, including any libraries. This includes the system libraries and startup modules. c-c++-common/dfp/func-vararg-mixed-2.c, which was added by commit 3b2488ca6ece182f2136a20ee5fa0bb92f935b0f Author: H.J. Lu <hongjiu.lu@intel.com> Date: Wed Jul 30 19:24:02 2008 +0000 func-vararg-alternate-d128-2.c: New. 2008-07-30 H.J. Lu <hongjiu.lu@intel.com> Joey Ye <joey.ye@intel.com> * gcc.dg/dfp/func-vararg-alternate-d128-2.c: New. * gcc.dg/dfp/func-vararg-mixed-2.c: Likewise. isn't expected to work with libgcc. gcc/ PR target/95021 * config/i386/i386-features.c (has_non_address_hard_reg): Renamed to ... (pseudo_reg_set): This. Return the SET expression. Ignore pseudo register push. (general_scalar_to_vector_candidate_p): Combine single_set and has_non_address_hard_reg calls to pseudo_reg_set. (timode_scalar_to_vector_candidate_p): Likewise. * config/i386/i386.md (*pushv1ti2): New pattern. gcc/testsuite/ PR target/95021 * c-c++-common/dfp/func-vararg-mixed-2.c: Removed. * gcc.target/i386/pr95021-1.c: New test. * gcc.target/i386/pr95021-2.c: Likewise. * gcc.target/i386/pr95021-3.c: Likewise. * gcc.target/i386/pr95021-4.c: Likewise. * gcc.target/i386/pr95021-5.c: Likewise.
2020-05-17 19:10:34 +02:00
static rtx
pseudo_reg_set (rtx_insn *insn)
{
x86: Allow V1TI vector register pushes Add V1TI vector register push and split it after reload to a sequence of: (set (reg:P SP_REG) (plus:P SP_REG) (const_int -8))) (set (match_dup 0) (match_dup 1)) so that STV pass can convert TI mode integer push to V1TI vector register push. Rename has_non_address_hard_reg to pseudo_reg_set, combine calls of single_set and has_non_address_hard_reg to pseudo_reg_set, to ignore pseudo register push. Remove c-c++-common/dfp/func-vararg-mixed-2.c since it is compiled with -mpreferred-stack-boundary=2 and leads to segfault: Dump of assembler code for function __bid_nesd2: 0x08049210 <+0>: endbr32 0x08049214 <+4>: push %esi 0x08049215 <+5>: push %ebx 0x08049216 <+6>: call 0x8049130 <__x86.get_pc_thunk.bx> 0x0804921b <+11>: add $0x8de5,%ebx 0x08049221 <+17>: sub $0x20,%esp 0x08049224 <+20>: mov 0x30(%esp),%esi 0x08049228 <+24>: pushl 0x2c(%esp) 0x0804922c <+28>: call 0x804e600 <__bid32_to_bid64> 0x08049231 <+33>: mov %esi,(%esp) 0x08049234 <+36>: movd %edx,%xmm1 0x08049238 <+40>: movd %eax,%xmm0 0x0804923c <+44>: punpckldq %xmm1,%xmm0 => 0x08049240 <+48>: movaps %xmm0,0x10(%esp) 0x08049245 <+53>: call 0x804e600 <__bid32_to_bid64> 0x0804924a <+58>: push %edx 0x0804924b <+59>: push %eax 0x0804924c <+60>: pushl 0x1c(%esp) 0x08049250 <+64>: pushl 0x1c(%esp) 0x08049254 <+68>: call 0x804b260 <__bid64_quiet_not_equal> 0x08049259 <+73>: add $0x34,%esp 0x0804925c <+76>: pop %ebx 0x0804925d <+77>: pop %esi 0x0804925e <+78>: ret when libgcc is compiled with -msse2. According to GCC manual: '-mpreferred-stack-boundary=NUM' Attempt to keep the stack boundary aligned to a 2 raised to NUM byte boundary. If '-mpreferred-stack-boundary' is not specified, the default is 4 (16 bytes or 128-bits). *Warning:* If you use this switch, then you must build all modules with the same value, including any libraries. This includes the system libraries and startup modules. c-c++-common/dfp/func-vararg-mixed-2.c, which was added by commit 3b2488ca6ece182f2136a20ee5fa0bb92f935b0f Author: H.J. Lu <hongjiu.lu@intel.com> Date: Wed Jul 30 19:24:02 2008 +0000 func-vararg-alternate-d128-2.c: New. 2008-07-30 H.J. Lu <hongjiu.lu@intel.com> Joey Ye <joey.ye@intel.com> * gcc.dg/dfp/func-vararg-alternate-d128-2.c: New. * gcc.dg/dfp/func-vararg-mixed-2.c: Likewise. isn't expected to work with libgcc. gcc/ PR target/95021 * config/i386/i386-features.c (has_non_address_hard_reg): Renamed to ... (pseudo_reg_set): This. Return the SET expression. Ignore pseudo register push. (general_scalar_to_vector_candidate_p): Combine single_set and has_non_address_hard_reg calls to pseudo_reg_set. (timode_scalar_to_vector_candidate_p): Likewise. * config/i386/i386.md (*pushv1ti2): New pattern. gcc/testsuite/ PR target/95021 * c-c++-common/dfp/func-vararg-mixed-2.c: Removed. * gcc.target/i386/pr95021-1.c: New test. * gcc.target/i386/pr95021-2.c: Likewise. * gcc.target/i386/pr95021-3.c: Likewise. * gcc.target/i386/pr95021-4.c: Likewise. * gcc.target/i386/pr95021-5.c: Likewise.
2020-05-17 19:10:34 +02:00
rtx set = single_set (insn);
if (!set)
return NULL;
/* Check pseudo register push first. */
machine_mode mode = TARGET_64BIT ? TImode : DImode;
x86: Allow V1TI vector register pushes Add V1TI vector register push and split it after reload to a sequence of: (set (reg:P SP_REG) (plus:P SP_REG) (const_int -8))) (set (match_dup 0) (match_dup 1)) so that STV pass can convert TI mode integer push to V1TI vector register push. Rename has_non_address_hard_reg to pseudo_reg_set, combine calls of single_set and has_non_address_hard_reg to pseudo_reg_set, to ignore pseudo register push. Remove c-c++-common/dfp/func-vararg-mixed-2.c since it is compiled with -mpreferred-stack-boundary=2 and leads to segfault: Dump of assembler code for function __bid_nesd2: 0x08049210 <+0>: endbr32 0x08049214 <+4>: push %esi 0x08049215 <+5>: push %ebx 0x08049216 <+6>: call 0x8049130 <__x86.get_pc_thunk.bx> 0x0804921b <+11>: add $0x8de5,%ebx 0x08049221 <+17>: sub $0x20,%esp 0x08049224 <+20>: mov 0x30(%esp),%esi 0x08049228 <+24>: pushl 0x2c(%esp) 0x0804922c <+28>: call 0x804e600 <__bid32_to_bid64> 0x08049231 <+33>: mov %esi,(%esp) 0x08049234 <+36>: movd %edx,%xmm1 0x08049238 <+40>: movd %eax,%xmm0 0x0804923c <+44>: punpckldq %xmm1,%xmm0 => 0x08049240 <+48>: movaps %xmm0,0x10(%esp) 0x08049245 <+53>: call 0x804e600 <__bid32_to_bid64> 0x0804924a <+58>: push %edx 0x0804924b <+59>: push %eax 0x0804924c <+60>: pushl 0x1c(%esp) 0x08049250 <+64>: pushl 0x1c(%esp) 0x08049254 <+68>: call 0x804b260 <__bid64_quiet_not_equal> 0x08049259 <+73>: add $0x34,%esp 0x0804925c <+76>: pop %ebx 0x0804925d <+77>: pop %esi 0x0804925e <+78>: ret when libgcc is compiled with -msse2. According to GCC manual: '-mpreferred-stack-boundary=NUM' Attempt to keep the stack boundary aligned to a 2 raised to NUM byte boundary. If '-mpreferred-stack-boundary' is not specified, the default is 4 (16 bytes or 128-bits). *Warning:* If you use this switch, then you must build all modules with the same value, including any libraries. This includes the system libraries and startup modules. c-c++-common/dfp/func-vararg-mixed-2.c, which was added by commit 3b2488ca6ece182f2136a20ee5fa0bb92f935b0f Author: H.J. Lu <hongjiu.lu@intel.com> Date: Wed Jul 30 19:24:02 2008 +0000 func-vararg-alternate-d128-2.c: New. 2008-07-30 H.J. Lu <hongjiu.lu@intel.com> Joey Ye <joey.ye@intel.com> * gcc.dg/dfp/func-vararg-alternate-d128-2.c: New. * gcc.dg/dfp/func-vararg-mixed-2.c: Likewise. isn't expected to work with libgcc. gcc/ PR target/95021 * config/i386/i386-features.c (has_non_address_hard_reg): Renamed to ... (pseudo_reg_set): This. Return the SET expression. Ignore pseudo register push. (general_scalar_to_vector_candidate_p): Combine single_set and has_non_address_hard_reg calls to pseudo_reg_set. (timode_scalar_to_vector_candidate_p): Likewise. * config/i386/i386.md (*pushv1ti2): New pattern. gcc/testsuite/ PR target/95021 * c-c++-common/dfp/func-vararg-mixed-2.c: Removed. * gcc.target/i386/pr95021-1.c: New test. * gcc.target/i386/pr95021-2.c: Likewise. * gcc.target/i386/pr95021-3.c: Likewise. * gcc.target/i386/pr95021-4.c: Likewise. * gcc.target/i386/pr95021-5.c: Likewise.
2020-05-17 19:10:34 +02:00
if (REG_P (SET_SRC (set))
&& !HARD_REGISTER_P (SET_SRC (set))
&& push_operand (SET_DEST (set), mode))
x86: Allow V1TI vector register pushes Add V1TI vector register push and split it after reload to a sequence of: (set (reg:P SP_REG) (plus:P SP_REG) (const_int -8))) (set (match_dup 0) (match_dup 1)) so that STV pass can convert TI mode integer push to V1TI vector register push. Rename has_non_address_hard_reg to pseudo_reg_set, combine calls of single_set and has_non_address_hard_reg to pseudo_reg_set, to ignore pseudo register push. Remove c-c++-common/dfp/func-vararg-mixed-2.c since it is compiled with -mpreferred-stack-boundary=2 and leads to segfault: Dump of assembler code for function __bid_nesd2: 0x08049210 <+0>: endbr32 0x08049214 <+4>: push %esi 0x08049215 <+5>: push %ebx 0x08049216 <+6>: call 0x8049130 <__x86.get_pc_thunk.bx> 0x0804921b <+11>: add $0x8de5,%ebx 0x08049221 <+17>: sub $0x20,%esp 0x08049224 <+20>: mov 0x30(%esp),%esi 0x08049228 <+24>: pushl 0x2c(%esp) 0x0804922c <+28>: call 0x804e600 <__bid32_to_bid64> 0x08049231 <+33>: mov %esi,(%esp) 0x08049234 <+36>: movd %edx,%xmm1 0x08049238 <+40>: movd %eax,%xmm0 0x0804923c <+44>: punpckldq %xmm1,%xmm0 => 0x08049240 <+48>: movaps %xmm0,0x10(%esp) 0x08049245 <+53>: call 0x804e600 <__bid32_to_bid64> 0x0804924a <+58>: push %edx 0x0804924b <+59>: push %eax 0x0804924c <+60>: pushl 0x1c(%esp) 0x08049250 <+64>: pushl 0x1c(%esp) 0x08049254 <+68>: call 0x804b260 <__bid64_quiet_not_equal> 0x08049259 <+73>: add $0x34,%esp 0x0804925c <+76>: pop %ebx 0x0804925d <+77>: pop %esi 0x0804925e <+78>: ret when libgcc is compiled with -msse2. According to GCC manual: '-mpreferred-stack-boundary=NUM' Attempt to keep the stack boundary aligned to a 2 raised to NUM byte boundary. If '-mpreferred-stack-boundary' is not specified, the default is 4 (16 bytes or 128-bits). *Warning:* If you use this switch, then you must build all modules with the same value, including any libraries. This includes the system libraries and startup modules. c-c++-common/dfp/func-vararg-mixed-2.c, which was added by commit 3b2488ca6ece182f2136a20ee5fa0bb92f935b0f Author: H.J. Lu <hongjiu.lu@intel.com> Date: Wed Jul 30 19:24:02 2008 +0000 func-vararg-alternate-d128-2.c: New. 2008-07-30 H.J. Lu <hongjiu.lu@intel.com> Joey Ye <joey.ye@intel.com> * gcc.dg/dfp/func-vararg-alternate-d128-2.c: New. * gcc.dg/dfp/func-vararg-mixed-2.c: Likewise. isn't expected to work with libgcc. gcc/ PR target/95021 * config/i386/i386-features.c (has_non_address_hard_reg): Renamed to ... (pseudo_reg_set): This. Return the SET expression. Ignore pseudo register push. (general_scalar_to_vector_candidate_p): Combine single_set and has_non_address_hard_reg calls to pseudo_reg_set. (timode_scalar_to_vector_candidate_p): Likewise. * config/i386/i386.md (*pushv1ti2): New pattern. gcc/testsuite/ PR target/95021 * c-c++-common/dfp/func-vararg-mixed-2.c: Removed. * gcc.target/i386/pr95021-1.c: New test. * gcc.target/i386/pr95021-2.c: Likewise. * gcc.target/i386/pr95021-3.c: Likewise. * gcc.target/i386/pr95021-4.c: Likewise. * gcc.target/i386/pr95021-5.c: Likewise.
2020-05-17 19:10:34 +02:00
return set;
df_ref ref;
FOR_EACH_INSN_DEF (ref, insn)
if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
&& !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
&& DF_REF_REGNO (ref) != FLAGS_REG)
x86: Allow V1TI vector register pushes Add V1TI vector register push and split it after reload to a sequence of: (set (reg:P SP_REG) (plus:P SP_REG) (const_int -8))) (set (match_dup 0) (match_dup 1)) so that STV pass can convert TI mode integer push to V1TI vector register push. Rename has_non_address_hard_reg to pseudo_reg_set, combine calls of single_set and has_non_address_hard_reg to pseudo_reg_set, to ignore pseudo register push. Remove c-c++-common/dfp/func-vararg-mixed-2.c since it is compiled with -mpreferred-stack-boundary=2 and leads to segfault: Dump of assembler code for function __bid_nesd2: 0x08049210 <+0>: endbr32 0x08049214 <+4>: push %esi 0x08049215 <+5>: push %ebx 0x08049216 <+6>: call 0x8049130 <__x86.get_pc_thunk.bx> 0x0804921b <+11>: add $0x8de5,%ebx 0x08049221 <+17>: sub $0x20,%esp 0x08049224 <+20>: mov 0x30(%esp),%esi 0x08049228 <+24>: pushl 0x2c(%esp) 0x0804922c <+28>: call 0x804e600 <__bid32_to_bid64> 0x08049231 <+33>: mov %esi,(%esp) 0x08049234 <+36>: movd %edx,%xmm1 0x08049238 <+40>: movd %eax,%xmm0 0x0804923c <+44>: punpckldq %xmm1,%xmm0 => 0x08049240 <+48>: movaps %xmm0,0x10(%esp) 0x08049245 <+53>: call 0x804e600 <__bid32_to_bid64> 0x0804924a <+58>: push %edx 0x0804924b <+59>: push %eax 0x0804924c <+60>: pushl 0x1c(%esp) 0x08049250 <+64>: pushl 0x1c(%esp) 0x08049254 <+68>: call 0x804b260 <__bid64_quiet_not_equal> 0x08049259 <+73>: add $0x34,%esp 0x0804925c <+76>: pop %ebx 0x0804925d <+77>: pop %esi 0x0804925e <+78>: ret when libgcc is compiled with -msse2. According to GCC manual: '-mpreferred-stack-boundary=NUM' Attempt to keep the stack boundary aligned to a 2 raised to NUM byte boundary. If '-mpreferred-stack-boundary' is not specified, the default is 4 (16 bytes or 128-bits). *Warning:* If you use this switch, then you must build all modules with the same value, including any libraries. This includes the system libraries and startup modules. c-c++-common/dfp/func-vararg-mixed-2.c, which was added by commit 3b2488ca6ece182f2136a20ee5fa0bb92f935b0f Author: H.J. Lu <hongjiu.lu@intel.com> Date: Wed Jul 30 19:24:02 2008 +0000 func-vararg-alternate-d128-2.c: New. 2008-07-30 H.J. Lu <hongjiu.lu@intel.com> Joey Ye <joey.ye@intel.com> * gcc.dg/dfp/func-vararg-alternate-d128-2.c: New. * gcc.dg/dfp/func-vararg-mixed-2.c: Likewise. isn't expected to work with libgcc. gcc/ PR target/95021 * config/i386/i386-features.c (has_non_address_hard_reg): Renamed to ... (pseudo_reg_set): This. Return the SET expression. Ignore pseudo register push. (general_scalar_to_vector_candidate_p): Combine single_set and has_non_address_hard_reg calls to pseudo_reg_set. (timode_scalar_to_vector_candidate_p): Likewise. * config/i386/i386.md (*pushv1ti2): New pattern. gcc/testsuite/ PR target/95021 * c-c++-common/dfp/func-vararg-mixed-2.c: Removed. * gcc.target/i386/pr95021-1.c: New test. * gcc.target/i386/pr95021-2.c: Likewise. * gcc.target/i386/pr95021-3.c: Likewise. * gcc.target/i386/pr95021-4.c: Likewise. * gcc.target/i386/pr95021-5.c: Likewise.
2020-05-17 19:10:34 +02:00
return NULL;
FOR_EACH_INSN_USE (ref, insn)
if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
x86: Allow V1TI vector register pushes Add V1TI vector register push and split it after reload to a sequence of: (set (reg:P SP_REG) (plus:P SP_REG) (const_int -8))) (set (match_dup 0) (match_dup 1)) so that STV pass can convert TI mode integer push to V1TI vector register push. Rename has_non_address_hard_reg to pseudo_reg_set, combine calls of single_set and has_non_address_hard_reg to pseudo_reg_set, to ignore pseudo register push. Remove c-c++-common/dfp/func-vararg-mixed-2.c since it is compiled with -mpreferred-stack-boundary=2 and leads to segfault: Dump of assembler code for function __bid_nesd2: 0x08049210 <+0>: endbr32 0x08049214 <+4>: push %esi 0x08049215 <+5>: push %ebx 0x08049216 <+6>: call 0x8049130 <__x86.get_pc_thunk.bx> 0x0804921b <+11>: add $0x8de5,%ebx 0x08049221 <+17>: sub $0x20,%esp 0x08049224 <+20>: mov 0x30(%esp),%esi 0x08049228 <+24>: pushl 0x2c(%esp) 0x0804922c <+28>: call 0x804e600 <__bid32_to_bid64> 0x08049231 <+33>: mov %esi,(%esp) 0x08049234 <+36>: movd %edx,%xmm1 0x08049238 <+40>: movd %eax,%xmm0 0x0804923c <+44>: punpckldq %xmm1,%xmm0 => 0x08049240 <+48>: movaps %xmm0,0x10(%esp) 0x08049245 <+53>: call 0x804e600 <__bid32_to_bid64> 0x0804924a <+58>: push %edx 0x0804924b <+59>: push %eax 0x0804924c <+60>: pushl 0x1c(%esp) 0x08049250 <+64>: pushl 0x1c(%esp) 0x08049254 <+68>: call 0x804b260 <__bid64_quiet_not_equal> 0x08049259 <+73>: add $0x34,%esp 0x0804925c <+76>: pop %ebx 0x0804925d <+77>: pop %esi 0x0804925e <+78>: ret when libgcc is compiled with -msse2. According to GCC manual: '-mpreferred-stack-boundary=NUM' Attempt to keep the stack boundary aligned to a 2 raised to NUM byte boundary. If '-mpreferred-stack-boundary' is not specified, the default is 4 (16 bytes or 128-bits). *Warning:* If you use this switch, then you must build all modules with the same value, including any libraries. This includes the system libraries and startup modules. c-c++-common/dfp/func-vararg-mixed-2.c, which was added by commit 3b2488ca6ece182f2136a20ee5fa0bb92f935b0f Author: H.J. Lu <hongjiu.lu@intel.com> Date: Wed Jul 30 19:24:02 2008 +0000 func-vararg-alternate-d128-2.c: New. 2008-07-30 H.J. Lu <hongjiu.lu@intel.com> Joey Ye <joey.ye@intel.com> * gcc.dg/dfp/func-vararg-alternate-d128-2.c: New. * gcc.dg/dfp/func-vararg-mixed-2.c: Likewise. isn't expected to work with libgcc. gcc/ PR target/95021 * config/i386/i386-features.c (has_non_address_hard_reg): Renamed to ... (pseudo_reg_set): This. Return the SET expression. Ignore pseudo register push. (general_scalar_to_vector_candidate_p): Combine single_set and has_non_address_hard_reg calls to pseudo_reg_set. (timode_scalar_to_vector_candidate_p): Likewise. * config/i386/i386.md (*pushv1ti2): New pattern. gcc/testsuite/ PR target/95021 * c-c++-common/dfp/func-vararg-mixed-2.c: Removed. * gcc.target/i386/pr95021-1.c: New test. * gcc.target/i386/pr95021-2.c: Likewise. * gcc.target/i386/pr95021-3.c: Likewise. * gcc.target/i386/pr95021-4.c: Likewise. * gcc.target/i386/pr95021-5.c: Likewise.
2020-05-17 19:10:34 +02:00
return NULL;
x86: Allow V1TI vector register pushes Add V1TI vector register push and split it after reload to a sequence of: (set (reg:P SP_REG) (plus:P SP_REG) (const_int -8))) (set (match_dup 0) (match_dup 1)) so that STV pass can convert TI mode integer push to V1TI vector register push. Rename has_non_address_hard_reg to pseudo_reg_set, combine calls of single_set and has_non_address_hard_reg to pseudo_reg_set, to ignore pseudo register push. Remove c-c++-common/dfp/func-vararg-mixed-2.c since it is compiled with -mpreferred-stack-boundary=2 and leads to segfault: Dump of assembler code for function __bid_nesd2: 0x08049210 <+0>: endbr32 0x08049214 <+4>: push %esi 0x08049215 <+5>: push %ebx 0x08049216 <+6>: call 0x8049130 <__x86.get_pc_thunk.bx> 0x0804921b <+11>: add $0x8de5,%ebx 0x08049221 <+17>: sub $0x20,%esp 0x08049224 <+20>: mov 0x30(%esp),%esi 0x08049228 <+24>: pushl 0x2c(%esp) 0x0804922c <+28>: call 0x804e600 <__bid32_to_bid64> 0x08049231 <+33>: mov %esi,(%esp) 0x08049234 <+36>: movd %edx,%xmm1 0x08049238 <+40>: movd %eax,%xmm0 0x0804923c <+44>: punpckldq %xmm1,%xmm0 => 0x08049240 <+48>: movaps %xmm0,0x10(%esp) 0x08049245 <+53>: call 0x804e600 <__bid32_to_bid64> 0x0804924a <+58>: push %edx 0x0804924b <+59>: push %eax 0x0804924c <+60>: pushl 0x1c(%esp) 0x08049250 <+64>: pushl 0x1c(%esp) 0x08049254 <+68>: call 0x804b260 <__bid64_quiet_not_equal> 0x08049259 <+73>: add $0x34,%esp 0x0804925c <+76>: pop %ebx 0x0804925d <+77>: pop %esi 0x0804925e <+78>: ret when libgcc is compiled with -msse2. According to GCC manual: '-mpreferred-stack-boundary=NUM' Attempt to keep the stack boundary aligned to a 2 raised to NUM byte boundary. If '-mpreferred-stack-boundary' is not specified, the default is 4 (16 bytes or 128-bits). *Warning:* If you use this switch, then you must build all modules with the same value, including any libraries. This includes the system libraries and startup modules. c-c++-common/dfp/func-vararg-mixed-2.c, which was added by commit 3b2488ca6ece182f2136a20ee5fa0bb92f935b0f Author: H.J. Lu <hongjiu.lu@intel.com> Date: Wed Jul 30 19:24:02 2008 +0000 func-vararg-alternate-d128-2.c: New. 2008-07-30 H.J. Lu <hongjiu.lu@intel.com> Joey Ye <joey.ye@intel.com> * gcc.dg/dfp/func-vararg-alternate-d128-2.c: New. * gcc.dg/dfp/func-vararg-mixed-2.c: Likewise. isn't expected to work with libgcc. gcc/ PR target/95021 * config/i386/i386-features.c (has_non_address_hard_reg): Renamed to ... (pseudo_reg_set): This. Return the SET expression. Ignore pseudo register push. (general_scalar_to_vector_candidate_p): Combine single_set and has_non_address_hard_reg calls to pseudo_reg_set. (timode_scalar_to_vector_candidate_p): Likewise. * config/i386/i386.md (*pushv1ti2): New pattern. gcc/testsuite/ PR target/95021 * c-c++-common/dfp/func-vararg-mixed-2.c: Removed. * gcc.target/i386/pr95021-1.c: New test. * gcc.target/i386/pr95021-2.c: Likewise. * gcc.target/i386/pr95021-3.c: Likewise. * gcc.target/i386/pr95021-4.c: Likewise. * gcc.target/i386/pr95021-5.c: Likewise.
2020-05-17 19:10:34 +02:00
return set;
}
PR target/70321: Split double word equality/inequality after STV on x86. This patch resolves the last piece of PR target/70321 a code quality (P2 regression) affecting mainline. Currently, for HJ's testcase: void foo (long long ixi) { if (ixi != 14348907) __builtin_abort (); } GCC with -m32 -O2 generates four instructions for the comparison: movl 16(%esp), %eax movl 20(%esp), %edx xorl $14348907, %eax orl %eax, %edx but with this patch it now requires only three, making better use of x86's addressing modes: movl 16(%esp), %eax xorl $14348907, %eax orl 20(%esp), %eax The solution is to expand "doubleword" equality/inequality expressions using flag setting COMPARE instructions for the early RTL passes, and then split them during split1, after STV and before reload. Hence on x86_64, we now see/allow things like: (insn 11 8 12 2 (set (reg:CCZ 17 flags) (compare:CCZ (reg/v:TI 84 [ x ]) (reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword} This allows the STV pass to decide whether it's preferrable to perform this comparison using vector operations, i.e. a pxor/ptest sequence, or as scalar integer operations, i.e. a xor/xor/or sequence. Alas this required tweaking of the STV pass to recognize the "new" form of these comparisons and split out the pxor operation itself. To confirm this still works as expected I've added a new STV test case: long long a[1024]; long long b[1024]; int foo() { for (int i=0; i<1024; i++) { long long t = (a[i]<<8) | (b[i]<<24); if (t == 0) return 1; } return 0; } where with -m32 -O2 -msse4.1 the above comparison with zero should look like: punpcklqdq %xmm0, %xmm0 ptest %xmm0, %xmm0 Although this patch includes one or two minor tweaks to provide all the necessary infrastructure to support conversion of TImode comparisons to V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement these transformations, but this is something that can be considered after stage 4. Indeed the new convert_compare functionality is split out into a method to simplify its potential reuse by the timode_scalar_chain class. 2022-05-30 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/70321 * config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose DI mode equality/inequality using XOR here. Instead generate a COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode). * config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode. (general_scalar_chain::convert_compare): New function to convert scalar equality/inequality comparison into vector operations. (general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call new convert_compare helper method. (convertible_comparion_p): Update to match doubleword COMPARE of two register, memory or integer constant operands. * config/i386/i386-features.h (general_scalar_chain::convert_compare): Prototype/declare member function here. * config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but only allow new doubleword modes for EQ and NE operators. (*cmp<dwi>_doubleword): New define_insn_and_split, to split a doubleword comparison into a pair of XORs followed by an IOR to set the (zero) flags register, optimizing the XORs if possible. * config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode iterator; V_AVX is (currently) only used by ptest. (sse4_1 mode attribute): Update to support V1TI and V2TI. gcc/testsuite/ChangeLog PR target/70321 * gcc.target/i386/pr70321.c: New test case. * gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
/* Check if comparison INSN may be transformed into vector comparison.
Currently we transform equality/inequality checks which look like:
(set (reg:CCZ 17 flags) (compare:CCZ (reg:TI x) (reg:TI y))) */
static bool
convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)
{
PR target/70321: Split double word equality/inequality after STV on x86. This patch resolves the last piece of PR target/70321 a code quality (P2 regression) affecting mainline. Currently, for HJ's testcase: void foo (long long ixi) { if (ixi != 14348907) __builtin_abort (); } GCC with -m32 -O2 generates four instructions for the comparison: movl 16(%esp), %eax movl 20(%esp), %edx xorl $14348907, %eax orl %eax, %edx but with this patch it now requires only three, making better use of x86's addressing modes: movl 16(%esp), %eax xorl $14348907, %eax orl 20(%esp), %eax The solution is to expand "doubleword" equality/inequality expressions using flag setting COMPARE instructions for the early RTL passes, and then split them during split1, after STV and before reload. Hence on x86_64, we now see/allow things like: (insn 11 8 12 2 (set (reg:CCZ 17 flags) (compare:CCZ (reg/v:TI 84 [ x ]) (reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword} This allows the STV pass to decide whether it's preferrable to perform this comparison using vector operations, i.e. a pxor/ptest sequence, or as scalar integer operations, i.e. a xor/xor/or sequence. Alas this required tweaking of the STV pass to recognize the "new" form of these comparisons and split out the pxor operation itself. To confirm this still works as expected I've added a new STV test case: long long a[1024]; long long b[1024]; int foo() { for (int i=0; i<1024; i++) { long long t = (a[i]<<8) | (b[i]<<24); if (t == 0) return 1; } return 0; } where with -m32 -O2 -msse4.1 the above comparison with zero should look like: punpcklqdq %xmm0, %xmm0 ptest %xmm0, %xmm0 Although this patch includes one or two minor tweaks to provide all the necessary infrastructure to support conversion of TImode comparisons to V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement these transformations, but this is something that can be considered after stage 4. Indeed the new convert_compare functionality is split out into a method to simplify its potential reuse by the timode_scalar_chain class. 2022-05-30 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/70321 * config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose DI mode equality/inequality using XOR here. Instead generate a COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode). * config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode. (general_scalar_chain::convert_compare): New function to convert scalar equality/inequality comparison into vector operations. (general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call new convert_compare helper method. (convertible_comparion_p): Update to match doubleword COMPARE of two register, memory or integer constant operands. * config/i386/i386-features.h (general_scalar_chain::convert_compare): Prototype/declare member function here. * config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but only allow new doubleword modes for EQ and NE operators. (*cmp<dwi>_doubleword): New define_insn_and_split, to split a doubleword comparison into a pair of XORs followed by an IOR to set the (zero) flags register, optimizing the XORs if possible. * config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode iterator; V_AVX is (currently) only used by ptest. (sse4_1 mode attribute): Update to support V1TI and V2TI. gcc/testsuite/ChangeLog PR target/70321 * gcc.target/i386/pr70321.c: New test case. * gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
if (mode != (TARGET_64BIT ? TImode : DImode))
return false;
if (!TARGET_SSE4_1)
return false;
rtx def_set = single_set (insn);
gcc_assert (def_set);
rtx src = SET_SRC (def_set);
rtx dst = SET_DEST (def_set);
gcc_assert (GET_CODE (src) == COMPARE);
if (GET_CODE (dst) != REG
|| REGNO (dst) != FLAGS_REG
|| GET_MODE (dst) != CCZmode)
return false;
rtx op1 = XEXP (src, 0);
rtx op2 = XEXP (src, 1);
/* *cmp<dwi>_doubleword. */
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64. This x86_64 backend patch allows TImode STV to take advantage of the fact that the PTEST instruction performs an AND operation. Previously PTEST was (mostly) used for comparison against zero, by using the same operands. The benefits are demonstrated by the new test case: __int128 a,b; int foo() { return (a & b) != 0; } Currently with -O2 -msse4 we generate: movdqa a(%rip), %xmm0 pand b(%rip), %xmm0 xorl %eax, %eax ptest %xmm0, %xmm0 setne %al ret with this patch we now generate: movdqa a(%rip), %xmm0 xorl %eax, %eax ptest b(%rip), %xmm0 setne %al ret Technically, the magic happens using new define_insn_and_split patterns. Using two patterns allows this transformation to performed independently of whether TImode STV is run before or after combine. The one tricky case is that immediate constant operands of the AND behave slightly differently between TImode and V1TImode: All V1TImode immediate operands becomes loads, but for TImode only values that are not hilo_operands need to be loaded. Hence the new *testti_doubleword accepts any general_operand, but internally during split calls force_reg whenever the second operand is not x86_64_hilo_general_operand. This required (benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P. 2022-08-09 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.cc (scalar_chain::convert_compare): Create new pseudos only when/if needed. Add support for TEST, i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST. When broadcasting V2DImode and V4SImode use new pseudo register. (timode_scalar_chain::convert_op): Do nothing if operand is already V1TImode. Avoid generating useless SUBREG conversions, i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT in addition to CONST_INT by using CONST_SCALAR_INT_P. (convertible_comparison_p): Use CONST_SCALAR_INT_P to match both CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword pattern as an STV candidate. (timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P operands in binary logic operations. * config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs for UNSPEC_PTEST; a PTEST that performs an AND has the same cost as regular PTEST, i.e. cost->sse_op. * config/i386/i386.md (*testti_doubleword): New pre-reload define_insn_and_split that recognizes comparison of TI mode AND against zero. * config/i386/sse.md (*ptest<mode>_and): New pre-reload define_insn_and_split that recognizes UNSPEC_PTEST of identical AND operands. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
if ((CONST_SCALAR_INT_P (op1)
|| ((REG_P (op1) || MEM_P (op1))
PR target/106278: Keep REG_EQUAL notes consistent during TImode STV on x86_64. This patch resolves PR target/106278 a regression on x86_64 caused by my recent TImode STV improvements. Now that TImode STV can handle comparisons such as "(set (regs:CC) (compare:CC (reg:TI) ...))" the convert_insn method sensibly checks that the mode of the SET_DEST is TImode before setting it to V1TImode [to avoid V1TImode appearing on the hard reg CC_FLAGS. Hence the current code looks like: if (GET_MODE (dst) == TImode) { tmp = find_reg_equal_equiv_note (insn); if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode) PUT_MODE (XEXP (tmp, 0), V1TImode); PUT_MODE (dst, V1TImode); fix_debug_reg_uses (dst); } break; which checks GET_MODE (dst) before calling PUT_MODE, and when a change is made updating the REG_EQUAL_NOTE tmp if it exists. The logical flaw (oversight) is that due to RTL sharing, the destination of this set may already have been updated to V1TImode, as this chain is being converted, but we still need to update any REG_EQUAL_NOTE that still has TImode. Hence the correct code is actually: if (GET_MODE (dst) == TImode) { PUT_MODE (dst, V1TImode); fix_debug_reg_uses (dst); } if (GET_MODE (dst) == V1TImode) { tmp = find_reg_equal_equiv_note (insn); if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode) PUT_MODE (XEXP (tmp, 0), V1TImode); } break; While fixing this behavior, I noticed I had some indentation whitespace issues and some vestigial dead code in this function/method that I've taken the liberty of cleaning up (as obvious) in this patch. 2022-07-15 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/106278 * config/i386/i386-features.cc (general_scalar_chain::convert_insn): Fix indentation whitespace. (timode_scalar_chain::fix_debug_reg_uses): Likewise. (timode_scalar_chain::convert_insn): Delete dead code. Update TImode REG_EQUAL_NOTE even if the SET_DEST is already V1TI. Fix indentation whitespace. (convertible_comparison_p): Likewise. (timode_scalar_to_vector_candidate_p): Likewise. gcc/testsuite/ChangeLog * gcc.dg/pr106278.c: New test case.
2022-07-15 15:39:28 +02:00
&& GET_MODE (op1) == mode))
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64. This x86_64 backend patch allows TImode STV to take advantage of the fact that the PTEST instruction performs an AND operation. Previously PTEST was (mostly) used for comparison against zero, by using the same operands. The benefits are demonstrated by the new test case: __int128 a,b; int foo() { return (a & b) != 0; } Currently with -O2 -msse4 we generate: movdqa a(%rip), %xmm0 pand b(%rip), %xmm0 xorl %eax, %eax ptest %xmm0, %xmm0 setne %al ret with this patch we now generate: movdqa a(%rip), %xmm0 xorl %eax, %eax ptest b(%rip), %xmm0 setne %al ret Technically, the magic happens using new define_insn_and_split patterns. Using two patterns allows this transformation to performed independently of whether TImode STV is run before or after combine. The one tricky case is that immediate constant operands of the AND behave slightly differently between TImode and V1TImode: All V1TImode immediate operands becomes loads, but for TImode only values that are not hilo_operands need to be loaded. Hence the new *testti_doubleword accepts any general_operand, but internally during split calls force_reg whenever the second operand is not x86_64_hilo_general_operand. This required (benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P. 2022-08-09 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.cc (scalar_chain::convert_compare): Create new pseudos only when/if needed. Add support for TEST, i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST. When broadcasting V2DImode and V4SImode use new pseudo register. (timode_scalar_chain::convert_op): Do nothing if operand is already V1TImode. Avoid generating useless SUBREG conversions, i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT in addition to CONST_INT by using CONST_SCALAR_INT_P. (convertible_comparison_p): Use CONST_SCALAR_INT_P to match both CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword pattern as an STV candidate. (timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P operands in binary logic operations. * config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs for UNSPEC_PTEST; a PTEST that performs an AND has the same cost as regular PTEST, i.e. cost->sse_op. * config/i386/i386.md (*testti_doubleword): New pre-reload define_insn_and_split that recognizes comparison of TI mode AND against zero. * config/i386/sse.md (*ptest<mode>_and): New pre-reload define_insn_and_split that recognizes UNSPEC_PTEST of identical AND operands. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
&& (CONST_SCALAR_INT_P (op2)
|| ((REG_P (op2) || MEM_P (op2))
&& GET_MODE (op2) == mode)))
return true;
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64. This x86_64 backend patch allows TImode STV to take advantage of the fact that the PTEST instruction performs an AND operation. Previously PTEST was (mostly) used for comparison against zero, by using the same operands. The benefits are demonstrated by the new test case: __int128 a,b; int foo() { return (a & b) != 0; } Currently with -O2 -msse4 we generate: movdqa a(%rip), %xmm0 pand b(%rip), %xmm0 xorl %eax, %eax ptest %xmm0, %xmm0 setne %al ret with this patch we now generate: movdqa a(%rip), %xmm0 xorl %eax, %eax ptest b(%rip), %xmm0 setne %al ret Technically, the magic happens using new define_insn_and_split patterns. Using two patterns allows this transformation to performed independently of whether TImode STV is run before or after combine. The one tricky case is that immediate constant operands of the AND behave slightly differently between TImode and V1TImode: All V1TImode immediate operands becomes loads, but for TImode only values that are not hilo_operands need to be loaded. Hence the new *testti_doubleword accepts any general_operand, but internally during split calls force_reg whenever the second operand is not x86_64_hilo_general_operand. This required (benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P. 2022-08-09 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.cc (scalar_chain::convert_compare): Create new pseudos only when/if needed. Add support for TEST, i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST. When broadcasting V2DImode and V4SImode use new pseudo register. (timode_scalar_chain::convert_op): Do nothing if operand is already V1TImode. Avoid generating useless SUBREG conversions, i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT in addition to CONST_INT by using CONST_SCALAR_INT_P. (convertible_comparison_p): Use CONST_SCALAR_INT_P to match both CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword pattern as an STV candidate. (timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P operands in binary logic operations. * config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs for UNSPEC_PTEST; a PTEST that performs an AND has the same cost as regular PTEST, i.e. cost->sse_op. * config/i386/i386.md (*testti_doubleword): New pre-reload define_insn_and_split that recognizes comparison of TI mode AND against zero. * config/i386/sse.md (*ptest<mode>_and): New pre-reload define_insn_and_split that recognizes UNSPEC_PTEST of identical AND operands. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
/* *testti_doubleword. */
if (op2 == const0_rtx
&& GET_CODE (op1) == AND
&& REG_P (XEXP (op1, 0)))
{
rtx op12 = XEXP (op1, 1);
return GET_MODE (XEXP (op1, 0)) == TImode
&& (CONST_SCALAR_INT_P (op12)
|| ((REG_P (op12) || MEM_P (op12))
&& GET_MODE (op12) == TImode));
}
/* *test<dwi>_not_doubleword. */
if (op2 == const0_rtx
&& GET_CODE (op1) == AND
&& GET_CODE (XEXP (op1, 0)) == NOT)
{
rtx op11 = XEXP (XEXP (op1, 0), 0);
rtx op12 = XEXP (op1, 1);
return (REG_P (op11) || MEM_P (op11))
&& (REG_P (op12) || MEM_P (op12))
&& GET_MODE (op11) == mode
&& GET_MODE (op12) == mode;
}
return false;
}
/* The general version of scalar_to_vector_candidate_p. */
static bool
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
{
x86: Allow V1TI vector register pushes Add V1TI vector register push and split it after reload to a sequence of: (set (reg:P SP_REG) (plus:P SP_REG) (const_int -8))) (set (match_dup 0) (match_dup 1)) so that STV pass can convert TI mode integer push to V1TI vector register push. Rename has_non_address_hard_reg to pseudo_reg_set, combine calls of single_set and has_non_address_hard_reg to pseudo_reg_set, to ignore pseudo register push. Remove c-c++-common/dfp/func-vararg-mixed-2.c since it is compiled with -mpreferred-stack-boundary=2 and leads to segfault: Dump of assembler code for function __bid_nesd2: 0x08049210 <+0>: endbr32 0x08049214 <+4>: push %esi 0x08049215 <+5>: push %ebx 0x08049216 <+6>: call 0x8049130 <__x86.get_pc_thunk.bx> 0x0804921b <+11>: add $0x8de5,%ebx 0x08049221 <+17>: sub $0x20,%esp 0x08049224 <+20>: mov 0x30(%esp),%esi 0x08049228 <+24>: pushl 0x2c(%esp) 0x0804922c <+28>: call 0x804e600 <__bid32_to_bid64> 0x08049231 <+33>: mov %esi,(%esp) 0x08049234 <+36>: movd %edx,%xmm1 0x08049238 <+40>: movd %eax,%xmm0 0x0804923c <+44>: punpckldq %xmm1,%xmm0 => 0x08049240 <+48>: movaps %xmm0,0x10(%esp) 0x08049245 <+53>: call 0x804e600 <__bid32_to_bid64> 0x0804924a <+58>: push %edx 0x0804924b <+59>: push %eax 0x0804924c <+60>: pushl 0x1c(%esp) 0x08049250 <+64>: pushl 0x1c(%esp) 0x08049254 <+68>: call 0x804b260 <__bid64_quiet_not_equal> 0x08049259 <+73>: add $0x34,%esp 0x0804925c <+76>: pop %ebx 0x0804925d <+77>: pop %esi 0x0804925e <+78>: ret when libgcc is compiled with -msse2. According to GCC manual: '-mpreferred-stack-boundary=NUM' Attempt to keep the stack boundary aligned to a 2 raised to NUM byte boundary. If '-mpreferred-stack-boundary' is not specified, the default is 4 (16 bytes or 128-bits). *Warning:* If you use this switch, then you must build all modules with the same value, including any libraries. This includes the system libraries and startup modules. c-c++-common/dfp/func-vararg-mixed-2.c, which was added by commit 3b2488ca6ece182f2136a20ee5fa0bb92f935b0f Author: H.J. Lu <hongjiu.lu@intel.com> Date: Wed Jul 30 19:24:02 2008 +0000 func-vararg-alternate-d128-2.c: New. 2008-07-30 H.J. Lu <hongjiu.lu@intel.com> Joey Ye <joey.ye@intel.com> * gcc.dg/dfp/func-vararg-alternate-d128-2.c: New. * gcc.dg/dfp/func-vararg-mixed-2.c: Likewise. isn't expected to work with libgcc. gcc/ PR target/95021 * config/i386/i386-features.c (has_non_address_hard_reg): Renamed to ... (pseudo_reg_set): This. Return the SET expression. Ignore pseudo register push. (general_scalar_to_vector_candidate_p): Combine single_set and has_non_address_hard_reg calls to pseudo_reg_set. (timode_scalar_to_vector_candidate_p): Likewise. * config/i386/i386.md (*pushv1ti2): New pattern. gcc/testsuite/ PR target/95021 * c-c++-common/dfp/func-vararg-mixed-2.c: Removed. * gcc.target/i386/pr95021-1.c: New test. * gcc.target/i386/pr95021-2.c: Likewise. * gcc.target/i386/pr95021-3.c: Likewise. * gcc.target/i386/pr95021-4.c: Likewise. * gcc.target/i386/pr95021-5.c: Likewise.
2020-05-17 19:10:34 +02:00
rtx def_set = pseudo_reg_set (insn);
if (!def_set)
return false;
rtx src = SET_SRC (def_set);
rtx dst = SET_DEST (def_set);
if (GET_CODE (src) == COMPARE)
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
return convertible_comparison_p (insn, mode);
/* We are interested in "mode" only. */
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
if ((GET_MODE (src) != mode
&& !CONST_INT_P (src))
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
|| GET_MODE (dst) != mode)
return false;
if (!REG_P (dst) && !MEM_P (dst))
return false;
switch (GET_CODE (src))
{
case ASHIFTRT:
if (!TARGET_AVX512VL)
return false;
/* FALLTHRU */
case ASHIFT:
case LSHIFTRT:
if (!CONST_INT_P (XEXP (src, 1))
|| !IN_RANGE (INTVAL (XEXP (src, 1)), 0, GET_MODE_BITSIZE (mode)-1))
return false;
break;
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
case SMAX:
case SMIN:
case UMAX:
case UMIN:
if ((mode == DImode && !TARGET_AVX512VL)
|| (mode == SImode && !TARGET_SSE4_1))
return false;
/* Fallthru. */
i386: Add integer nabs instructions [PR101044] The patch adds integer nabs "(NEG (ABS (...)))" instructions, adds STV conversion and adjusts STV cost calculations accordingly. When CMOV instruction is used to implement abs, the sign is determined from the preceeding operand negation, and CMOVS is used to select between negated and non-negated value. To implement nabs, just reverse the condition and emit CMOVNS instead. The STV costs are adjusted for inherent NOT of nabs insn. V2DI NOT is somehow costly operation, since it is implemented as a load of zero, followed by a SUB insn. OTOH, integer nabs with inherent NOT is relatively cheap, so some STV chains became less profitable for conversion. The patch rewrites operand scanner in compute_convert_gain to a switch and reorders case instances in general_scalar_to_vector_candidate_p to benefit from fallthroughs, and to remove special processing of andnot in the later case. gcc/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * config/i386/i386.md (*nabs<dwi>2_doubleword): New insn_and_split pattern. (*nabs<dwi>2_1): Ditto. * config/i386/i386-features.c (general_scalar_chain::compute_convert_gain): Handle (NEG (ABS (...))) RTX. Rewrite src code scanner as switch statement. (general_scalar_chain::convert_insn): Handle (NEG (ABS (...))) RTX. (general_scalar_to_vector_candidate_p): Detect (NEG (ABS (...))) RTX. Reorder case statements for (AND (NOT (...) ...)) fallthrough. gcc/testsuite/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * gcc.target/i386/pr101044.c: New test.
2021-07-01 10:56:32 +02:00
case AND:
case IOR:
case XOR:
i386: Add integer nabs instructions [PR101044] The patch adds integer nabs "(NEG (ABS (...)))" instructions, adds STV conversion and adjusts STV cost calculations accordingly. When CMOV instruction is used to implement abs, the sign is determined from the preceeding operand negation, and CMOVS is used to select between negated and non-negated value. To implement nabs, just reverse the condition and emit CMOVNS instead. The STV costs are adjusted for inherent NOT of nabs insn. V2DI NOT is somehow costly operation, since it is implemented as a load of zero, followed by a SUB insn. OTOH, integer nabs with inherent NOT is relatively cheap, so some STV chains became less profitable for conversion. The patch rewrites operand scanner in compute_convert_gain to a switch and reorders case instances in general_scalar_to_vector_candidate_p to benefit from fallthroughs, and to remove special processing of andnot in the later case. gcc/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * config/i386/i386.md (*nabs<dwi>2_doubleword): New insn_and_split pattern. (*nabs<dwi>2_1): Ditto. * config/i386/i386-features.c (general_scalar_chain::compute_convert_gain): Handle (NEG (ABS (...))) RTX. Rewrite src code scanner as switch statement. (general_scalar_chain::convert_insn): Handle (NEG (ABS (...))) RTX. (general_scalar_to_vector_candidate_p): Detect (NEG (ABS (...))) RTX. Reorder case statements for (AND (NOT (...) ...)) fallthrough. gcc/testsuite/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * gcc.target/i386/pr101044.c: New test.
2021-07-01 10:56:32 +02:00
case PLUS:
case MINUS:
if (!REG_P (XEXP (src, 1))
&& !MEM_P (XEXP (src, 1))
&& !CONST_INT_P (XEXP (src, 1)))
return false;
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
if (GET_MODE (XEXP (src, 1)) != mode
&& !CONST_INT_P (XEXP (src, 1)))
return false;
i386: Add integer nabs instructions [PR101044] The patch adds integer nabs "(NEG (ABS (...)))" instructions, adds STV conversion and adjusts STV cost calculations accordingly. When CMOV instruction is used to implement abs, the sign is determined from the preceeding operand negation, and CMOVS is used to select between negated and non-negated value. To implement nabs, just reverse the condition and emit CMOVNS instead. The STV costs are adjusted for inherent NOT of nabs insn. V2DI NOT is somehow costly operation, since it is implemented as a load of zero, followed by a SUB insn. OTOH, integer nabs with inherent NOT is relatively cheap, so some STV chains became less profitable for conversion. The patch rewrites operand scanner in compute_convert_gain to a switch and reorders case instances in general_scalar_to_vector_candidate_p to benefit from fallthroughs, and to remove special processing of andnot in the later case. gcc/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * config/i386/i386.md (*nabs<dwi>2_doubleword): New insn_and_split pattern. (*nabs<dwi>2_1): Ditto. * config/i386/i386-features.c (general_scalar_chain::compute_convert_gain): Handle (NEG (ABS (...))) RTX. Rewrite src code scanner as switch statement. (general_scalar_chain::convert_insn): Handle (NEG (ABS (...))) RTX. (general_scalar_to_vector_candidate_p): Detect (NEG (ABS (...))) RTX. Reorder case statements for (AND (NOT (...) ...)) fallthrough. gcc/testsuite/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * gcc.target/i386/pr101044.c: New test.
2021-07-01 10:56:32 +02:00
/* Check for andnot case. */
if (GET_CODE (src) != AND
|| GET_CODE (XEXP (src, 0)) != NOT)
break;
src = XEXP (src, 0);
/* FALLTHRU */
case NOT:
i386: Optimize abs expansion [PR97873] The patch introduces absM named pattern to generate optimal insn sequence for CMOVE_TARGET targets. Currently, the expansion goes through neg+max optabs, and the following code is generated: movl %edi, %eax negl %eax cmpl %edi, %eax cmovl %edi, %eax This sequence is unoptimal in two ways. a) The compare instruction is not needed, since NEG insn sets the sign flag based on the result. The CMOV can use sign flag to select between negated and original value: movl %edi, %eax negl %eax cmovs %edi, %eax b) On some targets, CMOV is undesirable due to its performance issues. In addition to TARGET_EXPAND_ABS bypass, the patch introduces STV conversion of abs RTX to use PABS SSE insn: vmovd %edi, %xmm0 vpabsd %xmm0, %xmm0 vmovd %xmm0, %eax The patch changes compare mode of NEG instruction to CCGOCmode, which is the same mode as the mode of SUB instruction. IOW, sign bit becomes usable. Also, the mode iterator of <maxmin:code><mode>3 pattern is changed to SWI48x instead of SWI248. The purpose of maxmin expander is to prepare max/min RTX for STV to eventually convert them to SSE PMAX/PMIN instructions, in order to *avoid* CMOV insns with general registers. 2020-11-20 Uroš Bizjak <ubizjak@gmail.com> gcc/ PR target/97873 * config/i386/i386.md (*neg<mode>2_2): Rename from "*neg<mode>2_cmpz". Use CCGOCmode instead of CCZmode. (*negsi2_zext): Rename from *negsi2_cmpz_zext. Use CCGOCmode instead of CCZmode. (*neg<mode>_ccc_1): New insn pattern. (*neg<dwi>2_doubleword): Use *neg<mode>_ccc_1. (abs<mode>2): Add FLAGS_REG clobber. Use TARGET_CMOVE insn predicate. (*abs<mode>2_1): New insn_and_split pattern. (*absdi2_doubleword): Ditto. (<maxmin:code><mode>3): Use SWI48x mode iterator. (*<maxmin:code><mode>3): Use SWI48 mode iterator. * config/i386/i386-features.c (general_scalar_chain::compute_convert_gain): Handle ABS code. (general_scalar_chain::convert_insn): Ditto. (general_scalar_to_vector_candidate_p): Ditto. gcc/testsuite/ PR target/97873 * gcc.target/i386/pr97873.c: New test. * gcc.target/i386/pr97873-1.c: New test.
2020-11-20 10:26:34 +01:00
break;
i386: Add integer nabs instructions [PR101044] The patch adds integer nabs "(NEG (ABS (...)))" instructions, adds STV conversion and adjusts STV cost calculations accordingly. When CMOV instruction is used to implement abs, the sign is determined from the preceeding operand negation, and CMOVS is used to select between negated and non-negated value. To implement nabs, just reverse the condition and emit CMOVNS instead. The STV costs are adjusted for inherent NOT of nabs insn. V2DI NOT is somehow costly operation, since it is implemented as a load of zero, followed by a SUB insn. OTOH, integer nabs with inherent NOT is relatively cheap, so some STV chains became less profitable for conversion. The patch rewrites operand scanner in compute_convert_gain to a switch and reorders case instances in general_scalar_to_vector_candidate_p to benefit from fallthroughs, and to remove special processing of andnot in the later case. gcc/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * config/i386/i386.md (*nabs<dwi>2_doubleword): New insn_and_split pattern. (*nabs<dwi>2_1): Ditto. * config/i386/i386-features.c (general_scalar_chain::compute_convert_gain): Handle (NEG (ABS (...))) RTX. Rewrite src code scanner as switch statement. (general_scalar_chain::convert_insn): Handle (NEG (ABS (...))) RTX. (general_scalar_to_vector_candidate_p): Detect (NEG (ABS (...))) RTX. Reorder case statements for (AND (NOT (...) ...)) fallthrough. gcc/testsuite/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * gcc.target/i386/pr101044.c: New test.
2021-07-01 10:56:32 +02:00
case NEG:
/* Check for nabs case. */
if (GET_CODE (XEXP (src, 0)) != ABS)
break;
src = XEXP (src, 0);
/* FALLTHRU */
i386: Optimize abs expansion [PR97873] The patch introduces absM named pattern to generate optimal insn sequence for CMOVE_TARGET targets. Currently, the expansion goes through neg+max optabs, and the following code is generated: movl %edi, %eax negl %eax cmpl %edi, %eax cmovl %edi, %eax This sequence is unoptimal in two ways. a) The compare instruction is not needed, since NEG insn sets the sign flag based on the result. The CMOV can use sign flag to select between negated and original value: movl %edi, %eax negl %eax cmovs %edi, %eax b) On some targets, CMOV is undesirable due to its performance issues. In addition to TARGET_EXPAND_ABS bypass, the patch introduces STV conversion of abs RTX to use PABS SSE insn: vmovd %edi, %xmm0 vpabsd %xmm0, %xmm0 vmovd %xmm0, %eax The patch changes compare mode of NEG instruction to CCGOCmode, which is the same mode as the mode of SUB instruction. IOW, sign bit becomes usable. Also, the mode iterator of <maxmin:code><mode>3 pattern is changed to SWI48x instead of SWI248. The purpose of maxmin expander is to prepare max/min RTX for STV to eventually convert them to SSE PMAX/PMIN instructions, in order to *avoid* CMOV insns with general registers. 2020-11-20 Uroš Bizjak <ubizjak@gmail.com> gcc/ PR target/97873 * config/i386/i386.md (*neg<mode>2_2): Rename from "*neg<mode>2_cmpz". Use CCGOCmode instead of CCZmode. (*negsi2_zext): Rename from *negsi2_cmpz_zext. Use CCGOCmode instead of CCZmode. (*neg<mode>_ccc_1): New insn pattern. (*neg<dwi>2_doubleword): Use *neg<mode>_ccc_1. (abs<mode>2): Add FLAGS_REG clobber. Use TARGET_CMOVE insn predicate. (*abs<mode>2_1): New insn_and_split pattern. (*absdi2_doubleword): Ditto. (<maxmin:code><mode>3): Use SWI48x mode iterator. (*<maxmin:code><mode>3): Use SWI48 mode iterator. * config/i386/i386-features.c (general_scalar_chain::compute_convert_gain): Handle ABS code. (general_scalar_chain::convert_insn): Ditto. (general_scalar_to_vector_candidate_p): Ditto. gcc/testsuite/ PR target/97873 * gcc.target/i386/pr97873.c: New test. * gcc.target/i386/pr97873-1.c: New test.
2020-11-20 10:26:34 +01:00
case ABS:
if ((mode == DImode && !TARGET_AVX512VL)
|| (mode == SImode && !TARGET_SSSE3))
return false;
break;
case REG:
return true;
case MEM:
case CONST_INT:
return REG_P (dst);
default:
return false;
}
if (!REG_P (XEXP (src, 0))
&& !MEM_P (XEXP (src, 0))
i386: Add integer nabs instructions [PR101044] The patch adds integer nabs "(NEG (ABS (...)))" instructions, adds STV conversion and adjusts STV cost calculations accordingly. When CMOV instruction is used to implement abs, the sign is determined from the preceeding operand negation, and CMOVS is used to select between negated and non-negated value. To implement nabs, just reverse the condition and emit CMOVNS instead. The STV costs are adjusted for inherent NOT of nabs insn. V2DI NOT is somehow costly operation, since it is implemented as a load of zero, followed by a SUB insn. OTOH, integer nabs with inherent NOT is relatively cheap, so some STV chains became less profitable for conversion. The patch rewrites operand scanner in compute_convert_gain to a switch and reorders case instances in general_scalar_to_vector_candidate_p to benefit from fallthroughs, and to remove special processing of andnot in the later case. gcc/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * config/i386/i386.md (*nabs<dwi>2_doubleword): New insn_and_split pattern. (*nabs<dwi>2_1): Ditto. * config/i386/i386-features.c (general_scalar_chain::compute_convert_gain): Handle (NEG (ABS (...))) RTX. Rewrite src code scanner as switch statement. (general_scalar_chain::convert_insn): Handle (NEG (ABS (...))) RTX. (general_scalar_to_vector_candidate_p): Detect (NEG (ABS (...))) RTX. Reorder case statements for (AND (NOT (...) ...)) fallthrough. gcc/testsuite/ 2021-07-01 Uroš Bizjak <ubizjak@gmail.com> PR target/101044 * gcc.target/i386/pr101044.c: New test.
2021-07-01 10:56:32 +02:00
&& !CONST_INT_P (XEXP (src, 0)))
return false;
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
if (GET_MODE (XEXP (src, 0)) != mode
&& !CONST_INT_P (XEXP (src, 0)))
return false;
return true;
}
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
/* Check for a suitable TImode memory operand. */
static bool
timode_mem_p (rtx x)
{
return MEM_P (x)
&& (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
|| !misaligned_operand (x, TImode));
}
/* The TImode version of scalar_to_vector_candidate_p. */
static bool
timode_scalar_to_vector_candidate_p (rtx_insn *insn)
{
x86: Allow V1TI vector register pushes Add V1TI vector register push and split it after reload to a sequence of: (set (reg:P SP_REG) (plus:P SP_REG) (const_int -8))) (set (match_dup 0) (match_dup 1)) so that STV pass can convert TI mode integer push to V1TI vector register push. Rename has_non_address_hard_reg to pseudo_reg_set, combine calls of single_set and has_non_address_hard_reg to pseudo_reg_set, to ignore pseudo register push. Remove c-c++-common/dfp/func-vararg-mixed-2.c since it is compiled with -mpreferred-stack-boundary=2 and leads to segfault: Dump of assembler code for function __bid_nesd2: 0x08049210 <+0>: endbr32 0x08049214 <+4>: push %esi 0x08049215 <+5>: push %ebx 0x08049216 <+6>: call 0x8049130 <__x86.get_pc_thunk.bx> 0x0804921b <+11>: add $0x8de5,%ebx 0x08049221 <+17>: sub $0x20,%esp 0x08049224 <+20>: mov 0x30(%esp),%esi 0x08049228 <+24>: pushl 0x2c(%esp) 0x0804922c <+28>: call 0x804e600 <__bid32_to_bid64> 0x08049231 <+33>: mov %esi,(%esp) 0x08049234 <+36>: movd %edx,%xmm1 0x08049238 <+40>: movd %eax,%xmm0 0x0804923c <+44>: punpckldq %xmm1,%xmm0 => 0x08049240 <+48>: movaps %xmm0,0x10(%esp) 0x08049245 <+53>: call 0x804e600 <__bid32_to_bid64> 0x0804924a <+58>: push %edx 0x0804924b <+59>: push %eax 0x0804924c <+60>: pushl 0x1c(%esp) 0x08049250 <+64>: pushl 0x1c(%esp) 0x08049254 <+68>: call 0x804b260 <__bid64_quiet_not_equal> 0x08049259 <+73>: add $0x34,%esp 0x0804925c <+76>: pop %ebx 0x0804925d <+77>: pop %esi 0x0804925e <+78>: ret when libgcc is compiled with -msse2. According to GCC manual: '-mpreferred-stack-boundary=NUM' Attempt to keep the stack boundary aligned to a 2 raised to NUM byte boundary. If '-mpreferred-stack-boundary' is not specified, the default is 4 (16 bytes or 128-bits). *Warning:* If you use this switch, then you must build all modules with the same value, including any libraries. This includes the system libraries and startup modules. c-c++-common/dfp/func-vararg-mixed-2.c, which was added by commit 3b2488ca6ece182f2136a20ee5fa0bb92f935b0f Author: H.J. Lu <hongjiu.lu@intel.com> Date: Wed Jul 30 19:24:02 2008 +0000 func-vararg-alternate-d128-2.c: New. 2008-07-30 H.J. Lu <hongjiu.lu@intel.com> Joey Ye <joey.ye@intel.com> * gcc.dg/dfp/func-vararg-alternate-d128-2.c: New. * gcc.dg/dfp/func-vararg-mixed-2.c: Likewise. isn't expected to work with libgcc. gcc/ PR target/95021 * config/i386/i386-features.c (has_non_address_hard_reg): Renamed to ... (pseudo_reg_set): This. Return the SET expression. Ignore pseudo register push. (general_scalar_to_vector_candidate_p): Combine single_set and has_non_address_hard_reg calls to pseudo_reg_set. (timode_scalar_to_vector_candidate_p): Likewise. * config/i386/i386.md (*pushv1ti2): New pattern. gcc/testsuite/ PR target/95021 * c-c++-common/dfp/func-vararg-mixed-2.c: Removed. * gcc.target/i386/pr95021-1.c: New test. * gcc.target/i386/pr95021-2.c: Likewise. * gcc.target/i386/pr95021-3.c: Likewise. * gcc.target/i386/pr95021-4.c: Likewise. * gcc.target/i386/pr95021-5.c: Likewise.
2020-05-17 19:10:34 +02:00
rtx def_set = pseudo_reg_set (insn);
if (!def_set)
return false;
rtx src = SET_SRC (def_set);
rtx dst = SET_DEST (def_set);
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
if (GET_CODE (src) == COMPARE)
return convertible_comparison_p (insn, TImode);
if (GET_MODE (dst) != TImode
|| (GET_MODE (src) != TImode
PR target/106278: Keep REG_EQUAL notes consistent during TImode STV on x86_64. This patch resolves PR target/106278 a regression on x86_64 caused by my recent TImode STV improvements. Now that TImode STV can handle comparisons such as "(set (regs:CC) (compare:CC (reg:TI) ...))" the convert_insn method sensibly checks that the mode of the SET_DEST is TImode before setting it to V1TImode [to avoid V1TImode appearing on the hard reg CC_FLAGS. Hence the current code looks like: if (GET_MODE (dst) == TImode) { tmp = find_reg_equal_equiv_note (insn); if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode) PUT_MODE (XEXP (tmp, 0), V1TImode); PUT_MODE (dst, V1TImode); fix_debug_reg_uses (dst); } break; which checks GET_MODE (dst) before calling PUT_MODE, and when a change is made updating the REG_EQUAL_NOTE tmp if it exists. The logical flaw (oversight) is that due to RTL sharing, the destination of this set may already have been updated to V1TImode, as this chain is being converted, but we still need to update any REG_EQUAL_NOTE that still has TImode. Hence the correct code is actually: if (GET_MODE (dst) == TImode) { PUT_MODE (dst, V1TImode); fix_debug_reg_uses (dst); } if (GET_MODE (dst) == V1TImode) { tmp = find_reg_equal_equiv_note (insn); if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode) PUT_MODE (XEXP (tmp, 0), V1TImode); } break; While fixing this behavior, I noticed I had some indentation whitespace issues and some vestigial dead code in this function/method that I've taken the liberty of cleaning up (as obvious) in this patch. 2022-07-15 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/106278 * config/i386/i386-features.cc (general_scalar_chain::convert_insn): Fix indentation whitespace. (timode_scalar_chain::fix_debug_reg_uses): Likewise. (timode_scalar_chain::convert_insn): Delete dead code. Update TImode REG_EQUAL_NOTE even if the SET_DEST is already V1TI. Fix indentation whitespace. (convertible_comparison_p): Likewise. (timode_scalar_to_vector_candidate_p): Likewise. gcc/testsuite/ChangeLog * gcc.dg/pr106278.c: New test case.
2022-07-15 15:39:28 +02:00
&& !CONST_SCALAR_INT_P (src)))
return false;
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
if (!REG_P (dst) && !MEM_P (dst))
return false;
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
if (MEM_P (dst)
&& misaligned_operand (dst, TImode)
&& !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
return false;
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
switch (GET_CODE (src))
{
case REG:
case CONST_WIDE_INT:
return true;
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
case CONST_INT:
/* ??? Verify performance impact before enabling CONST_INT for
__int128 store. */
return standard_sse_constant_p (src, TImode);
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
case MEM:
/* Memory must be aligned or unaligned load is optimal. */
return (REG_P (dst)
&& (!misaligned_operand (src, TImode)
|| TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
case AND:
if (!MEM_P (dst)
&& GET_CODE (XEXP (src, 0)) == NOT
&& REG_P (XEXP (XEXP (src, 0), 0))
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64. This x86_64 backend patch allows TImode STV to take advantage of the fact that the PTEST instruction performs an AND operation. Previously PTEST was (mostly) used for comparison against zero, by using the same operands. The benefits are demonstrated by the new test case: __int128 a,b; int foo() { return (a & b) != 0; } Currently with -O2 -msse4 we generate: movdqa a(%rip), %xmm0 pand b(%rip), %xmm0 xorl %eax, %eax ptest %xmm0, %xmm0 setne %al ret with this patch we now generate: movdqa a(%rip), %xmm0 xorl %eax, %eax ptest b(%rip), %xmm0 setne %al ret Technically, the magic happens using new define_insn_and_split patterns. Using two patterns allows this transformation to performed independently of whether TImode STV is run before or after combine. The one tricky case is that immediate constant operands of the AND behave slightly differently between TImode and V1TImode: All V1TImode immediate operands becomes loads, but for TImode only values that are not hilo_operands need to be loaded. Hence the new *testti_doubleword accepts any general_operand, but internally during split calls force_reg whenever the second operand is not x86_64_hilo_general_operand. This required (benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P. 2022-08-09 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.cc (scalar_chain::convert_compare): Create new pseudos only when/if needed. Add support for TEST, i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST. When broadcasting V2DImode and V4SImode use new pseudo register. (timode_scalar_chain::convert_op): Do nothing if operand is already V1TImode. Avoid generating useless SUBREG conversions, i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT in addition to CONST_INT by using CONST_SCALAR_INT_P. (convertible_comparison_p): Use CONST_SCALAR_INT_P to match both CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword pattern as an STV candidate. (timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P operands in binary logic operations. * config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs for UNSPEC_PTEST; a PTEST that performs an AND has the same cost as regular PTEST, i.e. cost->sse_op. * config/i386/i386.md (*testti_doubleword): New pre-reload define_insn_and_split that recognizes comparison of TI mode AND against zero. * config/i386/sse.md (*ptest<mode>_and): New pre-reload define_insn_and_split that recognizes UNSPEC_PTEST of identical AND operands. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
&& (REG_P (XEXP (src, 1))
|| CONST_SCALAR_INT_P (XEXP (src, 1))
|| timode_mem_p (XEXP (src, 1))))
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
return true;
return REG_P (XEXP (src, 0))
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64. This x86_64 backend patch allows TImode STV to take advantage of the fact that the PTEST instruction performs an AND operation. Previously PTEST was (mostly) used for comparison against zero, by using the same operands. The benefits are demonstrated by the new test case: __int128 a,b; int foo() { return (a & b) != 0; } Currently with -O2 -msse4 we generate: movdqa a(%rip), %xmm0 pand b(%rip), %xmm0 xorl %eax, %eax ptest %xmm0, %xmm0 setne %al ret with this patch we now generate: movdqa a(%rip), %xmm0 xorl %eax, %eax ptest b(%rip), %xmm0 setne %al ret Technically, the magic happens using new define_insn_and_split patterns. Using two patterns allows this transformation to performed independently of whether TImode STV is run before or after combine. The one tricky case is that immediate constant operands of the AND behave slightly differently between TImode and V1TImode: All V1TImode immediate operands becomes loads, but for TImode only values that are not hilo_operands need to be loaded. Hence the new *testti_doubleword accepts any general_operand, but internally during split calls force_reg whenever the second operand is not x86_64_hilo_general_operand. This required (benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P. 2022-08-09 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.cc (scalar_chain::convert_compare): Create new pseudos only when/if needed. Add support for TEST, i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST. When broadcasting V2DImode and V4SImode use new pseudo register. (timode_scalar_chain::convert_op): Do nothing if operand is already V1TImode. Avoid generating useless SUBREG conversions, i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT in addition to CONST_INT by using CONST_SCALAR_INT_P. (convertible_comparison_p): Use CONST_SCALAR_INT_P to match both CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword pattern as an STV candidate. (timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P operands in binary logic operations. * config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs for UNSPEC_PTEST; a PTEST that performs an AND has the same cost as regular PTEST, i.e. cost->sse_op. * config/i386/i386.md (*testti_doubleword): New pre-reload define_insn_and_split that recognizes comparison of TI mode AND against zero. * config/i386/sse.md (*ptest<mode>_and): New pre-reload define_insn_and_split that recognizes UNSPEC_PTEST of identical AND operands. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
&& (REG_P (XEXP (src, 1))
|| CONST_SCALAR_INT_P (XEXP (src, 1))
|| timode_mem_p (XEXP (src, 1)));
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
case IOR:
case XOR:
return REG_P (XEXP (src, 0))
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64. This x86_64 backend patch allows TImode STV to take advantage of the fact that the PTEST instruction performs an AND operation. Previously PTEST was (mostly) used for comparison against zero, by using the same operands. The benefits are demonstrated by the new test case: __int128 a,b; int foo() { return (a & b) != 0; } Currently with -O2 -msse4 we generate: movdqa a(%rip), %xmm0 pand b(%rip), %xmm0 xorl %eax, %eax ptest %xmm0, %xmm0 setne %al ret with this patch we now generate: movdqa a(%rip), %xmm0 xorl %eax, %eax ptest b(%rip), %xmm0 setne %al ret Technically, the magic happens using new define_insn_and_split patterns. Using two patterns allows this transformation to performed independently of whether TImode STV is run before or after combine. The one tricky case is that immediate constant operands of the AND behave slightly differently between TImode and V1TImode: All V1TImode immediate operands becomes loads, but for TImode only values that are not hilo_operands need to be loaded. Hence the new *testti_doubleword accepts any general_operand, but internally during split calls force_reg whenever the second operand is not x86_64_hilo_general_operand. This required (benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P. 2022-08-09 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.cc (scalar_chain::convert_compare): Create new pseudos only when/if needed. Add support for TEST, i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST. When broadcasting V2DImode and V4SImode use new pseudo register. (timode_scalar_chain::convert_op): Do nothing if operand is already V1TImode. Avoid generating useless SUBREG conversions, i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT in addition to CONST_INT by using CONST_SCALAR_INT_P. (convertible_comparison_p): Use CONST_SCALAR_INT_P to match both CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword pattern as an STV candidate. (timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P operands in binary logic operations. * config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs for UNSPEC_PTEST; a PTEST that performs an AND has the same cost as regular PTEST, i.e. cost->sse_op. * config/i386/i386.md (*testti_doubleword): New pre-reload define_insn_and_split that recognizes comparison of TI mode AND against zero. * config/i386/sse.md (*ptest<mode>_and): New pre-reload define_insn_and_split that recognizes UNSPEC_PTEST of identical AND operands. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
&& (REG_P (XEXP (src, 1))
|| CONST_SCALAR_INT_P (XEXP (src, 1))
|| timode_mem_p (XEXP (src, 1)));
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
case NOT:
return REG_P (XEXP (src, 0)) || timode_mem_p (XEXP (src, 0));
case ASHIFT:
case LSHIFTRT:
/* Handle logical shifts by integer constants between 0 and 120
that are multiples of 8. */
return REG_P (XEXP (src, 0))
&& CONST_INT_P (XEXP (src, 1))
&& (INTVAL (XEXP (src, 1)) & ~0x78) == 0;
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64. This patch upgrades x86_64's scalar-to-vector (STV) pass to more aggressively transform 128-bit scalar TImode operations into vector V1TImode operations performed on SSE registers. TImode functionality already exists in STV, but only for move operations. This change brings support for logical operations (AND, IOR, XOR, NOT and ANDN) and comparisons. The effect of these changes are conveniently demonstrated by the new sse4_1-stv-5.c test case: __int128 a[16]; __int128 b[16]; __int128 c[16]; void foo() { for (unsigned int i=0; i<16; i++) a[i] = b[i] & ~c[i]; } which when currently compiled on mainline wtih -O2 -msse4 produces: foo: xorl %eax, %eax .L2: movq c(%rax), %rsi movq c+8(%rax), %rdi addq $16, %rax notq %rsi notq %rdi andq b-16(%rax), %rsi andq b-8(%rax), %rdi movq %rsi, a-16(%rax) movq %rdi, a-8(%rax) cmpq $256, %rax jne .L2 ret but with this patch now produces: foo: xorl %eax, %eax .L2: movdqa c(%rax), %xmm0 pandn b(%rax), %xmm0 addq $16, %rax movaps %xmm0, a-16(%rax) cmpq $256, %rax jne .L2 ret Technically, the STV pass is implemented by three C++ classes, a common abstract base class "scalar_chain" that contains common functionality, and two derived classes: general_scalar_chain (which handles SI and DI modes) and timode_scalar_chain (which handles TI modes). As mentioned previously, because only TI mode moves were handled the two worker classes behaved significantly differently. These changes bring the functionality of these two classes closer together, which is reflected by refactoring more shared code from general_scalar_chain to the parent scalar_chain and reusing it from timode_scalar_chain. There still remain significant differences (and simplifications) so the existing division of classes (as specializations) continues to make sense. 2022-07-11 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-features.h (scalar_chain): Add fields insns_conv, n_sse_to_integer and n_integer_to_sse to this parent class, moved from general_scalar_chain. (scalar_chain::convert_compare): Protected method moved from general_scalar_chain. (mark_dual_mode_def): Make protected, not private virtual. (scalar_chain:convert_op): New private virtual method. (general_scalar_chain::general_scalar_chain): Simplify constructor. (general_scalar_chain::~general_scalar_chain): Delete destructor. (general_scalar_chain): Move insns_conv, n_sse_to_integer and n_integer_to_sse fields to parent class, scalar_chain. (general_scalar_chain::mark_dual_mode_def): Delete prototype. (general_scalar_chain::convert_compare): Delete prototype. (timode_scalar_chain::compute_convert_gain): Remove simplistic implementation, convert to a method prototype. (timode_scalar_chain::mark_dual_mode_def): Delete prototype. (timode_scalar_chain::convert_op): Prototype new virtual method. * config/i386/i386-features.cc (scalar_chain::scalar_chain): Allocate insns_conv and initialize n_sse_to_integer and n_integer_to_sse fields in constructor. (scalar_chain::scalar_chain): Free insns_conv in destructor. (general_scalar_chain::general_scalar_chain): Delete constructor, now defined in the class declaration. (general_scalar_chain::~general_scalar_chain): Delete destructor. (scalar_chain::mark_dual_mode_def): Renamed from general_scalar_chain::mark_dual_mode_def. (timode_scalar_chain::mark_dual_mode_def): Delete. (scalar_chain::convert_compare): Renamed from general_scalar_chain::convert_compare. (timode_scalar_chain::compute_convert_gain): New method to determine the gain from converting a TImode chain to V1TImode. (timode_scalar_chain::convert_op): New method to convert an operand from TImode to V1TImode. (timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE on REG_EQUAL notes that were originally TImode (not CONST_INT). Handle AND, ANDN, XOR, IOR, NOT and COMPARE. (timode_mem_p): Helper predicate to check where operand is memory reference with sufficient alignment for TImode STV. (timode_scalar_to_vector_candidate_p): Use convertible_comparison_p to check whether COMPARE is convertible. Handle SET_DESTs that that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT, CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT. gcc/testsuite/ChangeLog * gcc.target/i386/sse4_1-stv-2.c: New test case, pand. * gcc.target/i386/sse4_1-stv-3.c: New test case, por. * gcc.target/i386/sse4_1-stv-4.c: New test case, pxor. * gcc.target/i386/sse4_1-stv-5.c: New test case, pandn. * gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
default:
return false;
}
}
/* For a register REGNO, scan instructions for its defs and uses.
Put REGNO in REGS if a def or use isn't in CANDIDATES. */
static void
timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
unsigned int regno)
{
PR target/106450: Tweak timode_remove_non_convertible_regs on x86_64. This patch resolves PR target/106450, some more fall-out from more aggressive TImode scalar-to-vector (STV) optimizations. I continue to be caught out by how far TImode STV has diverged from DImode/SImode STV, and therefore requires additional (unexpected) tweaking. Many thanks to H.J. Lu for pointing out timode_remove_non_convertible_regs needs to be extended to handle XOR (and other new operations). Unhelpfully the comment above this function states that it's the TImode version of "remove_non_convertible_regs", which doesn't exist anymore, so I've resurrected an explanatory comment from the git history. By refactoring the checks for hard regs and already "marked" regs into timode_check_non_convertible_regs itself, all of its callers are simplified. This patch then FOR_EACH_INSN_USE and FOR_EACH_INSN_DEF to generically handle arbitrary (non-move) instructions (including unary and binary operations), calling timode_check_non_convertible_regs on each TImode register USE and DEF. 2022-07-31 Roger Sayle <roger@nextmovesoftware.com> H.J. Lu <hjl.tools@gmail.com> gcc/ChangeLog PR target/106450 * config/i386/i386-features.cc (timode_check_non_convertible_regs): Do nothing if REGNO is set in the REGS bitmap, or is a hard reg. (timode_remove_non_convertible_regs): Update comment. Call timode_check_non_convertible_reg on all TImode register DEFs and USEs in each instruction. gcc/testsuite/ChangeLog PR target/106450 * gcc.target/i386/pr106450.c: New test case.
2022-07-31 22:44:51 +02:00
/* Do nothing if REGNO is already in REGS or is a hard reg. */
if (bitmap_bit_p (regs, regno)
|| HARD_REGISTER_NUM_P (regno))
return;
for (df_ref def = DF_REG_DEF_CHAIN (regno);
def;
def = DF_REF_NEXT_REG (def))
{
if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
{
if (dump_file)
fprintf (dump_file,
"r%d has non convertible def in insn %d\n",
regno, DF_REF_INSN_UID (def));
bitmap_set_bit (regs, regno);
break;
}
}
for (df_ref ref = DF_REG_USE_CHAIN (regno);
ref;
ref = DF_REF_NEXT_REG (ref))
{
/* Debug instructions are skipped. */
if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
&& !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
{
if (dump_file)
fprintf (dump_file,
"r%d has non convertible use in insn %d\n",
regno, DF_REF_INSN_UID (ref));
bitmap_set_bit (regs, regno);
break;
}
}
}
PR target/106450: Tweak timode_remove_non_convertible_regs on x86_64. This patch resolves PR target/106450, some more fall-out from more aggressive TImode scalar-to-vector (STV) optimizations. I continue to be caught out by how far TImode STV has diverged from DImode/SImode STV, and therefore requires additional (unexpected) tweaking. Many thanks to H.J. Lu for pointing out timode_remove_non_convertible_regs needs to be extended to handle XOR (and other new operations). Unhelpfully the comment above this function states that it's the TImode version of "remove_non_convertible_regs", which doesn't exist anymore, so I've resurrected an explanatory comment from the git history. By refactoring the checks for hard regs and already "marked" regs into timode_check_non_convertible_regs itself, all of its callers are simplified. This patch then FOR_EACH_INSN_USE and FOR_EACH_INSN_DEF to generically handle arbitrary (non-move) instructions (including unary and binary operations), calling timode_check_non_convertible_regs on each TImode register USE and DEF. 2022-07-31 Roger Sayle <roger@nextmovesoftware.com> H.J. Lu <hjl.tools@gmail.com> gcc/ChangeLog PR target/106450 * config/i386/i386-features.cc (timode_check_non_convertible_regs): Do nothing if REGNO is set in the REGS bitmap, or is a hard reg. (timode_remove_non_convertible_regs): Update comment. Call timode_check_non_convertible_reg on all TImode register DEFs and USEs in each instruction. gcc/testsuite/ChangeLog PR target/106450 * gcc.target/i386/pr106450.c: New test case.
2022-07-31 22:44:51 +02:00
/* For a given bitmap of insn UIDs scans all instructions and
remove insn from CANDIDATES in case it has both convertible
and not convertible definitions.
All insns in a bitmap are conversion candidates according to
scalar_to_vector_candidate_p. Currently it implies all insns
are single_set. */
static void
timode_remove_non_convertible_regs (bitmap candidates)
{
bitmap_iterator bi;
unsigned id;
bitmap regs = BITMAP_ALLOC (NULL);
PR target/106303: Fix TImode STV related failures on x86. This patch resolves PR target/106303 (and the related PRs 106347, 106404, 106407) which are ICEs caused by my improvements to x86_64's 128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies for the breakage. The issue is that data flow analysis is used to partition usage of each TImode pseudo into "chains", where each chain is analyzed and if suitable converted to vector operations. The problems appears when some chains for a pseudo are converted, and others aren't as RTL sharing can result in some mode changes leaking into other instructions that aren't/shouldn't/can't be converted, which eventually leads to an ICE for mismatched modes. My first approach to a fix was to unify more of the STV infrastructure, reasoning that if TImode STV was exhibiting these problems, but DImode and SImode STV weren't, the issue was likely to be caused/resolved by these remaining differences. This appeared to fix some but not all of the reported PRs. A better solution was then proposed by H.J. Lu in Bugzilla, that we need to iterate the removal of candidates in the function timode_remove_non_convertible_regs until there are no further changes. As each chain is removed from consideration, it in turn may affect whether other insns/chains can safely be converted. 2022-07-24 Roger Sayle <roger@nextmovesoftware.com> H.J. Lu <hjl.tools@gmail.com> gcc/ChangeLog PR target/106303 PR target/106347 * config/i386/i386-features.cc (make_vector_copies): Move from general_scalar_chain to scalar_chain. (convert_reg): Likewise. (convert_insn_common): New scalar_chain method split out from general_scalar_chain convert_insn. (convert_registers): Move from general_scalar_chain to scalar_chain. (scalar_chain::convert): Call convert_insn_common before calling convert_insn. (timode_remove_non_convertible_regs): Iterate until there are no further changes to the candidates. * config/i386/i386-features.h (scalar_chain::hash_map): Move from general_scalar_chain. (scalar_chain::convert_reg): Likewise. (scalar_chain::convert_insn_common): New shared method. (scalar_chain::make_vector_copies): Move from general_scalar_chain. (scalar_chain::convert_registers): Likewise. No longer virtual. (general_scalar_chain::hash_map): Delete. Moved to scalar_chain. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::make_vector_copies): Likewise. (general_scalar_chain::convert_registers): Delete virtual method. (timode_scalar_chain::convert_registers): Likewise. gcc/testsuite/ChangeLog PR target/106303 PR target/106347 * gcc.target/i386/pr106303.c: New test case. * gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
bool changed;
PR target/106303: Fix TImode STV related failures on x86. This patch resolves PR target/106303 (and the related PRs 106347, 106404, 106407) which are ICEs caused by my improvements to x86_64's 128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies for the breakage. The issue is that data flow analysis is used to partition usage of each TImode pseudo into "chains", where each chain is analyzed and if suitable converted to vector operations. The problems appears when some chains for a pseudo are converted, and others aren't as RTL sharing can result in some mode changes leaking into other instructions that aren't/shouldn't/can't be converted, which eventually leads to an ICE for mismatched modes. My first approach to a fix was to unify more of the STV infrastructure, reasoning that if TImode STV was exhibiting these problems, but DImode and SImode STV weren't, the issue was likely to be caused/resolved by these remaining differences. This appeared to fix some but not all of the reported PRs. A better solution was then proposed by H.J. Lu in Bugzilla, that we need to iterate the removal of candidates in the function timode_remove_non_convertible_regs until there are no further changes. As each chain is removed from consideration, it in turn may affect whether other insns/chains can safely be converted. 2022-07-24 Roger Sayle <roger@nextmovesoftware.com> H.J. Lu <hjl.tools@gmail.com> gcc/ChangeLog PR target/106303 PR target/106347 * config/i386/i386-features.cc (make_vector_copies): Move from general_scalar_chain to scalar_chain. (convert_reg): Likewise. (convert_insn_common): New scalar_chain method split out from general_scalar_chain convert_insn. (convert_registers): Move from general_scalar_chain to scalar_chain. (scalar_chain::convert): Call convert_insn_common before calling convert_insn. (timode_remove_non_convertible_regs): Iterate until there are no further changes to the candidates. * config/i386/i386-features.h (scalar_chain::hash_map): Move from general_scalar_chain. (scalar_chain::convert_reg): Likewise. (scalar_chain::convert_insn_common): New shared method. (scalar_chain::make_vector_copies): Move from general_scalar_chain. (scalar_chain::convert_registers): Likewise. No longer virtual. (general_scalar_chain::hash_map): Delete. Moved to scalar_chain. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::make_vector_copies): Likewise. (general_scalar_chain::convert_registers): Delete virtual method. (timode_scalar_chain::convert_registers): Likewise. gcc/testsuite/ChangeLog PR target/106303 PR target/106347 * gcc.target/i386/pr106303.c: New test case. * gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
do {
changed = false;
EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
{
PR target/106450: Tweak timode_remove_non_convertible_regs on x86_64. This patch resolves PR target/106450, some more fall-out from more aggressive TImode scalar-to-vector (STV) optimizations. I continue to be caught out by how far TImode STV has diverged from DImode/SImode STV, and therefore requires additional (unexpected) tweaking. Many thanks to H.J. Lu for pointing out timode_remove_non_convertible_regs needs to be extended to handle XOR (and other new operations). Unhelpfully the comment above this function states that it's the TImode version of "remove_non_convertible_regs", which doesn't exist anymore, so I've resurrected an explanatory comment from the git history. By refactoring the checks for hard regs and already "marked" regs into timode_check_non_convertible_regs itself, all of its callers are simplified. This patch then FOR_EACH_INSN_USE and FOR_EACH_INSN_DEF to generically handle arbitrary (non-move) instructions (including unary and binary operations), calling timode_check_non_convertible_regs on each TImode register USE and DEF. 2022-07-31 Roger Sayle <roger@nextmovesoftware.com> H.J. Lu <hjl.tools@gmail.com> gcc/ChangeLog PR target/106450 * config/i386/i386-features.cc (timode_check_non_convertible_regs): Do nothing if REGNO is set in the REGS bitmap, or is a hard reg. (timode_remove_non_convertible_regs): Update comment. Call timode_check_non_convertible_reg on all TImode register DEFs and USEs in each instruction. gcc/testsuite/ChangeLog PR target/106450 * gcc.target/i386/pr106450.c: New test case.
2022-07-31 22:44:51 +02:00
rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
df_ref ref;
FOR_EACH_INSN_DEF (ref, insn)
if (!DF_REF_REG_MEM_P (ref)
&& GET_MODE (DF_REF_REG (ref)) == TImode)
timode_check_non_convertible_regs (candidates, regs,
DF_REF_REGNO (ref));
FOR_EACH_INSN_USE (ref, insn)
if (!DF_REF_REG_MEM_P (ref)
&& GET_MODE (DF_REF_REG (ref)) == TImode)
timode_check_non_convertible_regs (candidates, regs,
DF_REF_REGNO (ref));
PR target/106303: Fix TImode STV related failures on x86. This patch resolves PR target/106303 (and the related PRs 106347, 106404, 106407) which are ICEs caused by my improvements to x86_64's 128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies for the breakage. The issue is that data flow analysis is used to partition usage of each TImode pseudo into "chains", where each chain is analyzed and if suitable converted to vector operations. The problems appears when some chains for a pseudo are converted, and others aren't as RTL sharing can result in some mode changes leaking into other instructions that aren't/shouldn't/can't be converted, which eventually leads to an ICE for mismatched modes. My first approach to a fix was to unify more of the STV infrastructure, reasoning that if TImode STV was exhibiting these problems, but DImode and SImode STV weren't, the issue was likely to be caused/resolved by these remaining differences. This appeared to fix some but not all of the reported PRs. A better solution was then proposed by H.J. Lu in Bugzilla, that we need to iterate the removal of candidates in the function timode_remove_non_convertible_regs until there are no further changes. As each chain is removed from consideration, it in turn may affect whether other insns/chains can safely be converted. 2022-07-24 Roger Sayle <roger@nextmovesoftware.com> H.J. Lu <hjl.tools@gmail.com> gcc/ChangeLog PR target/106303 PR target/106347 * config/i386/i386-features.cc (make_vector_copies): Move from general_scalar_chain to scalar_chain. (convert_reg): Likewise. (convert_insn_common): New scalar_chain method split out from general_scalar_chain convert_insn. (convert_registers): Move from general_scalar_chain to scalar_chain. (scalar_chain::convert): Call convert_insn_common before calling convert_insn. (timode_remove_non_convertible_regs): Iterate until there are no further changes to the candidates. * config/i386/i386-features.h (scalar_chain::hash_map): Move from general_scalar_chain. (scalar_chain::convert_reg): Likewise. (scalar_chain::convert_insn_common): New shared method. (scalar_chain::make_vector_copies): Move from general_scalar_chain. (scalar_chain::convert_registers): Likewise. No longer virtual. (general_scalar_chain::hash_map): Delete. Moved to scalar_chain. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::make_vector_copies): Likewise. (general_scalar_chain::convert_registers): Delete virtual method. (timode_scalar_chain::convert_registers): Likewise. gcc/testsuite/ChangeLog PR target/106303 PR target/106347 * gcc.target/i386/pr106303.c: New test case. * gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
}
PR target/106303: Fix TImode STV related failures on x86. This patch resolves PR target/106303 (and the related PRs 106347, 106404, 106407) which are ICEs caused by my improvements to x86_64's 128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies for the breakage. The issue is that data flow analysis is used to partition usage of each TImode pseudo into "chains", where each chain is analyzed and if suitable converted to vector operations. The problems appears when some chains for a pseudo are converted, and others aren't as RTL sharing can result in some mode changes leaking into other instructions that aren't/shouldn't/can't be converted, which eventually leads to an ICE for mismatched modes. My first approach to a fix was to unify more of the STV infrastructure, reasoning that if TImode STV was exhibiting these problems, but DImode and SImode STV weren't, the issue was likely to be caused/resolved by these remaining differences. This appeared to fix some but not all of the reported PRs. A better solution was then proposed by H.J. Lu in Bugzilla, that we need to iterate the removal of candidates in the function timode_remove_non_convertible_regs until there are no further changes. As each chain is removed from consideration, it in turn may affect whether other insns/chains can safely be converted. 2022-07-24 Roger Sayle <roger@nextmovesoftware.com> H.J. Lu <hjl.tools@gmail.com> gcc/ChangeLog PR target/106303 PR target/106347 * config/i386/i386-features.cc (make_vector_copies): Move from general_scalar_chain to scalar_chain. (convert_reg): Likewise. (convert_insn_common): New scalar_chain method split out from general_scalar_chain convert_insn. (convert_registers): Move from general_scalar_chain to scalar_chain. (scalar_chain::convert): Call convert_insn_common before calling convert_insn. (timode_remove_non_convertible_regs): Iterate until there are no further changes to the candidates. * config/i386/i386-features.h (scalar_chain::hash_map): Move from general_scalar_chain. (scalar_chain::convert_reg): Likewise. (scalar_chain::convert_insn_common): New shared method. (scalar_chain::make_vector_copies): Move from general_scalar_chain. (scalar_chain::convert_registers): Likewise. No longer virtual. (general_scalar_chain::hash_map): Delete. Moved to scalar_chain. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::make_vector_copies): Likewise. (general_scalar_chain::convert_registers): Delete virtual method. (timode_scalar_chain::convert_registers): Likewise. gcc/testsuite/ChangeLog PR target/106303 PR target/106347 * gcc.target/i386/pr106303.c: New test case. * gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
{
for (df_ref def = DF_REG_DEF_CHAIN (id);
def;
def = DF_REF_NEXT_REG (def))
if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
{
if (dump_file)
fprintf (dump_file, "Removing insn %d from candidates list\n",
DF_REF_INSN_UID (def));
PR target/106303: Fix TImode STV related failures on x86. This patch resolves PR target/106303 (and the related PRs 106347, 106404, 106407) which are ICEs caused by my improvements to x86_64's 128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies for the breakage. The issue is that data flow analysis is used to partition usage of each TImode pseudo into "chains", where each chain is analyzed and if suitable converted to vector operations. The problems appears when some chains for a pseudo are converted, and others aren't as RTL sharing can result in some mode changes leaking into other instructions that aren't/shouldn't/can't be converted, which eventually leads to an ICE for mismatched modes. My first approach to a fix was to unify more of the STV infrastructure, reasoning that if TImode STV was exhibiting these problems, but DImode and SImode STV weren't, the issue was likely to be caused/resolved by these remaining differences. This appeared to fix some but not all of the reported PRs. A better solution was then proposed by H.J. Lu in Bugzilla, that we need to iterate the removal of candidates in the function timode_remove_non_convertible_regs until there are no further changes. As each chain is removed from consideration, it in turn may affect whether other insns/chains can safely be converted. 2022-07-24 Roger Sayle <roger@nextmovesoftware.com> H.J. Lu <hjl.tools@gmail.com> gcc/ChangeLog PR target/106303 PR target/106347 * config/i386/i386-features.cc (make_vector_copies): Move from general_scalar_chain to scalar_chain. (convert_reg): Likewise. (convert_insn_common): New scalar_chain method split out from general_scalar_chain convert_insn. (convert_registers): Move from general_scalar_chain to scalar_chain. (scalar_chain::convert): Call convert_insn_common before calling convert_insn. (timode_remove_non_convertible_regs): Iterate until there are no further changes to the candidates. * config/i386/i386-features.h (scalar_chain::hash_map): Move from general_scalar_chain. (scalar_chain::convert_reg): Likewise. (scalar_chain::convert_insn_common): New shared method. (scalar_chain::make_vector_copies): Move from general_scalar_chain. (scalar_chain::convert_registers): Likewise. No longer virtual. (general_scalar_chain::hash_map): Delete. Moved to scalar_chain. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::make_vector_copies): Likewise. (general_scalar_chain::convert_registers): Delete virtual method. (timode_scalar_chain::convert_registers): Likewise. gcc/testsuite/ChangeLog PR target/106303 PR target/106347 * gcc.target/i386/pr106303.c: New test case. * gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
changed = true;
}
PR target/106303: Fix TImode STV related failures on x86. This patch resolves PR target/106303 (and the related PRs 106347, 106404, 106407) which are ICEs caused by my improvements to x86_64's 128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies for the breakage. The issue is that data flow analysis is used to partition usage of each TImode pseudo into "chains", where each chain is analyzed and if suitable converted to vector operations. The problems appears when some chains for a pseudo are converted, and others aren't as RTL sharing can result in some mode changes leaking into other instructions that aren't/shouldn't/can't be converted, which eventually leads to an ICE for mismatched modes. My first approach to a fix was to unify more of the STV infrastructure, reasoning that if TImode STV was exhibiting these problems, but DImode and SImode STV weren't, the issue was likely to be caused/resolved by these remaining differences. This appeared to fix some but not all of the reported PRs. A better solution was then proposed by H.J. Lu in Bugzilla, that we need to iterate the removal of candidates in the function timode_remove_non_convertible_regs until there are no further changes. As each chain is removed from consideration, it in turn may affect whether other insns/chains can safely be converted. 2022-07-24 Roger Sayle <roger@nextmovesoftware.com> H.J. Lu <hjl.tools@gmail.com> gcc/ChangeLog PR target/106303 PR target/106347 * config/i386/i386-features.cc (make_vector_copies): Move from general_scalar_chain to scalar_chain. (convert_reg): Likewise. (convert_insn_common): New scalar_chain method split out from general_scalar_chain convert_insn. (convert_registers): Move from general_scalar_chain to scalar_chain. (scalar_chain::convert): Call convert_insn_common before calling convert_insn. (timode_remove_non_convertible_regs): Iterate until there are no further changes to the candidates. * config/i386/i386-features.h (scalar_chain::hash_map): Move from general_scalar_chain. (scalar_chain::convert_reg): Likewise. (scalar_chain::convert_insn_common): New shared method. (scalar_chain::make_vector_copies): Move from general_scalar_chain. (scalar_chain::convert_registers): Likewise. No longer virtual. (general_scalar_chain::hash_map): Delete. Moved to scalar_chain. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::make_vector_copies): Likewise. (general_scalar_chain::convert_registers): Delete virtual method. (timode_scalar_chain::convert_registers): Likewise. gcc/testsuite/ChangeLog PR target/106303 PR target/106347 * gcc.target/i386/pr106303.c: New test case. * gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
for (df_ref ref = DF_REG_USE_CHAIN (id);
ref;
ref = DF_REF_NEXT_REG (ref))
if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
{
if (dump_file)
fprintf (dump_file, "Removing insn %d from candidates list\n",
DF_REF_INSN_UID (ref));
PR target/106303: Fix TImode STV related failures on x86. This patch resolves PR target/106303 (and the related PRs 106347, 106404, 106407) which are ICEs caused by my improvements to x86_64's 128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies for the breakage. The issue is that data flow analysis is used to partition usage of each TImode pseudo into "chains", where each chain is analyzed and if suitable converted to vector operations. The problems appears when some chains for a pseudo are converted, and others aren't as RTL sharing can result in some mode changes leaking into other instructions that aren't/shouldn't/can't be converted, which eventually leads to an ICE for mismatched modes. My first approach to a fix was to unify more of the STV infrastructure, reasoning that if TImode STV was exhibiting these problems, but DImode and SImode STV weren't, the issue was likely to be caused/resolved by these remaining differences. This appeared to fix some but not all of the reported PRs. A better solution was then proposed by H.J. Lu in Bugzilla, that we need to iterate the removal of candidates in the function timode_remove_non_convertible_regs until there are no further changes. As each chain is removed from consideration, it in turn may affect whether other insns/chains can safely be converted. 2022-07-24 Roger Sayle <roger@nextmovesoftware.com> H.J. Lu <hjl.tools@gmail.com> gcc/ChangeLog PR target/106303 PR target/106347 * config/i386/i386-features.cc (make_vector_copies): Move from general_scalar_chain to scalar_chain. (convert_reg): Likewise. (convert_insn_common): New scalar_chain method split out from general_scalar_chain convert_insn. (convert_registers): Move from general_scalar_chain to scalar_chain. (scalar_chain::convert): Call convert_insn_common before calling convert_insn. (timode_remove_non_convertible_regs): Iterate until there are no further changes to the candidates. * config/i386/i386-features.h (scalar_chain::hash_map): Move from general_scalar_chain. (scalar_chain::convert_reg): Likewise. (scalar_chain::convert_insn_common): New shared method. (scalar_chain::make_vector_copies): Move from general_scalar_chain. (scalar_chain::convert_registers): Likewise. No longer virtual. (general_scalar_chain::hash_map): Delete. Moved to scalar_chain. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::make_vector_copies): Likewise. (general_scalar_chain::convert_registers): Delete virtual method. (timode_scalar_chain::convert_registers): Likewise. gcc/testsuite/ChangeLog PR target/106303 PR target/106347 * gcc.target/i386/pr106303.c: New test case. * gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
changed = true;
}
}
} while (changed);
BITMAP_FREE (regs);
}
/* Main STV pass function. Find and convert scalar
instructions into vector mode when profitable. */
static unsigned int
convert_scalars_to_vector (bool timode_p)
{
basic_block bb;
int converted_insns = 0;
bitmap_obstack_initialize (NULL);
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
const machine_mode cand_mode[3] = { SImode, DImode, TImode };
const machine_mode cand_vmode[3] = { V4SImode, V2DImode, V1TImode };
bitmap_head candidates[3]; /* { SImode, DImode, TImode } */
for (unsigned i = 0; i < 3; ++i)
bitmap_initialize (&candidates[i], &bitmap_default_obstack);
calculate_dominance_info (CDI_DOMINATORS);
df_set_flags (DF_DEFER_INSN_RESCAN | DF_RD_PRUNE_DEAD_DEFS);
df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
df_analyze ();
/* Find all instructions we want to convert into vector mode. */
if (dump_file)
fprintf (dump_file, "Searching for mode conversion candidates...\n");
FOR_EACH_BB_FN (bb, cfun)
{
rtx_insn *insn;
FOR_BB_INSNS (bb, insn)
if (timode_p
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
&& timode_scalar_to_vector_candidate_p (insn))
{
if (dump_file)
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
fprintf (dump_file, " insn %d is marked as a TImode candidate\n",
INSN_UID (insn));
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
bitmap_set_bit (&candidates[2], INSN_UID (insn));
}
else if (!timode_p)
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
{
/* Check {SI,DI}mode. */
for (unsigned i = 0; i <= 1; ++i)
if (general_scalar_to_vector_candidate_p (insn, cand_mode[i]))
{
if (dump_file)
fprintf (dump_file, " insn %d is marked as a %s candidate\n",
INSN_UID (insn), i == 0 ? "SImode" : "DImode");
bitmap_set_bit (&candidates[i], INSN_UID (insn));
break;
}
}
}
if (timode_p)
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
timode_remove_non_convertible_regs (&candidates[2]);
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
for (unsigned i = 0; i <= 2; ++i)
if (!bitmap_empty_p (&candidates[i]))
break;
else if (i == 2 && dump_file)
fprintf (dump_file, "There are no candidates for optimization.\n");
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
for (unsigned i = 0; i <= 2; ++i)
while (!bitmap_empty_p (&candidates[i]))
{
unsigned uid = bitmap_first_set_bit (&candidates[i]);
scalar_chain *chain;
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
if (cand_mode[i] == TImode)
chain = new timode_scalar_chain;
else
chain = new general_scalar_chain (cand_mode[i], cand_vmode[i]);
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
/* Find instructions chain we want to convert to vector mode.
Check all uses and definitions to estimate all required
conversions. */
chain->build (&candidates[i], uid);
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
if (chain->compute_convert_gain () > 0)
converted_insns += chain->convert ();
else
if (dump_file)
fprintf (dump_file, "Chain #%d conversion is not profitable\n",
chain->chain_id);
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
delete chain;
}
if (dump_file)
fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
re PR rtl-optimization/91154 (456.hmmer regression on Haswell caused by r272922) 2019-08-14 Richard Biener <rguenther@suse.de> Uroš Bizjak <ubizjak@gmail.com> PR target/91154 * config/i386/i386-features.h (scalar_chain::scalar_chain): Add mode arguments. (scalar_chain::smode): New member. (scalar_chain::vmode): Likewise. (dimode_scalar_chain): Rename to... (general_scalar_chain): ... this. (general_scalar_chain::general_scalar_chain): Take mode arguments. (timode_scalar_chain::timode_scalar_chain): Initialize scalar_chain base with TImode and V1TImode. * config/i386/i386-features.c (scalar_chain::scalar_chain): Adjust. (general_scalar_chain::vector_const_cost): Adjust for SImode chains. (general_scalar_chain::compute_convert_gain): Likewise. Add {S,U}{MIN,MAX} support. (general_scalar_chain::replace_with_subreg): Use vmode/smode. (general_scalar_chain::make_vector_copies): Likewise. Handle non-DImode chains appropriately. (general_scalar_chain::convert_reg): Likewise. (general_scalar_chain::convert_op): Likewise. (general_scalar_chain::convert_insn): Likewise. Add fatal_insn_not_found if the result is not recognized. (convertible_comparison_p): Pass in the scalar mode and use that. (general_scalar_to_vector_candidate_p): Likewise. Rename from dimode_scalar_to_vector_candidate_p. Add {S,U}{MIN,MAX} support. (scalar_to_vector_candidate_p): Remove by inlining into single caller. (general_remove_non_convertible_regs): Rename from dimode_remove_non_convertible_regs. (remove_non_convertible_regs): Remove by inlining into single caller. (convert_scalars_to_vector): Handle SImode and DImode chains in addition to TImode chains. * config/i386/i386.md (<maxmin><MAXMIN_IMODE>3): New expander. (*<maxmin><MAXMIN_IMODE>3_1): New insn-and-split. (*<maxmin>di3_doubleword): Likewise. * gcc.target/i386/pr91154.c: New testcase. * gcc.target/i386/minmax-3.c: Likewise. * gcc.target/i386/minmax-4.c: Likewise. * gcc.target/i386/minmax-5.c: Likewise. * gcc.target/i386/minmax-6.c: Likewise. * gcc.target/i386/minmax-1.c: Add -mno-stv. * gcc.target/i386/minmax-2.c: Likewise. Co-Authored-By: Uros Bizjak <ubizjak@gmail.com> From-SVN: r274481
2019-08-14 14:04:05 +02:00
for (unsigned i = 0; i <= 2; ++i)
bitmap_release (&candidates[i]);
bitmap_obstack_release (NULL);
df_process_deferred_rescans ();
/* Conversion means we may have 128bit register spills/fills
which require aligned stack. */
if (converted_insns)
{
if (crtl->stack_alignment_needed < 128)
crtl->stack_alignment_needed = 128;
if (crtl->stack_alignment_estimated < 128)
crtl->stack_alignment_estimated = 128;
crtl->stack_realign_needed
= INCOMING_STACK_BOUNDARY < crtl->stack_alignment_estimated;
crtl->stack_realign_tried = crtl->stack_realign_needed;
crtl->stack_realign_processed = true;
if (!crtl->drap_reg)
{
rtx drap_rtx = targetm.calls.get_drap_rtx ();
/* stack_realign_drap and drap_rtx must match. */
gcc_assert ((stack_realign_drap != 0) == (drap_rtx != NULL));
/* Do nothing if NULL is returned,
which means DRAP is not needed. */
if (drap_rtx != NULL)
{
crtl->args.internal_arg_pointer = drap_rtx;
/* Call fixup_tail_calls to clean up
REG_EQUIV note if DRAP is needed. */
fixup_tail_calls ();
}
}
/* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
if (TARGET_64BIT)
for (tree parm = DECL_ARGUMENTS (current_function_decl);
parm; parm = DECL_CHAIN (parm))
{
if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
continue;
if (DECL_RTL_SET_P (parm)
&& GET_MODE (DECL_RTL (parm)) == V1TImode)
{
rtx r = DECL_RTL (parm);
if (REG_P (r))
SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
}
if (DECL_INCOMING_RTL (parm)
&& GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
{
rtx r = DECL_INCOMING_RTL (parm);
if (REG_P (r))
DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
}
}
}
return 0;
}
static unsigned int
rest_of_handle_insert_vzeroupper (void)
{
Fix _mm256_zeroupper by representing the instructions as call_insns in which the call has a special vzeroupper ABI. When __builtin_ia32_vzeroupper is called explicitly, the corresponding vzeroupper pattern does not carry any CLOBBERS or SETs before LRA, which leads to incorrect optimization in pass_reload. In order to solve this problem, this patch refine instructions as call_insns in which the call has a special vzeroupper ABI. gcc/ChangeLog: PR target/82735 * config/i386/i386-expand.c (ix86_expand_builtin): Remove assignment of cfun->machine->has_explicit_vzeroupper. * config/i386/i386-features.c (ix86_add_reg_usage_to_vzerouppers): Delete. (ix86_add_reg_usage_to_vzeroupper): Ditto. (rest_of_handle_insert_vzeroupper): Remove ix86_add_reg_usage_to_vzerouppers, add df_analyze at the end of the function. (gate): Remove cfun->machine->has_explicit_vzeroupper. * config/i386/i386-protos.h (ix86_expand_avx_vzeroupper): Declared. * config/i386/i386.c (ix86_insn_callee_abi): New function. (ix86_initialize_callee_abi): Ditto. (ix86_expand_avx_vzeroupper): Ditto. (ix86_hard_regno_call_part_clobbered): Adjust for vzeroupper ABI. (TARGET_INSN_CALLEE_ABI): Define as ix86_insn_callee_abi. (ix86_emit_mode_set): Call ix86_expand_avx_vzeroupper directly. * config/i386/i386.h (struct GTY(()) machine_function): Delete has_explicit_vzeroupper. * config/i386/i386.md (enum unspec): New member UNSPEC_CALLEE_ABI. (ABI_DEFAULT,ABI_VZEROUPPER,ABI_UNKNOWN): New define_constants for insn callee abi index. * config/i386/predicates.md (vzeroupper_pattern): Adjust. * config/i386/sse.md (UNSPECV_VZEROUPPER): Deleted. (avx_vzeroupper): Call ix86_expand_avx_vzeroupper. (*avx_vzeroupper): Rename to .. (avx_vzeroupper_callee_abi): .. this, and adjust pattern as call_insn which has a special vzeroupper ABI. (*avx_vzeroupper_1): Deleted. gcc/testsuite/ChangeLog: PR target/82735 * gcc.target/i386/pr82735-1.c: New test. * gcc.target/i386/pr82735-2.c: New test. * gcc.target/i386/pr82735-3.c: New test. * gcc.target/i386/pr82735-4.c: New test. * gcc.target/i386/pr82735-5.c: New test.
2021-06-01 03:09:44 +02:00
/* vzeroupper instructions are inserted immediately after reload to
account for possible spills from 256bit or 512bit registers. The pass
reuses mode switching infrastructure by re-running mode insertion
pass, so disable entities that have already been processed. */
for (int i = 0; i < MAX_386_ENTITIES; i++)
ix86_optimize_mode_switching[i] = 0;
Fix _mm256_zeroupper by representing the instructions as call_insns in which the call has a special vzeroupper ABI. When __builtin_ia32_vzeroupper is called explicitly, the corresponding vzeroupper pattern does not carry any CLOBBERS or SETs before LRA, which leads to incorrect optimization in pass_reload. In order to solve this problem, this patch refine instructions as call_insns in which the call has a special vzeroupper ABI. gcc/ChangeLog: PR target/82735 * config/i386/i386-expand.c (ix86_expand_builtin): Remove assignment of cfun->machine->has_explicit_vzeroupper. * config/i386/i386-features.c (ix86_add_reg_usage_to_vzerouppers): Delete. (ix86_add_reg_usage_to_vzeroupper): Ditto. (rest_of_handle_insert_vzeroupper): Remove ix86_add_reg_usage_to_vzerouppers, add df_analyze at the end of the function. (gate): Remove cfun->machine->has_explicit_vzeroupper. * config/i386/i386-protos.h (ix86_expand_avx_vzeroupper): Declared. * config/i386/i386.c (ix86_insn_callee_abi): New function. (ix86_initialize_callee_abi): Ditto. (ix86_expand_avx_vzeroupper): Ditto. (ix86_hard_regno_call_part_clobbered): Adjust for vzeroupper ABI. (TARGET_INSN_CALLEE_ABI): Define as ix86_insn_callee_abi. (ix86_emit_mode_set): Call ix86_expand_avx_vzeroupper directly. * config/i386/i386.h (struct GTY(()) machine_function): Delete has_explicit_vzeroupper. * config/i386/i386.md (enum unspec): New member UNSPEC_CALLEE_ABI. (ABI_DEFAULT,ABI_VZEROUPPER,ABI_UNKNOWN): New define_constants for insn callee abi index. * config/i386/predicates.md (vzeroupper_pattern): Adjust. * config/i386/sse.md (UNSPECV_VZEROUPPER): Deleted. (avx_vzeroupper): Call ix86_expand_avx_vzeroupper. (*avx_vzeroupper): Rename to .. (avx_vzeroupper_callee_abi): .. this, and adjust pattern as call_insn which has a special vzeroupper ABI. (*avx_vzeroupper_1): Deleted. gcc/testsuite/ChangeLog: PR target/82735 * gcc.target/i386/pr82735-1.c: New test. * gcc.target/i386/pr82735-2.c: New test. * gcc.target/i386/pr82735-3.c: New test. * gcc.target/i386/pr82735-4.c: New test. * gcc.target/i386/pr82735-5.c: New test.
2021-06-01 03:09:44 +02:00
ix86_optimize_mode_switching[AVX_U128] = 1;
Fix _mm256_zeroupper by representing the instructions as call_insns in which the call has a special vzeroupper ABI. When __builtin_ia32_vzeroupper is called explicitly, the corresponding vzeroupper pattern does not carry any CLOBBERS or SETs before LRA, which leads to incorrect optimization in pass_reload. In order to solve this problem, this patch refine instructions as call_insns in which the call has a special vzeroupper ABI. gcc/ChangeLog: PR target/82735 * config/i386/i386-expand.c (ix86_expand_builtin): Remove assignment of cfun->machine->has_explicit_vzeroupper. * config/i386/i386-features.c (ix86_add_reg_usage_to_vzerouppers): Delete. (ix86_add_reg_usage_to_vzeroupper): Ditto. (rest_of_handle_insert_vzeroupper): Remove ix86_add_reg_usage_to_vzerouppers, add df_analyze at the end of the function. (gate): Remove cfun->machine->has_explicit_vzeroupper. * config/i386/i386-protos.h (ix86_expand_avx_vzeroupper): Declared. * config/i386/i386.c (ix86_insn_callee_abi): New function. (ix86_initialize_callee_abi): Ditto. (ix86_expand_avx_vzeroupper): Ditto. (ix86_hard_regno_call_part_clobbered): Adjust for vzeroupper ABI. (TARGET_INSN_CALLEE_ABI): Define as ix86_insn_callee_abi. (ix86_emit_mode_set): Call ix86_expand_avx_vzeroupper directly. * config/i386/i386.h (struct GTY(()) machine_function): Delete has_explicit_vzeroupper. * config/i386/i386.md (enum unspec): New member UNSPEC_CALLEE_ABI. (ABI_DEFAULT,ABI_VZEROUPPER,ABI_UNKNOWN): New define_constants for insn callee abi index. * config/i386/predicates.md (vzeroupper_pattern): Adjust. * config/i386/sse.md (UNSPECV_VZEROUPPER): Deleted. (avx_vzeroupper): Call ix86_expand_avx_vzeroupper. (*avx_vzeroupper): Rename to .. (avx_vzeroupper_callee_abi): .. this, and adjust pattern as call_insn which has a special vzeroupper ABI. (*avx_vzeroupper_1): Deleted. gcc/testsuite/ChangeLog: PR target/82735 * gcc.target/i386/pr82735-1.c: New test. * gcc.target/i386/pr82735-2.c: New test. * gcc.target/i386/pr82735-3.c: New test. * gcc.target/i386/pr82735-4.c: New test. * gcc.target/i386/pr82735-5.c: New test.
2021-06-01 03:09:44 +02:00
/* Call optimize_mode_switching. */
g->get_passes ()->execute_pass_mode_switching ();
df_analyze ();
return 0;
}
namespace {
const pass_data pass_data_insert_vzeroupper =
{
RTL_PASS, /* type */
"vzeroupper", /* name */
OPTGROUP_NONE, /* optinfo_flags */
TV_MACH_DEP, /* tv_id */
0, /* properties_required */
0, /* properties_provided */
0, /* properties_destroyed */
0, /* todo_flags_start */
TODO_df_finish, /* todo_flags_finish */
};
class pass_insert_vzeroupper : public rtl_opt_pass
{
public:
pass_insert_vzeroupper(gcc::context *ctxt)
: rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
{}
/* opt_pass methods: */
Add 'final' and 'override' to opt_pass vfunc impls gcc/ChangeLog: * adjust-alignment.cc: Add "final" and "override" to opt_pass vfunc implementations, removing redundant "virtual" as appropriate. * asan.cc: Likewise. * auto-inc-dec.cc: Likewise. * auto-profile.cc: Likewise. * bb-reorder.cc: Likewise. * cfgcleanup.cc: Likewise. * cfgexpand.cc: Likewise. * cfgrtl.cc: Likewise. * cgraphbuild.cc: Likewise. * combine-stack-adj.cc: Likewise. * combine.cc: Likewise. * compare-elim.cc: Likewise. * config/i386/i386-features.cc: Likewise. * coroutine-passes.cc: Likewise. * cprop.cc: Likewise. * cse.cc: Likewise. * dce.cc: Likewise. * df-core.cc: Likewise. * dse.cc: Likewise. * dwarf2cfi.cc: Likewise. * early-remat.cc: Likewise. * except.cc: Likewise. * final.cc: Likewise. * function.cc: Likewise. * fwprop.cc: Likewise. * gcse.cc: Likewise. * gimple-harden-conditionals.cc: Likewise. * gimple-if-to-switch.cc: Likewise. * gimple-isel.cc: Likewise. * gimple-laddress.cc: Likewise. * gimple-loop-interchange.cc: Likewise. * gimple-loop-jam.cc: Likewise. * gimple-loop-versioning.cc: Likewise. * gimple-low.cc: Likewise. * gimple-ssa-backprop.cc: Likewise. * gimple-ssa-evrp.cc: Likewise. * gimple-ssa-isolate-paths.cc: Likewise. * gimple-ssa-nonnull-compare.cc: Likewise. * gimple-ssa-split-paths.cc: Likewise. * gimple-ssa-store-merging.cc: Likewise. * gimple-ssa-strength-reduction.cc: Likewise. * gimple-ssa-warn-access.cc: Likewise. * gimple-ssa-warn-alloca.cc: Likewise. * gimple-ssa-warn-restrict.cc: Likewise. * gimple-warn-recursion.cc: Likewise. * graphite.cc: Likewise. * ifcvt.cc: Likewise. * init-regs.cc: Likewise. * ipa-comdats.cc: Likewise. * ipa-cp.cc: Likewise. * ipa-devirt.cc: Likewise. * ipa-fnsummary.cc: Likewise. * ipa-free-lang-data.cc: Likewise. * ipa-icf.cc: Likewise. * ipa-inline.cc: Likewise. * ipa-modref.cc: Likewise. * ipa-profile.cc: Likewise. * ipa-pure-const.cc: Likewise. * ipa-reference.cc: Likewise. * ipa-split.cc: Likewise. * ipa-sra.cc: Likewise. * ipa-visibility.cc: Likewise. * ipa.cc: Likewise. * ira.cc: Likewise. * jump.cc: Likewise. * loop-init.cc: Likewise. * lower-subreg.cc: Likewise. * mode-switching.cc: Likewise. * modulo-sched.cc: Likewise. * multiple_target.cc: Likewise. * omp-expand.cc: Likewise. * omp-low.cc: Likewise. * omp-oacc-kernels-decompose.cc: Likewise. * omp-oacc-neuter-broadcast.cc: Likewise. * omp-offload.cc: Likewise. * omp-simd-clone.cc: Likewise. * passes.cc: Likewise. * postreload-gcse.cc: Likewise. * postreload.cc: Likewise. * predict.cc: Likewise. * recog.cc: Likewise. * ree.cc: Likewise. * reg-stack.cc: Likewise. * regcprop.cc: Likewise. * reginfo.cc: Likewise. * regrename.cc: Likewise. * reorg.cc: Likewise. * sancov.cc: Likewise. * sanopt.cc: Likewise. * sched-rgn.cc: Likewise. * stack-ptr-mod.cc: Likewise. * store-motion.cc: Likewise. * tracer.cc: Likewise. * trans-mem.cc: Likewise. * tree-call-cdce.cc: Likewise. * tree-cfg.cc: Likewise. * tree-cfgcleanup.cc: Likewise. * tree-complex.cc: Likewise. * tree-eh.cc: Likewise. * tree-emutls.cc: Likewise. * tree-if-conv.cc: Likewise. * tree-into-ssa.cc: Likewise. * tree-loop-distribution.cc: Likewise. * tree-nrv.cc: Likewise. * tree-object-size.cc: Likewise. * tree-parloops.cc: Likewise. * tree-predcom.cc: Likewise. * tree-profile.cc: Likewise. * tree-sra.cc: Likewise. * tree-ssa-ccp.cc: Likewise. * tree-ssa-copy.cc: Likewise. * tree-ssa-dce.cc: Likewise. * tree-ssa-dom.cc: Likewise. * tree-ssa-dse.cc: Likewise. * tree-ssa-forwprop.cc: Likewise. * tree-ssa-ifcombine.cc: Likewise. * tree-ssa-loop-ch.cc: Likewise. * tree-ssa-loop-im.cc: Likewise. * tree-ssa-loop-ivcanon.cc: Likewise. * tree-ssa-loop-prefetch.cc: Likewise. * tree-ssa-loop-split.cc: Likewise. * tree-ssa-loop-unswitch.cc: Likewise. * tree-ssa-loop.cc: Likewise. * tree-ssa-math-opts.cc: Likewise. * tree-ssa-phiopt.cc: Likewise. * tree-ssa-phiprop.cc: Likewise. * tree-ssa-pre.cc: Likewise. * tree-ssa-reassoc.cc: Likewise. * tree-ssa-sccvn.cc: Likewise. * tree-ssa-sink.cc: Likewise. * tree-ssa-strlen.cc: Likewise. * tree-ssa-structalias.cc: Likewise. * tree-ssa-uncprop.cc: Likewise. * tree-ssa-uninit.cc: Likewise. * tree-ssanames.cc: Likewise. * tree-stdarg.cc: Likewise. * tree-switch-conversion.cc: Likewise. * tree-tailcall.cc: Likewise. * tree-vect-generic.cc: Likewise. * tree-vectorizer.cc: Likewise. * tree-vrp.cc: Likewise. * tsan.cc: Likewise. * ubsan.cc: Likewise. * var-tracking.cc: Likewise. * vtable-verify.cc: Likewise. * web.cc: Likewise. Signed-off-by: David Malcolm <dmalcolm@redhat.com>
2022-06-27 23:00:33 +02:00
bool gate (function *) final override
{
Fix _mm256_zeroupper by representing the instructions as call_insns in which the call has a special vzeroupper ABI. When __builtin_ia32_vzeroupper is called explicitly, the corresponding vzeroupper pattern does not carry any CLOBBERS or SETs before LRA, which leads to incorrect optimization in pass_reload. In order to solve this problem, this patch refine instructions as call_insns in which the call has a special vzeroupper ABI. gcc/ChangeLog: PR target/82735 * config/i386/i386-expand.c (ix86_expand_builtin): Remove assignment of cfun->machine->has_explicit_vzeroupper. * config/i386/i386-features.c (ix86_add_reg_usage_to_vzerouppers): Delete. (ix86_add_reg_usage_to_vzeroupper): Ditto. (rest_of_handle_insert_vzeroupper): Remove ix86_add_reg_usage_to_vzerouppers, add df_analyze at the end of the function. (gate): Remove cfun->machine->has_explicit_vzeroupper. * config/i386/i386-protos.h (ix86_expand_avx_vzeroupper): Declared. * config/i386/i386.c (ix86_insn_callee_abi): New function. (ix86_initialize_callee_abi): Ditto. (ix86_expand_avx_vzeroupper): Ditto. (ix86_hard_regno_call_part_clobbered): Adjust for vzeroupper ABI. (TARGET_INSN_CALLEE_ABI): Define as ix86_insn_callee_abi. (ix86_emit_mode_set): Call ix86_expand_avx_vzeroupper directly. * config/i386/i386.h (struct GTY(()) machine_function): Delete has_explicit_vzeroupper. * config/i386/i386.md (enum unspec): New member UNSPEC_CALLEE_ABI. (ABI_DEFAULT,ABI_VZEROUPPER,ABI_UNKNOWN): New define_constants for insn callee abi index. * config/i386/predicates.md (vzeroupper_pattern): Adjust. * config/i386/sse.md (UNSPECV_VZEROUPPER): Deleted. (avx_vzeroupper): Call ix86_expand_avx_vzeroupper. (*avx_vzeroupper): Rename to .. (avx_vzeroupper_callee_abi): .. this, and adjust pattern as call_insn which has a special vzeroupper ABI. (*avx_vzeroupper_1): Deleted. gcc/testsuite/ChangeLog: PR target/82735 * gcc.target/i386/pr82735-1.c: New test. * gcc.target/i386/pr82735-2.c: New test. * gcc.target/i386/pr82735-3.c: New test. * gcc.target/i386/pr82735-4.c: New test. * gcc.target/i386/pr82735-5.c: New test.
2021-06-01 03:09:44 +02:00
return TARGET_AVX && TARGET_VZEROUPPER
&& flag_expensive_optimizations && !optimize_size;
}
Add 'final' and 'override' to opt_pass vfunc impls gcc/ChangeLog: * adjust-alignment.cc: Add "final" and "override" to opt_pass vfunc implementations, removing redundant "virtual" as appropriate. * asan.cc: Likewise. * auto-inc-dec.cc: Likewise. * auto-profile.cc: Likewise. * bb-reorder.cc: Likewise. * cfgcleanup.cc: Likewise. * cfgexpand.cc: Likewise. * cfgrtl.cc: Likewise. * cgraphbuild.cc: Likewise. * combine-stack-adj.cc: Likewise. * combine.cc: Likewise. * compare-elim.cc: Likewise. * config/i386/i386-features.cc: Likewise. * coroutine-passes.cc: Likewise. * cprop.cc: Likewise. * cse.cc: Likewise. * dce.cc: Likewise. * df-core.cc: Likewise. * dse.cc: Likewise. * dwarf2cfi.cc: Likewise. * early-remat.cc: Likewise. * except.cc: Likewise. * final.cc: Likewise. * function.cc: Likewise. * fwprop.cc: Likewise. * gcse.cc: Likewise. * gimple-harden-conditionals.cc: Likewise. * gimple-if-to-switch.cc: Likewise. * gimple-isel.cc: Likewise. * gimple-laddress.cc: Likewise. * gimple-loop-interchange.cc: Likewise. * gimple-loop-jam.cc: Likewise. * gimple-loop-versioning.cc: Likewise. * gimple-low.cc: Likewise. * gimple-ssa-backprop.cc: Likewise. * gimple-ssa-evrp.cc: Likewise. * gimple-ssa-isolate-paths.cc: Likewise. * gimple-ssa-nonnull-compare.cc: Likewise. * gimple-ssa-split-paths.cc: Likewise. * gimple-ssa-store-merging.cc: Likewise. * gimple-ssa-strength-reduction.cc: Likewise. * gimple-ssa-warn-access.cc: Likewise. * gimple-ssa-warn-alloca.cc: Likewise. * gimple-ssa-warn-restrict.cc: Likewise. * gimple-warn-recursion.cc: Likewise. * graphite.cc: Likewise. * ifcvt.cc: Likewise. * init-regs.cc: Likewise. * ipa-comdats.cc: Likewise. * ipa-cp.cc: Likewise. * ipa-devirt.cc: Likewise. * ipa-fnsummary.cc: Likewise. * ipa-free-lang-data.cc: Likewise. * ipa-icf.cc: Likewise. * ipa-inline.cc: Likewise. * ipa-modref.cc: Likewise. * ipa-profile.cc: Likewise. * ipa-pure-const.cc: Likewise. * ipa-reference.cc: Likewise. * ipa-split.cc: Likewise. * ipa-sra.cc: Likewise. * ipa-visibility.cc: Likewise. * ipa.cc: Likewise. * ira.cc: Likewise. * jump.cc: Likewise. * loop-init.cc: Likewise. * lower-subreg.cc: Likewise. * mode-switching.cc: Likewise. * modulo-sched.cc: Likewise. * multiple_target.cc: Likewise. * omp-expand.cc: Likewise. * omp-low.cc: Likewise. * omp-oacc-kernels-decompose.cc: Likewise. * omp-oacc-neuter-broadcast.cc: Likewise. * omp-offload.cc: Likewise. * omp-simd-clone.cc: Likewise. * passes.cc: Likewise. * postreload-gcse.cc: Likewise. * postreload.cc: Likewise. * predict.cc: Likewise. * recog.cc: Likewise. * ree.cc: Likewise. * reg-stack.cc: Likewise. * regcprop.cc: Likewise. * reginfo.cc: Likewise. * regrename.cc: Likewise. * reorg.cc: Likewise. * sancov.cc: Likewise. * sanopt.cc: Likewise. * sched-rgn.cc: Likewise. * stack-ptr-mod.cc: Likewise. * store-motion.cc: Likewise. * tracer.cc: Likewise. * trans-mem.cc: Likewise. * tree-call-cdce.cc: Likewise. * tree-cfg.cc: Likewise. * tree-cfgcleanup.cc: Likewise. * tree-complex.cc: Likewise. * tree-eh.cc: Likewise. * tree-emutls.cc: Likewise. * tree-if-conv.cc: Likewise. * tree-into-ssa.cc: Likewise. * tree-loop-distribution.cc: Likewise. * tree-nrv.cc: Likewise. * tree-object-size.cc: Likewise. * tree-parloops.cc: Likewise. * tree-predcom.cc: Likewise. * tree-profile.cc: Likewise. * tree-sra.cc: Likewise. * tree-ssa-ccp.cc: Likewise. * tree-ssa-copy.cc: Likewise. * tree-ssa-dce.cc: Likewise. * tree-ssa-dom.cc: Likewise. * tree-ssa-dse.cc: Likewise. * tree-ssa-forwprop.cc: Likewise. * tree-ssa-ifcombine.cc: Likewise. * tree-ssa-loop-ch.cc: Likewise. * tree-ssa-loop-im.cc: Likewise. * tree-ssa-loop-ivcanon.cc: Likewise. * tree-ssa-loop-prefetch.cc: Likewise. * tree-ssa-loop-split.cc: Likewise. * tree-ssa-loop-unswitch.cc: Likewise. * tree-ssa-loop.cc: Likewise. * tree-ssa-math-opts.cc: Likewise. * tree-ssa-phiopt.cc: Likewise. * tree-ssa-phiprop.cc: Likewise. * tree-ssa-pre.cc: Likewise. * tree-ssa-reassoc.cc: Likewise. * tree-ssa-sccvn.cc: Likewise. * tree-ssa-sink.cc: Likewise. * tree-ssa-strlen.cc: Likewise. * tree-ssa-structalias.cc: Likewise. * tree-ssa-uncprop.cc: Likewise. * tree-ssa-uninit.cc: Likewise. * tree-ssanames.cc: Likewise. * tree-stdarg.cc: Likewise. * tree-switch-conversion.cc: Likewise. * tree-tailcall.cc: Likewise. * tree-vect-generic.cc: Likewise. * tree-vectorizer.cc: Likewise. * tree-vrp.cc: Likewise. * tsan.cc: Likewise. * ubsan.cc: Likewise. * var-tracking.cc: Likewise. * vtable-verify.cc: Likewise. * web.cc: Likewise. Signed-off-by: David Malcolm <dmalcolm@redhat.com>
2022-06-27 23:00:33 +02:00
unsigned int execute (function *) final override
{
return rest_of_handle_insert_vzeroupper ();
}
}; // class pass_insert_vzeroupper
const pass_data pass_data_stv =
{
RTL_PASS, /* type */
"stv", /* name */
OPTGROUP_NONE, /* optinfo_flags */
TV_MACH_DEP, /* tv_id */
0, /* properties_required */
0, /* properties_provided */
0, /* properties_destroyed */
0, /* todo_flags_start */
TODO_df_finish, /* todo_flags_finish */
};
class pass_stv : public rtl_opt_pass
{
public:
pass_stv (gcc::context *ctxt)
: rtl_opt_pass (pass_data_stv, ctxt),
timode_p (false)
{}
/* opt_pass methods: */
Add 'final' and 'override' to opt_pass vfunc impls gcc/ChangeLog: * adjust-alignment.cc: Add "final" and "override" to opt_pass vfunc implementations, removing redundant "virtual" as appropriate. * asan.cc: Likewise. * auto-inc-dec.cc: Likewise. * auto-profile.cc: Likewise. * bb-reorder.cc: Likewise. * cfgcleanup.cc: Likewise. * cfgexpand.cc: Likewise. * cfgrtl.cc: Likewise. * cgraphbuild.cc: Likewise. * combine-stack-adj.cc: Likewise. * combine.cc: Likewise. * compare-elim.cc: Likewise. * config/i386/i386-features.cc: Likewise. * coroutine-passes.cc: Likewise. * cprop.cc: Likewise. * cse.cc: Likewise. * dce.cc: Likewise. * df-core.cc: Likewise. * dse.cc: Likewise. * dwarf2cfi.cc: Likewise. * early-remat.cc: Likewise. * except.cc: Likewise. * final.cc: Likewise. * function.cc: Likewise. * fwprop.cc: Likewise. * gcse.cc: Likewise. * gimple-harden-conditionals.cc: Likewise. * gimple-if-to-switch.cc: Likewise. * gimple-isel.cc: Likewise. * gimple-laddress.cc: Likewise. * gimple-loop-interchange.cc: Likewise. * gimple-loop-jam.cc: Likewise. * gimple-loop-versioning.cc: Likewise. * gimple-low.cc: Likewise. * gimple-ssa-backprop.cc: Likewise. * gimple-ssa-evrp.cc: Likewise. * gimple-ssa-isolate-paths.cc: Likewise. * gimple-ssa-nonnull-compare.cc: Likewise. * gimple-ssa-split-paths.cc: Likewise. * gimple-ssa-store-merging.cc: Likewise. * gimple-ssa-strength-reduction.cc: Likewise. * gimple-ssa-warn-access.cc: Likewise. * gimple-ssa-warn-alloca.cc: Likewise. * gimple-ssa-warn-restrict.cc: Likewise. * gimple-warn-recursion.cc: Likewise. * graphite.cc: Likewise. * ifcvt.cc: Likewise. * init-regs.cc: Likewise. * ipa-comdats.cc: Likewise. * ipa-cp.cc: Likewise. * ipa-devirt.cc: Likewise. * ipa-fnsummary.cc: Likewise. * ipa-free-lang-data.cc: Likewise. * ipa-icf.cc: Likewise. * ipa-inline.cc: Likewise. * ipa-modref.cc: Likewise. * ipa-profile.cc: Likewise. * ipa-pure-const.cc: Likewise. * ipa-reference.cc: Likewise. * ipa-split.cc: Likewise. * ipa-sra.cc: Likewise. * ipa-visibility.cc: Likewise. * ipa.cc: Likewise. * ira.cc: Likewise. * jump.cc: Likewise. * loop-init.cc: Likewise. * lower-subreg.cc: Likewise. * mode-switching.cc: Likewise. * modulo-sched.cc: Likewise. * multiple_target.cc: Likewise. * omp-expand.cc: Likewise. * omp-low.cc: Likewise. * omp-oacc-kernels-decompose.cc: Likewise. * omp-oacc-neuter-broadcast.cc: Likewise. * omp-offload.cc: Likewise. * omp-simd-clone.cc: Likewise. * passes.cc: Likewise. * postreload-gcse.cc: Likewise. * postreload.cc: Likewise. * predict.cc: Likewise. * recog.cc: Likewise. * ree.cc: Likewise. * reg-stack.cc: Likewise. * regcprop.cc: Likewise. * reginfo.cc: Likewise. * regrename.cc: Likewise. * reorg.cc: Likewise. * sancov.cc: Likewise. * sanopt.cc: Likewise. * sched-rgn.cc: Likewise. * stack-ptr-mod.cc: Likewise. * store-motion.cc: Likewise. * tracer.cc: Likewise. * trans-mem.cc: Likewise. * tree-call-cdce.cc: Likewise. * tree-cfg.cc: Likewise. * tree-cfgcleanup.cc: Likewise. * tree-complex.cc: Likewise. * tree-eh.cc: Likewise. * tree-emutls.cc: Likewise. * tree-if-conv.cc: Likewise. * tree-into-ssa.cc: Likewise. * tree-loop-distribution.cc: Likewise. * tree-nrv.cc: Likewise. * tree-object-size.cc: Likewise. * tree-parloops.cc: Likewise. * tree-predcom.cc: Likewise. * tree-profile.cc: Likewise. * tree-sra.cc: Likewise. * tree-ssa-ccp.cc: Likewise. * tree-ssa-copy.cc: Likewise. * tree-ssa-dce.cc: Likewise. * tree-ssa-dom.cc: Likewise. * tree-ssa-dse.cc: Likewise. * tree-ssa-forwprop.cc: Likewise. * tree-ssa-ifcombine.cc: Likewise. * tree-ssa-loop-ch.cc: Likewise. * tree-ssa-loop-im.cc: Likewise. * tree-ssa-loop-ivcanon.cc: Likewise. * tree-ssa-loop-prefetch.cc: Likewise. * tree-ssa-loop-split.cc: Likewise. * tree-ssa-loop-unswitch.cc: Likewise. * tree-ssa-loop.cc: Likewise. * tree-ssa-math-opts.cc: Likewise. * tree-ssa-phiopt.cc: Likewise. * tree-ssa-phiprop.cc: Likewise. * tree-ssa-pre.cc: Likewise. * tree-ssa-reassoc.cc: Likewise. * tree-ssa-sccvn.cc: Likewise. * tree-ssa-sink.cc: Likewise. * tree-ssa-strlen.cc: Likewise. * tree-ssa-structalias.cc: Likewise. * tree-ssa-uncprop.cc: Likewise. * tree-ssa-uninit.cc: Likewise. * tree-ssanames.cc: Likewise. * tree-stdarg.cc: Likewise. * tree-switch-conversion.cc: Likewise. * tree-tailcall.cc: Likewise. * tree-vect-generic.cc: Likewise. * tree-vectorizer.cc: Likewise. * tree-vrp.cc: Likewise. * tsan.cc: Likewise. * ubsan.cc: Likewise. * var-tracking.cc: Likewise. * vtable-verify.cc: Likewise. * web.cc: Likewise. Signed-off-by: David Malcolm <dmalcolm@redhat.com>
2022-06-27 23:00:33 +02:00
bool gate (function *) final override
{
return ((!timode_p || TARGET_64BIT)
&& TARGET_STV && TARGET_SSE2 && optimize > 1);
}
Add 'final' and 'override' to opt_pass vfunc impls gcc/ChangeLog: * adjust-alignment.cc: Add "final" and "override" to opt_pass vfunc implementations, removing redundant "virtual" as appropriate. * asan.cc: Likewise. * auto-inc-dec.cc: Likewise. * auto-profile.cc: Likewise. * bb-reorder.cc: Likewise. * cfgcleanup.cc: Likewise. * cfgexpand.cc: Likewise. * cfgrtl.cc: Likewise. * cgraphbuild.cc: Likewise. * combine-stack-adj.cc: Likewise. * combine.cc: Likewise. * compare-elim.cc: Likewise. * config/i386/i386-features.cc: Likewise. * coroutine-passes.cc: Likewise. * cprop.cc: Likewise. * cse.cc: Likewise. * dce.cc: Likewise. * df-core.cc: Likewise. * dse.cc: Likewise. * dwarf2cfi.cc: Likewise. * early-remat.cc: Likewise. * except.cc: Likewise. * final.cc: Likewise. * function.cc: Likewise. * fwprop.cc: Likewise. * gcse.cc: Likewise. * gimple-harden-conditionals.cc: Likewise. * gimple-if-to-switch.cc: Likewise. * gimple-isel.cc: Likewise. * gimple-laddress.cc: Likewise. * gimple-loop-interchange.cc: Likewise. * gimple-loop-jam.cc: Likewise. * gimple-loop-versioning.cc: Likewise. * gimple-low.cc: Likewise. * gimple-ssa-backprop.cc: Likewise. * gimple-ssa-evrp.cc: Likewise. * gimple-ssa-isolate-paths.cc: Likewise. * gimple-ssa-nonnull-compare.cc: Likewise. * gimple-ssa-split-paths.cc: Likewise. * gimple-ssa-store-merging.cc: Likewise. * gimple-ssa-strength-reduction.cc: Likewise. * gimple-ssa-warn-access.cc: Likewise. * gimple-ssa-warn-alloca.cc: Likewise. * gimple-ssa-warn-restrict.cc: Likewise. * gimple-warn-recursion.cc: Likewise. * graphite.cc: Likewise. * ifcvt.cc: Likewise. * init-regs.cc: Likewise. * ipa-comdats.cc: Likewise. * ipa-cp.cc: Likewise. * ipa-devirt.cc: Likewise. * ipa-fnsummary.cc: Likewise. * ipa-free-lang-data.cc: Likewise. * ipa-icf.cc: Likewise. * ipa-inline.cc: Likewise. * ipa-modref.cc: Likewise. * ipa-profile.cc: Likewise. * ipa-pure-const.cc: Likewise. * ipa-reference.cc: Likewise. * ipa-split.cc: Likewise. * ipa-sra.cc: Likewise. * ipa-visibility.cc: Likewise. * ipa.cc: Likewise. * ira.cc: Likewise. * jump.cc: Likewise. * loop-init.cc: Likewise. * lower-subreg.cc: Likewise. * mode-switching.cc: Likewise. * modulo-sched.cc: Likewise. * multiple_target.cc: Likewise. * omp-expand.cc: Likewise. * omp-low.cc: Likewise. * omp-oacc-kernels-decompose.cc: Likewise. * omp-oacc-neuter-broadcast.cc: Likewise. * omp-offload.cc: Likewise. * omp-simd-clone.cc: Likewise. * passes.cc: Likewise. * postreload-gcse.cc: Likewise. * postreload.cc: Likewise. * predict.cc: Likewise. * recog.cc: Likewise. * ree.cc: Likewise. * reg-stack.cc: Likewise. * regcprop.cc: Likewise. * reginfo.cc: Likewise. * regrename.cc: Likewise. * reorg.cc: Likewise. * sancov.cc: Likewise. * sanopt.cc: Likewise. * sched-rgn.cc: Likewise. * stack-ptr-mod.cc: Likewise. * store-motion.cc: Likewise. * tracer.cc: Likewise. * trans-mem.cc: Likewise. * tree-call-cdce.cc: Likewise. * tree-cfg.cc: Likewise. * tree-cfgcleanup.cc: Likewise. * tree-complex.cc: Likewise. * tree-eh.cc: Likewise. * tree-emutls.cc: Likewise. * tree-if-conv.cc: Likewise. * tree-into-ssa.cc: Likewise. * tree-loop-distribution.cc: Likewise. * tree-nrv.cc: Likewise. * tree-object-size.cc: Likewise. * tree-parloops.cc: Likewise. * tree-predcom.cc: Likewise. * tree-profile.cc: Likewise. * tree-sra.cc: Likewise. * tree-ssa-ccp.cc: Likewise. * tree-ssa-copy.cc: Likewise. * tree-ssa-dce.cc: Likewise. * tree-ssa-dom.cc: Likewise. * tree-ssa-dse.cc: Likewise. * tree-ssa-forwprop.cc: Likewise. * tree-ssa-ifcombine.cc: Likewise. * tree-ssa-loop-ch.cc: Likewise. * tree-ssa-loop-im.cc: Likewise. * tree-ssa-loop-ivcanon.cc: Likewise. * tree-ssa-loop-prefetch.cc: Likewise. * tree-ssa-loop-split.cc: Likewise. * tree-ssa-loop-unswitch.cc: Likewise. * tree-ssa-loop.cc: Likewise. * tree-ssa-math-opts.cc: Likewise. * tree-ssa-phiopt.cc: Likewise. * tree-ssa-phiprop.cc: Likewise. * tree-ssa-pre.cc: Likewise. * tree-ssa-reassoc.cc: Likewise. * tree-ssa-sccvn.cc: Likewise. * tree-ssa-sink.cc: Likewise. * tree-ssa-strlen.cc: Likewise. * tree-ssa-structalias.cc: Likewise. * tree-ssa-uncprop.cc: Likewise. * tree-ssa-uninit.cc: Likewise. * tree-ssanames.cc: Likewise. * tree-stdarg.cc: Likewise. * tree-switch-conversion.cc: Likewise. * tree-tailcall.cc: Likewise. * tree-vect-generic.cc: Likewise. * tree-vectorizer.cc: Likewise. * tree-vrp.cc: Likewise. * tsan.cc: Likewise. * ubsan.cc: Likewise. * var-tracking.cc: Likewise. * vtable-verify.cc: Likewise. * web.cc: Likewise. Signed-off-by: David Malcolm <dmalcolm@redhat.com>
2022-06-27 23:00:33 +02:00
unsigned int execute (function *) final override
{
return convert_scalars_to_vector (timode_p);
}
Add 'final' and 'override' to opt_pass vfunc impls gcc/ChangeLog: * adjust-alignment.cc: Add "final" and "override" to opt_pass vfunc implementations, removing redundant "virtual" as appropriate. * asan.cc: Likewise. * auto-inc-dec.cc: Likewise. * auto-profile.cc: Likewise. * bb-reorder.cc: Likewise. * cfgcleanup.cc: Likewise. * cfgexpand.cc: Likewise. * cfgrtl.cc: Likewise. * cgraphbuild.cc: Likewise. * combine-stack-adj.cc: Likewise. * combine.cc: Likewise. * compare-elim.cc: Likewise. * config/i386/i386-features.cc: Likewise. * coroutine-passes.cc: Likewise. * cprop.cc: Likewise. * cse.cc: Likewise. * dce.cc: Likewise. * df-core.cc: Likewise. * dse.cc: Likewise. * dwarf2cfi.cc: Likewise. * early-remat.cc: Likewise. * except.cc: Likewise. * final.cc: Likewise. * function.cc: Likewise. * fwprop.cc: Likewise. * gcse.cc: Likewise. * gimple-harden-conditionals.cc: Likewise. * gimple-if-to-switch.cc: Likewise. * gimple-isel.cc: Likewise. * gimple-laddress.cc: Likewise. * gimple-loop-interchange.cc: Likewise. * gimple-loop-jam.cc: Likewise. * gimple-loop-versioning.cc: Likewise. * gimple-low.cc: Likewise. * gimple-ssa-backprop.cc: Likewise. * gimple-ssa-evrp.cc: Likewise. * gimple-ssa-isolate-paths.cc: Likewise. * gimple-ssa-nonnull-compare.cc: Likewise. * gimple-ssa-split-paths.cc: Likewise. * gimple-ssa-store-merging.cc: Likewise. * gimple-ssa-strength-reduction.cc: Likewise. * gimple-ssa-warn-access.cc: Likewise. * gimple-ssa-warn-alloca.cc: Likewise. * gimple-ssa-warn-restrict.cc: Likewise. * gimple-warn-recursion.cc: Likewise. * graphite.cc: Likewise. * ifcvt.cc: Likewise. * init-regs.cc: Likewise. * ipa-comdats.cc: Likewise. * ipa-cp.cc: Likewise. * ipa-devirt.cc: Likewise. * ipa-fnsummary.cc: Likewise. * ipa-free-lang-data.cc: Likewise. * ipa-icf.cc: Likewise. * ipa-inline.cc: Likewise. * ipa-modref.cc: Likewise. * ipa-profile.cc: Likewise. * ipa-pure-const.cc: Likewise. * ipa-reference.cc: Likewise. * ipa-split.cc: Likewise. * ipa-sra.cc: Likewise. * ipa-visibility.cc: Likewise. * ipa.cc: Likewise. * ira.cc: Likewise. * jump.cc: Likewise. * loop-init.cc: Likewise. * lower-subreg.cc: Likewise. * mode-switching.cc: Likewise. * modulo-sched.cc: Likewise. * multiple_target.cc: Likewise. * omp-expand.cc: Likewise. * omp-low.cc: Likewise. * omp-oacc-kernels-decompose.cc: Likewise. * omp-oacc-neuter-broadcast.cc: Likewise. * omp-offload.cc: Likewise. * omp-simd-clone.cc: Likewise. * passes.cc: Likewise. * postreload-gcse.cc: Likewise. * postreload.cc: Likewise. * predict.cc: Likewise. * recog.cc: Likewise. * ree.cc: Likewise. * reg-stack.cc: Likewise. * regcprop.cc: Likewise. * reginfo.cc: Likewise. * regrename.cc: Likewise. * reorg.cc: Likewise. * sancov.cc: Likewise. * sanopt.cc: Likewise. * sched-rgn.cc: Likewise. * stack-ptr-mod.cc: Likewise. * store-motion.cc: Likewise. * tracer.cc: Likewise. * trans-mem.cc: Likewise. * tree-call-cdce.cc: Likewise. * tree-cfg.cc: Likewise. * tree-cfgcleanup.cc: Likewise. * tree-complex.cc: Likewise. * tree-eh.cc: Likewise. * tree-emutls.cc: Likewise. * tree-if-conv.cc: Likewise. * tree-into-ssa.cc: Likewise. * tree-loop-distribution.cc: Likewise. * tree-nrv.cc: Likewise. * tree-object-size.cc: Likewise. * tree-parloops.cc: Likewise. * tree-predcom.cc: Likewise. * tree-profile.cc: Likewise. * tree-sra.cc: Likewise. * tree-ssa-ccp.cc: Likewise. * tree-ssa-copy.cc: Likewise. * tree-ssa-dce.cc: Likewise. * tree-ssa-dom.cc: Likewise. * tree-ssa-dse.cc: Likewise. * tree-ssa-forwprop.cc: Likewise. * tree-ssa-ifcombine.cc: Likewise. * tree-ssa-loop-ch.cc: Likewise. * tree-ssa-loop-im.cc: Likewise. * tree-ssa-loop-ivcanon.cc: Likewise. * tree-ssa-loop-prefetch.cc: Likewise. * tree-ssa-loop-split.cc: Likewise. * tree-ssa-loop-unswitch.cc: Likewise. * tree-ssa-loop.cc: Likewise. * tree-ssa-math-opts.cc: Likewise. * tree-ssa-phiopt.cc: Likewise. * tree-ssa-phiprop.cc: Likewise. * tree-ssa-pre.cc: Likewise. * tree-ssa-reassoc.cc: Likewise. * tree-ssa-sccvn.cc: Likewise. * tree-ssa-sink.cc: Likewise. * tree-ssa-strlen.cc: Likewise. * tree-ssa-structalias.cc: Likewise. * tree-ssa-uncprop.cc: Likewise. * tree-ssa-uninit.cc: Likewise. * tree-ssanames.cc: Likewise. * tree-stdarg.cc: Likewise. * tree-switch-conversion.cc: Likewise. * tree-tailcall.cc: Likewise. * tree-vect-generic.cc: Likewise. * tree-vectorizer.cc: Likewise. * tree-vrp.cc: Likewise. * tsan.cc: Likewise. * ubsan.cc: Likewise. * var-tracking.cc: Likewise. * vtable-verify.cc: Likewise. * web.cc: Likewise. Signed-off-by: David Malcolm <dmalcolm@redhat.com>
2022-06-27 23:00:33 +02:00
opt_pass *clone () final override
{
return new pass_stv (m_ctxt);
}
Add 'final' and 'override' to opt_pass vfunc impls gcc/ChangeLog: * adjust-alignment.cc: Add "final" and "override" to opt_pass vfunc implementations, removing redundant "virtual" as appropriate. * asan.cc: Likewise. * auto-inc-dec.cc: Likewise. * auto-profile.cc: Likewise. * bb-reorder.cc: Likewise. * cfgcleanup.cc: Likewise. * cfgexpand.cc: Likewise. * cfgrtl.cc: Likewise. * cgraphbuild.cc: Likewise. * combine-stack-adj.cc: Likewise. * combine.cc: Likewise. * compare-elim.cc: Likewise. * config/i386/i386-features.cc: Likewise. * coroutine-passes.cc: Likewise. * cprop.cc: Likewise. * cse.cc: Likewise. * dce.cc: Likewise. * df-core.cc: Likewise. * dse.cc: Likewise. * dwarf2cfi.cc: Likewise. * early-remat.cc: Likewise. * except.cc: Likewise. * final.cc: Likewise. * function.cc: Likewise. * fwprop.cc: Likewise. * gcse.cc: Likewise. * gimple-harden-conditionals.cc: Likewise. * gimple-if-to-switch.cc: Likewise. * gimple-isel.cc: Likewise. * gimple-laddress.cc: Likewise. * gimple-loop-interchange.cc: Likewise. * gimple-loop-jam.cc: Likewise. * gimple-loop-versioning.cc: Likewise. * gimple-low.cc: Likewise. * gimple-ssa-backprop.cc: Likewise. * gimple-ssa-evrp.cc: Likewise. * gimple-ssa-isolate-paths.cc: Likewise. * gimple-ssa-nonnull-compare.cc: Likewise. * gimple-ssa-split-paths.cc: Likewise. * gimple-ssa-store-merging.cc: Likewise. * gimple-ssa-strength-reduction.cc: Likewise. * gimple-ssa-warn-access.cc: Likewise. * gimple-ssa-warn-alloca.cc: Likewise. * gimple-ssa-warn-restrict.cc: Likewise. * gimple-warn-recursion.cc: Likewise. * graphite.cc: Likewise. * ifcvt.cc: Likewise. * init-regs.cc: Likewise. * ipa-comdats.cc: Likewise. * ipa-cp.cc: Likewise. * ipa-devirt.cc: Likewise. * ipa-fnsummary.cc: Likewise. * ipa-free-lang-data.cc: Likewise. * ipa-icf.cc: Likewise. * ipa-inline.cc: Likewise. * ipa-modref.cc: Likewise. * ipa-profile.cc: Likewise. * ipa-pure-const.cc: Likewise. * ipa-reference.cc: Likewise. * ipa-split.cc: Likewise. * ipa-sra.cc: Likewise. * ipa-visibility.cc: Likewise. * ipa.cc: Likewise. * ira.cc: Likewise. * jump.cc: Likewise. * loop-init.cc: Likewise. * lower-subreg.cc: Likewise. * mode-switching.cc: Likewise. * modulo-sched.cc: Likewise. * multiple_target.cc: Likewise. * omp-expand.cc: Likewise. * omp-low.cc: Likewise. * omp-oacc-kernels-decompose.cc: Likewise. * omp-oacc-neuter-broadcast.cc: Likewise. * omp-offload.cc: Likewise. * omp-simd-clone.cc: Likewise. * passes.cc: Likewise. * postreload-gcse.cc: Likewise. * postreload.cc: Likewise. * predict.cc: Likewise. * recog.cc: Likewise. * ree.cc: Likewise. * reg-stack.cc: Likewise. * regcprop.cc: Likewise. * reginfo.cc: Likewise. * regrename.cc: Likewise. * reorg.cc: Likewise. * sancov.cc: Likewise. * sanopt.cc: Likewise. * sched-rgn.cc: Likewise. * stack-ptr-mod.cc: Likewise. * store-motion.cc: Likewise. * tracer.cc: Likewise. * trans-mem.cc: Likewise. * tree-call-cdce.cc: Likewise. * tree-cfg.cc: Likewise. * tree-cfgcleanup.cc: Likewise. * tree-complex.cc: Likewise. * tree-eh.cc: Likewise. * tree-emutls.cc: Likewise. * tree-if-conv.cc: Likewise. * tree-into-ssa.cc: Likewise. * tree-loop-distribution.cc: Likewise. * tree-nrv.cc: Likewise. * tree-object-size.cc: Likewise. * tree-parloops.cc: Likewise. * tree-predcom.cc: Likewise. * tree-profile.cc: Likewise. * tree-sra.cc: Likewise. * tree-ssa-ccp.cc: Likewise. * tree-ssa-copy.cc: Likewise. * tree-ssa-dce.cc: Likewise. * tree-ssa-dom.cc: Likewise. * tree-ssa-dse.cc: Likewise. * tree-ssa-forwprop.cc: Likewise. * tree-ssa-ifcombine.cc: Likewise. * tree-ssa-loop-ch.cc: Likewise. * tree-ssa-loop-im.cc: Likewise. * tree-ssa-loop-ivcanon.cc: Likewise. * tree-ssa-loop-prefetch.cc: Likewise. * tree-ssa-loop-split.cc: Likewise. * tree-ssa-loop-unswitch.cc: Likewise. * tree-ssa-loop.cc: Likewise. * tree-ssa-math-opts.cc: Likewise. * tree-ssa-phiopt.cc: Likewise. * tree-ssa-phiprop.cc: Likewise. * tree-ssa-pre.cc: Likewise. * tree-ssa-reassoc.cc: Likewise. * tree-ssa-sccvn.cc: Likewise. * tree-ssa-sink.cc: Likewise. * tree-ssa-strlen.cc: Likewise. * tree-ssa-structalias.cc: Likewise. * tree-ssa-uncprop.cc: Likewise. * tree-ssa-uninit.cc: Likewise. * tree-ssanames.cc: Likewise. * tree-stdarg.cc: Likewise. * tree-switch-conversion.cc: Likewise. * tree-tailcall.cc: Likewise. * tree-vect-generic.cc: Likewise. * tree-vectorizer.cc: Likewise. * tree-vrp.cc: Likewise. * tsan.cc: Likewise. * ubsan.cc: Likewise. * var-tracking.cc: Likewise. * vtable-verify.cc: Likewise. * web.cc: Likewise. Signed-off-by: David Malcolm <dmalcolm@redhat.com>
2022-06-27 23:00:33 +02:00
void set_pass_param (unsigned int n, bool param) final override
{
gcc_assert (n == 0);
timode_p = param;
}
private:
bool timode_p;
}; // class pass_stv
} // anon namespace
rtl_opt_pass *
make_pass_insert_vzeroupper (gcc::context *ctxt)
{
return new pass_insert_vzeroupper (ctxt);
}
rtl_opt_pass *
make_pass_stv (gcc::context *ctxt)
{
return new pass_stv (ctxt);
}
x86: Add UNSPECV_PATCHABLE_AREA Currently patchable area is at the wrong place. It is placed immediately after function label, before both .cfi_startproc and ENDBR. This patch adds UNSPECV_PATCHABLE_AREA for pseudo patchable area instruction and changes ENDBR insertion pass to also insert patchable area instruction. TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY is defined to avoid placing patchable area before .cfi_startproc and ENDBR. gcc/ PR target/93492 * config/i386/i386-features.c (rest_of_insert_endbranch): Renamed to ... (rest_of_insert_endbr_and_patchable_area): Change return type to void. Add need_endbr and patchable_area_size arguments. Don't call timevar_push nor timevar_pop. Replace endbr_queued_at_entrance with insn_queued_at_entrance. Insert UNSPECV_PATCHABLE_AREA for patchable area. (pass_data_insert_endbranch): Renamed to ... (pass_data_insert_endbr_and_patchable_area): This. Change pass name to endbr_and_patchable_area. (pass_insert_endbranch): Renamed to ... (pass_insert_endbr_and_patchable_area): This. Add need_endbr and patchable_area_size;. (pass_insert_endbr_and_patchable_area::gate): Set and check need_endbr and patchable_area_size. (pass_insert_endbr_and_patchable_area::execute): Call timevar_push and timevar_pop. Pass need_endbr and patchable_area_size to rest_of_insert_endbr_and_patchable_area. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386-passes.def: Replace pass_insert_endbranch with pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (ix86_output_patchable_area): New. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386.c (ix86_asm_output_function_label): Set function_label_emitted to true. (ix86_print_patchable_function_entry): New function. (ix86_output_patchable_area): Likewise. (x86_function_profiler): Replace endbr_queued_at_entrance with insn_queued_at_entrance. Generate ENDBR only for TYPE_ENDBR. Call ix86_output_patchable_area to generate patchable area if needed. (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): New. * config/i386/i386.h (queued_insn_type): New. (machine_function): Add function_label_emitted. Replace endbr_queued_at_entrance with insn_queued_at_entrance. * config/i386/i386.md (UNSPECV_PATCHABLE_AREA): New. (patchable_area): New. gcc/testsuite/ PR target/93492 * gcc.target/i386/pr93492-1.c: New test. * gcc.target/i386/pr93492-2.c: Likewise. * gcc.target/i386/pr93492-3.c: Likewise. * gcc.target/i386/pr93492-4.c: Likewise. * gcc.target/i386/pr93492-5.c: Likewise.
2020-02-03 19:22:57 +01:00
/* Inserting ENDBR and pseudo patchable-area instructions. */
x86: Add UNSPECV_PATCHABLE_AREA Currently patchable area is at the wrong place. It is placed immediately after function label, before both .cfi_startproc and ENDBR. This patch adds UNSPECV_PATCHABLE_AREA for pseudo patchable area instruction and changes ENDBR insertion pass to also insert patchable area instruction. TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY is defined to avoid placing patchable area before .cfi_startproc and ENDBR. gcc/ PR target/93492 * config/i386/i386-features.c (rest_of_insert_endbranch): Renamed to ... (rest_of_insert_endbr_and_patchable_area): Change return type to void. Add need_endbr and patchable_area_size arguments. Don't call timevar_push nor timevar_pop. Replace endbr_queued_at_entrance with insn_queued_at_entrance. Insert UNSPECV_PATCHABLE_AREA for patchable area. (pass_data_insert_endbranch): Renamed to ... (pass_data_insert_endbr_and_patchable_area): This. Change pass name to endbr_and_patchable_area. (pass_insert_endbranch): Renamed to ... (pass_insert_endbr_and_patchable_area): This. Add need_endbr and patchable_area_size;. (pass_insert_endbr_and_patchable_area::gate): Set and check need_endbr and patchable_area_size. (pass_insert_endbr_and_patchable_area::execute): Call timevar_push and timevar_pop. Pass need_endbr and patchable_area_size to rest_of_insert_endbr_and_patchable_area. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386-passes.def: Replace pass_insert_endbranch with pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (ix86_output_patchable_area): New. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386.c (ix86_asm_output_function_label): Set function_label_emitted to true. (ix86_print_patchable_function_entry): New function. (ix86_output_patchable_area): Likewise. (x86_function_profiler): Replace endbr_queued_at_entrance with insn_queued_at_entrance. Generate ENDBR only for TYPE_ENDBR. Call ix86_output_patchable_area to generate patchable area if needed. (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): New. * config/i386/i386.h (queued_insn_type): New. (machine_function): Add function_label_emitted. Replace endbr_queued_at_entrance with insn_queued_at_entrance. * config/i386/i386.md (UNSPECV_PATCHABLE_AREA): New. (patchable_area): New. gcc/testsuite/ PR target/93492 * gcc.target/i386/pr93492-1.c: New test. * gcc.target/i386/pr93492-2.c: Likewise. * gcc.target/i386/pr93492-3.c: Likewise. * gcc.target/i386/pr93492-4.c: Likewise. * gcc.target/i386/pr93492-5.c: Likewise.
2020-02-03 19:22:57 +01:00
static void
rest_of_insert_endbr_and_patchable_area (bool need_endbr,
unsigned int patchable_area_size)
{
x86: Add UNSPECV_PATCHABLE_AREA Currently patchable area is at the wrong place. It is placed immediately after function label, before both .cfi_startproc and ENDBR. This patch adds UNSPECV_PATCHABLE_AREA for pseudo patchable area instruction and changes ENDBR insertion pass to also insert patchable area instruction. TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY is defined to avoid placing patchable area before .cfi_startproc and ENDBR. gcc/ PR target/93492 * config/i386/i386-features.c (rest_of_insert_endbranch): Renamed to ... (rest_of_insert_endbr_and_patchable_area): Change return type to void. Add need_endbr and patchable_area_size arguments. Don't call timevar_push nor timevar_pop. Replace endbr_queued_at_entrance with insn_queued_at_entrance. Insert UNSPECV_PATCHABLE_AREA for patchable area. (pass_data_insert_endbranch): Renamed to ... (pass_data_insert_endbr_and_patchable_area): This. Change pass name to endbr_and_patchable_area. (pass_insert_endbranch): Renamed to ... (pass_insert_endbr_and_patchable_area): This. Add need_endbr and patchable_area_size;. (pass_insert_endbr_and_patchable_area::gate): Set and check need_endbr and patchable_area_size. (pass_insert_endbr_and_patchable_area::execute): Call timevar_push and timevar_pop. Pass need_endbr and patchable_area_size to rest_of_insert_endbr_and_patchable_area. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386-passes.def: Replace pass_insert_endbranch with pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (ix86_output_patchable_area): New. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386.c (ix86_asm_output_function_label): Set function_label_emitted to true. (ix86_print_patchable_function_entry): New function. (ix86_output_patchable_area): Likewise. (x86_function_profiler): Replace endbr_queued_at_entrance with insn_queued_at_entrance. Generate ENDBR only for TYPE_ENDBR. Call ix86_output_patchable_area to generate patchable area if needed. (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): New. * config/i386/i386.h (queued_insn_type): New. (machine_function): Add function_label_emitted. Replace endbr_queued_at_entrance with insn_queued_at_entrance. * config/i386/i386.md (UNSPECV_PATCHABLE_AREA): New. (patchable_area): New. gcc/testsuite/ PR target/93492 * gcc.target/i386/pr93492-1.c: New test. * gcc.target/i386/pr93492-2.c: Likewise. * gcc.target/i386/pr93492-3.c: Likewise. * gcc.target/i386/pr93492-4.c: Likewise. * gcc.target/i386/pr93492-5.c: Likewise.
2020-02-03 19:22:57 +01:00
rtx endbr;
rtx_insn *insn;
x86: Add UNSPECV_PATCHABLE_AREA Currently patchable area is at the wrong place. It is placed immediately after function label, before both .cfi_startproc and ENDBR. This patch adds UNSPECV_PATCHABLE_AREA for pseudo patchable area instruction and changes ENDBR insertion pass to also insert patchable area instruction. TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY is defined to avoid placing patchable area before .cfi_startproc and ENDBR. gcc/ PR target/93492 * config/i386/i386-features.c (rest_of_insert_endbranch): Renamed to ... (rest_of_insert_endbr_and_patchable_area): Change return type to void. Add need_endbr and patchable_area_size arguments. Don't call timevar_push nor timevar_pop. Replace endbr_queued_at_entrance with insn_queued_at_entrance. Insert UNSPECV_PATCHABLE_AREA for patchable area. (pass_data_insert_endbranch): Renamed to ... (pass_data_insert_endbr_and_patchable_area): This. Change pass name to endbr_and_patchable_area. (pass_insert_endbranch): Renamed to ... (pass_insert_endbr_and_patchable_area): This. Add need_endbr and patchable_area_size;. (pass_insert_endbr_and_patchable_area::gate): Set and check need_endbr and patchable_area_size. (pass_insert_endbr_and_patchable_area::execute): Call timevar_push and timevar_pop. Pass need_endbr and patchable_area_size to rest_of_insert_endbr_and_patchable_area. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386-passes.def: Replace pass_insert_endbranch with pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (ix86_output_patchable_area): New. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386.c (ix86_asm_output_function_label): Set function_label_emitted to true. (ix86_print_patchable_function_entry): New function. (ix86_output_patchable_area): Likewise. (x86_function_profiler): Replace endbr_queued_at_entrance with insn_queued_at_entrance. Generate ENDBR only for TYPE_ENDBR. Call ix86_output_patchable_area to generate patchable area if needed. (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): New. * config/i386/i386.h (queued_insn_type): New. (machine_function): Add function_label_emitted. Replace endbr_queued_at_entrance with insn_queued_at_entrance. * config/i386/i386.md (UNSPECV_PATCHABLE_AREA): New. (patchable_area): New. gcc/testsuite/ PR target/93492 * gcc.target/i386/pr93492-1.c: New test. * gcc.target/i386/pr93492-2.c: Likewise. * gcc.target/i386/pr93492-3.c: Likewise. * gcc.target/i386/pr93492-4.c: Likewise. * gcc.target/i386/pr93492-5.c: Likewise.
2020-02-03 19:22:57 +01:00
rtx_insn *endbr_insn = NULL;
basic_block bb;
x86: Add UNSPECV_PATCHABLE_AREA Currently patchable area is at the wrong place. It is placed immediately after function label, before both .cfi_startproc and ENDBR. This patch adds UNSPECV_PATCHABLE_AREA for pseudo patchable area instruction and changes ENDBR insertion pass to also insert patchable area instruction. TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY is defined to avoid placing patchable area before .cfi_startproc and ENDBR. gcc/ PR target/93492 * config/i386/i386-features.c (rest_of_insert_endbranch): Renamed to ... (rest_of_insert_endbr_and_patchable_area): Change return type to void. Add need_endbr and patchable_area_size arguments. Don't call timevar_push nor timevar_pop. Replace endbr_queued_at_entrance with insn_queued_at_entrance. Insert UNSPECV_PATCHABLE_AREA for patchable area. (pass_data_insert_endbranch): Renamed to ... (pass_data_insert_endbr_and_patchable_area): This. Change pass name to endbr_and_patchable_area. (pass_insert_endbranch): Renamed to ... (pass_insert_endbr_and_patchable_area): This. Add need_endbr and patchable_area_size;. (pass_insert_endbr_and_patchable_area::gate): Set and check need_endbr and patchable_area_size. (pass_insert_endbr_and_patchable_area::execute): Call timevar_push and timevar_pop. Pass need_endbr and patchable_area_size to rest_of_insert_endbr_and_patchable_area. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386-passes.def: Replace pass_insert_endbranch with pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (ix86_output_patchable_area): New. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386.c (ix86_asm_output_function_label): Set function_label_emitted to true. (ix86_print_patchable_function_entry): New function. (ix86_output_patchable_area): Likewise. (x86_function_profiler): Replace endbr_queued_at_entrance with insn_queued_at_entrance. Generate ENDBR only for TYPE_ENDBR. Call ix86_output_patchable_area to generate patchable area if needed. (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): New. * config/i386/i386.h (queued_insn_type): New. (machine_function): Add function_label_emitted. Replace endbr_queued_at_entrance with insn_queued_at_entrance. * config/i386/i386.md (UNSPECV_PATCHABLE_AREA): New. (patchable_area): New. gcc/testsuite/ PR target/93492 * gcc.target/i386/pr93492-1.c: New test. * gcc.target/i386/pr93492-2.c: Likewise. * gcc.target/i386/pr93492-3.c: Likewise. * gcc.target/i386/pr93492-4.c: Likewise. * gcc.target/i386/pr93492-5.c: Likewise.
2020-02-03 19:22:57 +01:00
if (need_endbr)
{
/* Currently emit EB if it's a tracking function, i.e. 'nocf_check'
is absent among function attributes. Later an optimization will
be introduced to make analysis if an address of a static function
is taken. A static function whose address is not taken will get
a nocf_check attribute. This will allow to reduce the number of
EB. */
if (!lookup_attribute ("nocf_check",
TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
&& (!flag_manual_endbr
|| lookup_attribute ("cf_check",
DECL_ATTRIBUTES (cfun->decl)))
&& (!cgraph_node::get (cfun->decl)->only_called_directly_p ()
|| ix86_cmodel == CM_LARGE
|| ix86_cmodel == CM_LARGE_PIC
|| flag_force_indirect_call
|| (TARGET_DLLIMPORT_DECL_ATTRIBUTES
&& DECL_DLLIMPORT_P (cfun->decl))))
{
if (crtl->profile && flag_fentry)
{
/* Queue ENDBR insertion to x86_function_profiler.
NB: Any patchable-area insn will be inserted after
ENDBR. */
cfun->machine->insn_queued_at_entrance = TYPE_ENDBR;
}
else
{
endbr = gen_nop_endbr ();
bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
rtx_insn *insn = BB_HEAD (bb);
endbr_insn = emit_insn_before (endbr, insn);
}
}
}
if (patchable_area_size)
{
if (crtl->profile && flag_fentry)
x86: Add UNSPECV_PATCHABLE_AREA Currently patchable area is at the wrong place. It is placed immediately after function label, before both .cfi_startproc and ENDBR. This patch adds UNSPECV_PATCHABLE_AREA for pseudo patchable area instruction and changes ENDBR insertion pass to also insert patchable area instruction. TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY is defined to avoid placing patchable area before .cfi_startproc and ENDBR. gcc/ PR target/93492 * config/i386/i386-features.c (rest_of_insert_endbranch): Renamed to ... (rest_of_insert_endbr_and_patchable_area): Change return type to void. Add need_endbr and patchable_area_size arguments. Don't call timevar_push nor timevar_pop. Replace endbr_queued_at_entrance with insn_queued_at_entrance. Insert UNSPECV_PATCHABLE_AREA for patchable area. (pass_data_insert_endbranch): Renamed to ... (pass_data_insert_endbr_and_patchable_area): This. Change pass name to endbr_and_patchable_area. (pass_insert_endbranch): Renamed to ... (pass_insert_endbr_and_patchable_area): This. Add need_endbr and patchable_area_size;. (pass_insert_endbr_and_patchable_area::gate): Set and check need_endbr and patchable_area_size. (pass_insert_endbr_and_patchable_area::execute): Call timevar_push and timevar_pop. Pass need_endbr and patchable_area_size to rest_of_insert_endbr_and_patchable_area. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386-passes.def: Replace pass_insert_endbranch with pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (ix86_output_patchable_area): New. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386.c (ix86_asm_output_function_label): Set function_label_emitted to true. (ix86_print_patchable_function_entry): New function. (ix86_output_patchable_area): Likewise. (x86_function_profiler): Replace endbr_queued_at_entrance with insn_queued_at_entrance. Generate ENDBR only for TYPE_ENDBR. Call ix86_output_patchable_area to generate patchable area if needed. (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): New. * config/i386/i386.h (queued_insn_type): New. (machine_function): Add function_label_emitted. Replace endbr_queued_at_entrance with insn_queued_at_entrance. * config/i386/i386.md (UNSPECV_PATCHABLE_AREA): New. (patchable_area): New. gcc/testsuite/ PR target/93492 * gcc.target/i386/pr93492-1.c: New test. * gcc.target/i386/pr93492-2.c: Likewise. * gcc.target/i386/pr93492-3.c: Likewise. * gcc.target/i386/pr93492-4.c: Likewise. * gcc.target/i386/pr93492-5.c: Likewise.
2020-02-03 19:22:57 +01:00
{
/* Queue patchable-area insertion to x86_function_profiler.
NB: If there is a queued ENDBR, x86_function_profiler
will also handle patchable-area. */
if (!cfun->machine->insn_queued_at_entrance)
cfun->machine->insn_queued_at_entrance = TYPE_PATCHABLE_AREA;
}
else
{
x86: Add UNSPECV_PATCHABLE_AREA Currently patchable area is at the wrong place. It is placed immediately after function label, before both .cfi_startproc and ENDBR. This patch adds UNSPECV_PATCHABLE_AREA for pseudo patchable area instruction and changes ENDBR insertion pass to also insert patchable area instruction. TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY is defined to avoid placing patchable area before .cfi_startproc and ENDBR. gcc/ PR target/93492 * config/i386/i386-features.c (rest_of_insert_endbranch): Renamed to ... (rest_of_insert_endbr_and_patchable_area): Change return type to void. Add need_endbr and patchable_area_size arguments. Don't call timevar_push nor timevar_pop. Replace endbr_queued_at_entrance with insn_queued_at_entrance. Insert UNSPECV_PATCHABLE_AREA for patchable area. (pass_data_insert_endbranch): Renamed to ... (pass_data_insert_endbr_and_patchable_area): This. Change pass name to endbr_and_patchable_area. (pass_insert_endbranch): Renamed to ... (pass_insert_endbr_and_patchable_area): This. Add need_endbr and patchable_area_size;. (pass_insert_endbr_and_patchable_area::gate): Set and check need_endbr and patchable_area_size. (pass_insert_endbr_and_patchable_area::execute): Call timevar_push and timevar_pop. Pass need_endbr and patchable_area_size to rest_of_insert_endbr_and_patchable_area. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386-passes.def: Replace pass_insert_endbranch with pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (ix86_output_patchable_area): New. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386.c (ix86_asm_output_function_label): Set function_label_emitted to true. (ix86_print_patchable_function_entry): New function. (ix86_output_patchable_area): Likewise. (x86_function_profiler): Replace endbr_queued_at_entrance with insn_queued_at_entrance. Generate ENDBR only for TYPE_ENDBR. Call ix86_output_patchable_area to generate patchable area if needed. (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): New. * config/i386/i386.h (queued_insn_type): New. (machine_function): Add function_label_emitted. Replace endbr_queued_at_entrance with insn_queued_at_entrance. * config/i386/i386.md (UNSPECV_PATCHABLE_AREA): New. (patchable_area): New. gcc/testsuite/ PR target/93492 * gcc.target/i386/pr93492-1.c: New test. * gcc.target/i386/pr93492-2.c: Likewise. * gcc.target/i386/pr93492-3.c: Likewise. * gcc.target/i386/pr93492-4.c: Likewise. * gcc.target/i386/pr93492-5.c: Likewise.
2020-02-03 19:22:57 +01:00
rtx patchable_area
= gen_patchable_area (GEN_INT (patchable_area_size),
GEN_INT (crtl->patch_area_entry == 0));
if (endbr_insn)
emit_insn_after (patchable_area, endbr_insn);
else
{
bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
insn = BB_HEAD (bb);
emit_insn_before (patchable_area, insn);
}
}
}
x86: Add UNSPECV_PATCHABLE_AREA Currently patchable area is at the wrong place. It is placed immediately after function label, before both .cfi_startproc and ENDBR. This patch adds UNSPECV_PATCHABLE_AREA for pseudo patchable area instruction and changes ENDBR insertion pass to also insert patchable area instruction. TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY is defined to avoid placing patchable area before .cfi_startproc and ENDBR. gcc/ PR target/93492 * config/i386/i386-features.c (rest_of_insert_endbranch): Renamed to ... (rest_of_insert_endbr_and_patchable_area): Change return type to void. Add need_endbr and patchable_area_size arguments. Don't call timevar_push nor timevar_pop. Replace endbr_queued_at_entrance with insn_queued_at_entrance. Insert UNSPECV_PATCHABLE_AREA for patchable area. (pass_data_insert_endbranch): Renamed to ... (pass_data_insert_endbr_and_patchable_area): This. Change pass name to endbr_and_patchable_area. (pass_insert_endbranch): Renamed to ... (pass_insert_endbr_and_patchable_area): This. Add need_endbr and patchable_area_size;. (pass_insert_endbr_and_patchable_area::gate): Set and check need_endbr and patchable_area_size. (pass_insert_endbr_and_patchable_area::execute): Call timevar_push and timevar_pop. Pass need_endbr and patchable_area_size to rest_of_insert_endbr_and_patchable_area. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386-passes.def: Replace pass_insert_endbranch with pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (ix86_output_patchable_area): New. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386.c (ix86_asm_output_function_label): Set function_label_emitted to true. (ix86_print_patchable_function_entry): New function. (ix86_output_patchable_area): Likewise. (x86_function_profiler): Replace endbr_queued_at_entrance with insn_queued_at_entrance. Generate ENDBR only for TYPE_ENDBR. Call ix86_output_patchable_area to generate patchable area if needed. (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): New. * config/i386/i386.h (queued_insn_type): New. (machine_function): Add function_label_emitted. Replace endbr_queued_at_entrance with insn_queued_at_entrance. * config/i386/i386.md (UNSPECV_PATCHABLE_AREA): New. (patchable_area): New. gcc/testsuite/ PR target/93492 * gcc.target/i386/pr93492-1.c: New test. * gcc.target/i386/pr93492-2.c: Likewise. * gcc.target/i386/pr93492-3.c: Likewise. * gcc.target/i386/pr93492-4.c: Likewise. * gcc.target/i386/pr93492-5.c: Likewise.
2020-02-03 19:22:57 +01:00
if (!need_endbr)
return;
bb = 0;
FOR_EACH_BB_FN (bb, cfun)
{
for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
insn = NEXT_INSN (insn))
{
if (CALL_P (insn))
{
need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL;
if (!need_endbr && !SIBLING_CALL_P (insn))
{
rtx call = get_call_rtx_from (insn);
rtx fnaddr = XEXP (call, 0);
tree fndecl = NULL_TREE;
/* Also generate ENDBRANCH for non-tail call which
may return via indirect branch. */
if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
if (fndecl == NULL_TREE)
fndecl = MEM_EXPR (fnaddr);
if (fndecl
&& TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE
&& TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE)
fndecl = NULL_TREE;
if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl)))
{
tree fntype = TREE_TYPE (fndecl);
if (lookup_attribute ("indirect_return",
TYPE_ATTRIBUTES (fntype)))
need_endbr = true;
}
}
if (!need_endbr)
continue;
/* Generate ENDBRANCH after CALL, which can return more than
twice, setjmp-like functions. */
x86: Add UNSPECV_PATCHABLE_AREA Currently patchable area is at the wrong place. It is placed immediately after function label, before both .cfi_startproc and ENDBR. This patch adds UNSPECV_PATCHABLE_AREA for pseudo patchable area instruction and changes ENDBR insertion pass to also insert patchable area instruction. TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY is defined to avoid placing patchable area before .cfi_startproc and ENDBR. gcc/ PR target/93492 * config/i386/i386-features.c (rest_of_insert_endbranch): Renamed to ... (rest_of_insert_endbr_and_patchable_area): Change return type to void. Add need_endbr and patchable_area_size arguments. Don't call timevar_push nor timevar_pop. Replace endbr_queued_at_entrance with insn_queued_at_entrance. Insert UNSPECV_PATCHABLE_AREA for patchable area. (pass_data_insert_endbranch): Renamed to ... (pass_data_insert_endbr_and_patchable_area): This. Change pass name to endbr_and_patchable_area. (pass_insert_endbranch): Renamed to ... (pass_insert_endbr_and_patchable_area): This. Add need_endbr and patchable_area_size;. (pass_insert_endbr_and_patchable_area::gate): Set and check need_endbr and patchable_area_size. (pass_insert_endbr_and_patchable_area::execute): Call timevar_push and timevar_pop. Pass need_endbr and patchable_area_size to rest_of_insert_endbr_and_patchable_area. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386-passes.def: Replace pass_insert_endbranch with pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (ix86_output_patchable_area): New. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386.c (ix86_asm_output_function_label): Set function_label_emitted to true. (ix86_print_patchable_function_entry): New function. (ix86_output_patchable_area): Likewise. (x86_function_profiler): Replace endbr_queued_at_entrance with insn_queued_at_entrance. Generate ENDBR only for TYPE_ENDBR. Call ix86_output_patchable_area to generate patchable area if needed. (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): New. * config/i386/i386.h (queued_insn_type): New. (machine_function): Add function_label_emitted. Replace endbr_queued_at_entrance with insn_queued_at_entrance. * config/i386/i386.md (UNSPECV_PATCHABLE_AREA): New. (patchable_area): New. gcc/testsuite/ PR target/93492 * gcc.target/i386/pr93492-1.c: New test. * gcc.target/i386/pr93492-2.c: Likewise. * gcc.target/i386/pr93492-3.c: Likewise. * gcc.target/i386/pr93492-4.c: Likewise. * gcc.target/i386/pr93492-5.c: Likewise.
2020-02-03 19:22:57 +01:00
endbr = gen_nop_endbr ();
emit_insn_after_setloc (endbr, insn, INSN_LOCATION (insn));
continue;
}
if (JUMP_P (insn) && flag_cet_switch)
{
rtx target = JUMP_LABEL (insn);
if (target == NULL_RTX || ANY_RETURN_P (target))
continue;
/* Check the jump is a switch table. */
rtx_insn *label = as_a<rtx_insn *> (target);
rtx_insn *table = next_insn (label);
if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
continue;
/* For the indirect jump find out all places it jumps and insert
ENDBRANCH there. It should be done under a special flag to
control ENDBRANCH generation for switch stmts. */
edge_iterator ei;
edge e;
basic_block dest_blk;
FOR_EACH_EDGE (e, ei, bb->succs)
{
rtx_insn *insn;
dest_blk = e->dest;
insn = BB_HEAD (dest_blk);
gcc_assert (LABEL_P (insn));
x86: Add UNSPECV_PATCHABLE_AREA Currently patchable area is at the wrong place. It is placed immediately after function label, before both .cfi_startproc and ENDBR. This patch adds UNSPECV_PATCHABLE_AREA for pseudo patchable area instruction and changes ENDBR insertion pass to also insert patchable area instruction. TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY is defined to avoid placing patchable area before .cfi_startproc and ENDBR. gcc/ PR target/93492 * config/i386/i386-features.c (rest_of_insert_endbranch): Renamed to ... (rest_of_insert_endbr_and_patchable_area): Change return type to void. Add need_endbr and patchable_area_size arguments. Don't call timevar_push nor timevar_pop. Replace endbr_queued_at_entrance with insn_queued_at_entrance. Insert UNSPECV_PATCHABLE_AREA for patchable area. (pass_data_insert_endbranch): Renamed to ... (pass_data_insert_endbr_and_patchable_area): This. Change pass name to endbr_and_patchable_area. (pass_insert_endbranch): Renamed to ... (pass_insert_endbr_and_patchable_area): This. Add need_endbr and patchable_area_size;. (pass_insert_endbr_and_patchable_area::gate): Set and check need_endbr and patchable_area_size. (pass_insert_endbr_and_patchable_area::execute): Call timevar_push and timevar_pop. Pass need_endbr and patchable_area_size to rest_of_insert_endbr_and_patchable_area. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386-passes.def: Replace pass_insert_endbranch with pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (ix86_output_patchable_area): New. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386.c (ix86_asm_output_function_label): Set function_label_emitted to true. (ix86_print_patchable_function_entry): New function. (ix86_output_patchable_area): Likewise. (x86_function_profiler): Replace endbr_queued_at_entrance with insn_queued_at_entrance. Generate ENDBR only for TYPE_ENDBR. Call ix86_output_patchable_area to generate patchable area if needed. (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): New. * config/i386/i386.h (queued_insn_type): New. (machine_function): Add function_label_emitted. Replace endbr_queued_at_entrance with insn_queued_at_entrance. * config/i386/i386.md (UNSPECV_PATCHABLE_AREA): New. (patchable_area): New. gcc/testsuite/ PR target/93492 * gcc.target/i386/pr93492-1.c: New test. * gcc.target/i386/pr93492-2.c: Likewise. * gcc.target/i386/pr93492-3.c: Likewise. * gcc.target/i386/pr93492-4.c: Likewise. * gcc.target/i386/pr93492-5.c: Likewise.
2020-02-03 19:22:57 +01:00
endbr = gen_nop_endbr ();
emit_insn_after (endbr, insn);
}
continue;
}
if (LABEL_P (insn) && LABEL_PRESERVE_P (insn))
{
x86: Add UNSPECV_PATCHABLE_AREA Currently patchable area is at the wrong place. It is placed immediately after function label, before both .cfi_startproc and ENDBR. This patch adds UNSPECV_PATCHABLE_AREA for pseudo patchable area instruction and changes ENDBR insertion pass to also insert patchable area instruction. TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY is defined to avoid placing patchable area before .cfi_startproc and ENDBR. gcc/ PR target/93492 * config/i386/i386-features.c (rest_of_insert_endbranch): Renamed to ... (rest_of_insert_endbr_and_patchable_area): Change return type to void. Add need_endbr and patchable_area_size arguments. Don't call timevar_push nor timevar_pop. Replace endbr_queued_at_entrance with insn_queued_at_entrance. Insert UNSPECV_PATCHABLE_AREA for patchable area. (pass_data_insert_endbranch): Renamed to ... (pass_data_insert_endbr_and_patchable_area): This. Change pass name to endbr_and_patchable_area. (pass_insert_endbranch): Renamed to ... (pass_insert_endbr_and_patchable_area): This. Add need_endbr and patchable_area_size;. (pass_insert_endbr_and_patchable_area::gate): Set and check need_endbr and patchable_area_size. (pass_insert_endbr_and_patchable_area::execute): Call timevar_push and timevar_pop. Pass need_endbr and patchable_area_size to rest_of_insert_endbr_and_patchable_area. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386-passes.def: Replace pass_insert_endbranch with pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (ix86_output_patchable_area): New. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386.c (ix86_asm_output_function_label): Set function_label_emitted to true. (ix86_print_patchable_function_entry): New function. (ix86_output_patchable_area): Likewise. (x86_function_profiler): Replace endbr_queued_at_entrance with insn_queued_at_entrance. Generate ENDBR only for TYPE_ENDBR. Call ix86_output_patchable_area to generate patchable area if needed. (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): New. * config/i386/i386.h (queued_insn_type): New. (machine_function): Add function_label_emitted. Replace endbr_queued_at_entrance with insn_queued_at_entrance. * config/i386/i386.md (UNSPECV_PATCHABLE_AREA): New. (patchable_area): New. gcc/testsuite/ PR target/93492 * gcc.target/i386/pr93492-1.c: New test. * gcc.target/i386/pr93492-2.c: Likewise. * gcc.target/i386/pr93492-3.c: Likewise. * gcc.target/i386/pr93492-4.c: Likewise. * gcc.target/i386/pr93492-5.c: Likewise.
2020-02-03 19:22:57 +01:00
endbr = gen_nop_endbr ();
emit_insn_after (endbr, insn);
continue;
}
}
}
x86: Add UNSPECV_PATCHABLE_AREA Currently patchable area is at the wrong place. It is placed immediately after function label, before both .cfi_startproc and ENDBR. This patch adds UNSPECV_PATCHABLE_AREA for pseudo patchable area instruction and changes ENDBR insertion pass to also insert patchable area instruction. TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY is defined to avoid placing patchable area before .cfi_startproc and ENDBR. gcc/ PR target/93492 * config/i386/i386-features.c (rest_of_insert_endbranch): Renamed to ... (rest_of_insert_endbr_and_patchable_area): Change return type to void. Add need_endbr and patchable_area_size arguments. Don't call timevar_push nor timevar_pop. Replace endbr_queued_at_entrance with insn_queued_at_entrance. Insert UNSPECV_PATCHABLE_AREA for patchable area. (pass_data_insert_endbranch): Renamed to ... (pass_data_insert_endbr_and_patchable_area): This. Change pass name to endbr_and_patchable_area. (pass_insert_endbranch): Renamed to ... (pass_insert_endbr_and_patchable_area): This. Add need_endbr and patchable_area_size;. (pass_insert_endbr_and_patchable_area::gate): Set and check need_endbr and patchable_area_size. (pass_insert_endbr_and_patchable_area::execute): Call timevar_push and timevar_pop. Pass need_endbr and patchable_area_size to rest_of_insert_endbr_and_patchable_area. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386-passes.def: Replace pass_insert_endbranch with pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (ix86_output_patchable_area): New. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386.c (ix86_asm_output_function_label): Set function_label_emitted to true. (ix86_print_patchable_function_entry): New function. (ix86_output_patchable_area): Likewise. (x86_function_profiler): Replace endbr_queued_at_entrance with insn_queued_at_entrance. Generate ENDBR only for TYPE_ENDBR. Call ix86_output_patchable_area to generate patchable area if needed. (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): New. * config/i386/i386.h (queued_insn_type): New. (machine_function): Add function_label_emitted. Replace endbr_queued_at_entrance with insn_queued_at_entrance. * config/i386/i386.md (UNSPECV_PATCHABLE_AREA): New. (patchable_area): New. gcc/testsuite/ PR target/93492 * gcc.target/i386/pr93492-1.c: New test. * gcc.target/i386/pr93492-2.c: Likewise. * gcc.target/i386/pr93492-3.c: Likewise. * gcc.target/i386/pr93492-4.c: Likewise. * gcc.target/i386/pr93492-5.c: Likewise.
2020-02-03 19:22:57 +01:00
return;
}
namespace {
x86: Add UNSPECV_PATCHABLE_AREA Currently patchable area is at the wrong place. It is placed immediately after function label, before both .cfi_startproc and ENDBR. This patch adds UNSPECV_PATCHABLE_AREA for pseudo patchable area instruction and changes ENDBR insertion pass to also insert patchable area instruction. TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY is defined to avoid placing patchable area before .cfi_startproc and ENDBR. gcc/ PR target/93492 * config/i386/i386-features.c (rest_of_insert_endbranch): Renamed to ... (rest_of_insert_endbr_and_patchable_area): Change return type to void. Add need_endbr and patchable_area_size arguments. Don't call timevar_push nor timevar_pop. Replace endbr_queued_at_entrance with insn_queued_at_entrance. Insert UNSPECV_PATCHABLE_AREA for patchable area. (pass_data_insert_endbranch): Renamed to ... (pass_data_insert_endbr_and_patchable_area): This. Change pass name to endbr_and_patchable_area. (pass_insert_endbranch): Renamed to ... (pass_insert_endbr_and_patchable_area): This. Add need_endbr and patchable_area_size;. (pass_insert_endbr_and_patchable_area::gate): Set and check need_endbr and patchable_area_size. (pass_insert_endbr_and_patchable_area::execute): Call timevar_push and timevar_pop. Pass need_endbr and patchable_area_size to rest_of_insert_endbr_and_patchable_area. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386-passes.def: Replace pass_insert_endbranch with pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (ix86_output_patchable_area): New. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386.c (ix86_asm_output_function_label): Set function_label_emitted to true. (ix86_print_patchable_function_entry): New function. (ix86_output_patchable_area): Likewise. (x86_function_profiler): Replace endbr_queued_at_entrance with insn_queued_at_entrance. Generate ENDBR only for TYPE_ENDBR. Call ix86_output_patchable_area to generate patchable area if needed. (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): New. * config/i386/i386.h (queued_insn_type): New. (machine_function): Add function_label_emitted. Replace endbr_queued_at_entrance with insn_queued_at_entrance. * config/i386/i386.md (UNSPECV_PATCHABLE_AREA): New. (patchable_area): New. gcc/testsuite/ PR target/93492 * gcc.target/i386/pr93492-1.c: New test. * gcc.target/i386/pr93492-2.c: Likewise. * gcc.target/i386/pr93492-3.c: Likewise. * gcc.target/i386/pr93492-4.c: Likewise. * gcc.target/i386/pr93492-5.c: Likewise.
2020-02-03 19:22:57 +01:00
const pass_data pass_data_insert_endbr_and_patchable_area =
{
RTL_PASS, /* type. */
x86: Add UNSPECV_PATCHABLE_AREA Currently patchable area is at the wrong place. It is placed immediately after function label, before both .cfi_startproc and ENDBR. This patch adds UNSPECV_PATCHABLE_AREA for pseudo patchable area instruction and changes ENDBR insertion pass to also insert patchable area instruction. TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY is defined to avoid placing patchable area before .cfi_startproc and ENDBR. gcc/ PR target/93492 * config/i386/i386-features.c (rest_of_insert_endbranch): Renamed to ... (rest_of_insert_endbr_and_patchable_area): Change return type to void. Add need_endbr and patchable_area_size arguments. Don't call timevar_push nor timevar_pop. Replace endbr_queued_at_entrance with insn_queued_at_entrance. Insert UNSPECV_PATCHABLE_AREA for patchable area. (pass_data_insert_endbranch): Renamed to ... (pass_data_insert_endbr_and_patchable_area): This. Change pass name to endbr_and_patchable_area. (pass_insert_endbranch): Renamed to ... (pass_insert_endbr_and_patchable_area): This. Add need_endbr and patchable_area_size;. (pass_insert_endbr_and_patchable_area::gate): Set and check need_endbr and patchable_area_size. (pass_insert_endbr_and_patchable_area::execute): Call timevar_push and timevar_pop. Pass need_endbr and patchable_area_size to rest_of_insert_endbr_and_patchable_area. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386-passes.def: Replace pass_insert_endbranch with pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (ix86_output_patchable_area): New. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386.c (ix86_asm_output_function_label): Set function_label_emitted to true. (ix86_print_patchable_function_entry): New function. (ix86_output_patchable_area): Likewise. (x86_function_profiler): Replace endbr_queued_at_entrance with insn_queued_at_entrance. Generate ENDBR only for TYPE_ENDBR. Call ix86_output_patchable_area to generate patchable area if needed. (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): New. * config/i386/i386.h (queued_insn_type): New. (machine_function): Add function_label_emitted. Replace endbr_queued_at_entrance with insn_queued_at_entrance. * config/i386/i386.md (UNSPECV_PATCHABLE_AREA): New. (patchable_area): New. gcc/testsuite/ PR target/93492 * gcc.target/i386/pr93492-1.c: New test. * gcc.target/i386/pr93492-2.c: Likewise. * gcc.target/i386/pr93492-3.c: Likewise. * gcc.target/i386/pr93492-4.c: Likewise. * gcc.target/i386/pr93492-5.c: Likewise.
2020-02-03 19:22:57 +01:00
"endbr_and_patchable_area", /* name. */
OPTGROUP_NONE, /* optinfo_flags. */
TV_MACH_DEP, /* tv_id. */
0, /* properties_required. */
0, /* properties_provided. */
0, /* properties_destroyed. */
0, /* todo_flags_start. */
0, /* todo_flags_finish. */
};
x86: Add UNSPECV_PATCHABLE_AREA Currently patchable area is at the wrong place. It is placed immediately after function label, before both .cfi_startproc and ENDBR. This patch adds UNSPECV_PATCHABLE_AREA for pseudo patchable area instruction and changes ENDBR insertion pass to also insert patchable area instruction. TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY is defined to avoid placing patchable area before .cfi_startproc and ENDBR. gcc/ PR target/93492 * config/i386/i386-features.c (rest_of_insert_endbranch): Renamed to ... (rest_of_insert_endbr_and_patchable_area): Change return type to void. Add need_endbr and patchable_area_size arguments. Don't call timevar_push nor timevar_pop. Replace endbr_queued_at_entrance with insn_queued_at_entrance. Insert UNSPECV_PATCHABLE_AREA for patchable area. (pass_data_insert_endbranch): Renamed to ... (pass_data_insert_endbr_and_patchable_area): This. Change pass name to endbr_and_patchable_area. (pass_insert_endbranch): Renamed to ... (pass_insert_endbr_and_patchable_area): This. Add need_endbr and patchable_area_size;. (pass_insert_endbr_and_patchable_area::gate): Set and check need_endbr and patchable_area_size. (pass_insert_endbr_and_patchable_area::execute): Call timevar_push and timevar_pop. Pass need_endbr and patchable_area_size to rest_of_insert_endbr_and_patchable_area. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386-passes.def: Replace pass_insert_endbranch with pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (ix86_output_patchable_area): New. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386.c (ix86_asm_output_function_label): Set function_label_emitted to true. (ix86_print_patchable_function_entry): New function. (ix86_output_patchable_area): Likewise. (x86_function_profiler): Replace endbr_queued_at_entrance with insn_queued_at_entrance. Generate ENDBR only for TYPE_ENDBR. Call ix86_output_patchable_area to generate patchable area if needed. (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): New. * config/i386/i386.h (queued_insn_type): New. (machine_function): Add function_label_emitted. Replace endbr_queued_at_entrance with insn_queued_at_entrance. * config/i386/i386.md (UNSPECV_PATCHABLE_AREA): New. (patchable_area): New. gcc/testsuite/ PR target/93492 * gcc.target/i386/pr93492-1.c: New test. * gcc.target/i386/pr93492-2.c: Likewise. * gcc.target/i386/pr93492-3.c: Likewise. * gcc.target/i386/pr93492-4.c: Likewise. * gcc.target/i386/pr93492-5.c: Likewise.
2020-02-03 19:22:57 +01:00
class pass_insert_endbr_and_patchable_area : public rtl_opt_pass
{
public:
x86: Add UNSPECV_PATCHABLE_AREA Currently patchable area is at the wrong place. It is placed immediately after function label, before both .cfi_startproc and ENDBR. This patch adds UNSPECV_PATCHABLE_AREA for pseudo patchable area instruction and changes ENDBR insertion pass to also insert patchable area instruction. TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY is defined to avoid placing patchable area before .cfi_startproc and ENDBR. gcc/ PR target/93492 * config/i386/i386-features.c (rest_of_insert_endbranch): Renamed to ... (rest_of_insert_endbr_and_patchable_area): Change return type to void. Add need_endbr and patchable_area_size arguments. Don't call timevar_push nor timevar_pop. Replace endbr_queued_at_entrance with insn_queued_at_entrance. Insert UNSPECV_PATCHABLE_AREA for patchable area. (pass_data_insert_endbranch): Renamed to ... (pass_data_insert_endbr_and_patchable_area): This. Change pass name to endbr_and_patchable_area. (pass_insert_endbranch): Renamed to ... (pass_insert_endbr_and_patchable_area): This. Add need_endbr and patchable_area_size;. (pass_insert_endbr_and_patchable_area::gate): Set and check need_endbr and patchable_area_size. (pass_insert_endbr_and_patchable_area::execute): Call timevar_push and timevar_pop. Pass need_endbr and patchable_area_size to rest_of_insert_endbr_and_patchable_area. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386-passes.def: Replace pass_insert_endbranch with pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (ix86_output_patchable_area): New. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386.c (ix86_asm_output_function_label): Set function_label_emitted to true. (ix86_print_patchable_function_entry): New function. (ix86_output_patchable_area): Likewise. (x86_function_profiler): Replace endbr_queued_at_entrance with insn_queued_at_entrance. Generate ENDBR only for TYPE_ENDBR. Call ix86_output_patchable_area to generate patchable area if needed. (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): New. * config/i386/i386.h (queued_insn_type): New. (machine_function): Add function_label_emitted. Replace endbr_queued_at_entrance with insn_queued_at_entrance. * config/i386/i386.md (UNSPECV_PATCHABLE_AREA): New. (patchable_area): New. gcc/testsuite/ PR target/93492 * gcc.target/i386/pr93492-1.c: New test. * gcc.target/i386/pr93492-2.c: Likewise. * gcc.target/i386/pr93492-3.c: Likewise. * gcc.target/i386/pr93492-4.c: Likewise. * gcc.target/i386/pr93492-5.c: Likewise.
2020-02-03 19:22:57 +01:00
pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
: rtl_opt_pass (pass_data_insert_endbr_and_patchable_area, ctxt)
{}
/* opt_pass methods: */
Add 'final' and 'override' to opt_pass vfunc impls gcc/ChangeLog: * adjust-alignment.cc: Add "final" and "override" to opt_pass vfunc implementations, removing redundant "virtual" as appropriate. * asan.cc: Likewise. * auto-inc-dec.cc: Likewise. * auto-profile.cc: Likewise. * bb-reorder.cc: Likewise. * cfgcleanup.cc: Likewise. * cfgexpand.cc: Likewise. * cfgrtl.cc: Likewise. * cgraphbuild.cc: Likewise. * combine-stack-adj.cc: Likewise. * combine.cc: Likewise. * compare-elim.cc: Likewise. * config/i386/i386-features.cc: Likewise. * coroutine-passes.cc: Likewise. * cprop.cc: Likewise. * cse.cc: Likewise. * dce.cc: Likewise. * df-core.cc: Likewise. * dse.cc: Likewise. * dwarf2cfi.cc: Likewise. * early-remat.cc: Likewise. * except.cc: Likewise. * final.cc: Likewise. * function.cc: Likewise. * fwprop.cc: Likewise. * gcse.cc: Likewise. * gimple-harden-conditionals.cc: Likewise. * gimple-if-to-switch.cc: Likewise. * gimple-isel.cc: Likewise. * gimple-laddress.cc: Likewise. * gimple-loop-interchange.cc: Likewise. * gimple-loop-jam.cc: Likewise. * gimple-loop-versioning.cc: Likewise. * gimple-low.cc: Likewise. * gimple-ssa-backprop.cc: Likewise. * gimple-ssa-evrp.cc: Likewise. * gimple-ssa-isolate-paths.cc: Likewise. * gimple-ssa-nonnull-compare.cc: Likewise. * gimple-ssa-split-paths.cc: Likewise. * gimple-ssa-store-merging.cc: Likewise. * gimple-ssa-strength-reduction.cc: Likewise. * gimple-ssa-warn-access.cc: Likewise. * gimple-ssa-warn-alloca.cc: Likewise. * gimple-ssa-warn-restrict.cc: Likewise. * gimple-warn-recursion.cc: Likewise. * graphite.cc: Likewise. * ifcvt.cc: Likewise. * init-regs.cc: Likewise. * ipa-comdats.cc: Likewise. * ipa-cp.cc: Likewise. * ipa-devirt.cc: Likewise. * ipa-fnsummary.cc: Likewise. * ipa-free-lang-data.cc: Likewise. * ipa-icf.cc: Likewise. * ipa-inline.cc: Likewise. * ipa-modref.cc: Likewise. * ipa-profile.cc: Likewise. * ipa-pure-const.cc: Likewise. * ipa-reference.cc: Likewise. * ipa-split.cc: Likewise. * ipa-sra.cc: Likewise. * ipa-visibility.cc: Likewise. * ipa.cc: Likewise. * ira.cc: Likewise. * jump.cc: Likewise. * loop-init.cc: Likewise. * lower-subreg.cc: Likewise. * mode-switching.cc: Likewise. * modulo-sched.cc: Likewise. * multiple_target.cc: Likewise. * omp-expand.cc: Likewise. * omp-low.cc: Likewise. * omp-oacc-kernels-decompose.cc: Likewise. * omp-oacc-neuter-broadcast.cc: Likewise. * omp-offload.cc: Likewise. * omp-simd-clone.cc: Likewise. * passes.cc: Likewise. * postreload-gcse.cc: Likewise. * postreload.cc: Likewise. * predict.cc: Likewise. * recog.cc: Likewise. * ree.cc: Likewise. * reg-stack.cc: Likewise. * regcprop.cc: Likewise. * reginfo.cc: Likewise. * regrename.cc: Likewise. * reorg.cc: Likewise. * sancov.cc: Likewise. * sanopt.cc: Likewise. * sched-rgn.cc: Likewise. * stack-ptr-mod.cc: Likewise. * store-motion.cc: Likewise. * tracer.cc: Likewise. * trans-mem.cc: Likewise. * tree-call-cdce.cc: Likewise. * tree-cfg.cc: Likewise. * tree-cfgcleanup.cc: Likewise. * tree-complex.cc: Likewise. * tree-eh.cc: Likewise. * tree-emutls.cc: Likewise. * tree-if-conv.cc: Likewise. * tree-into-ssa.cc: Likewise. * tree-loop-distribution.cc: Likewise. * tree-nrv.cc: Likewise. * tree-object-size.cc: Likewise. * tree-parloops.cc: Likewise. * tree-predcom.cc: Likewise. * tree-profile.cc: Likewise. * tree-sra.cc: Likewise. * tree-ssa-ccp.cc: Likewise. * tree-ssa-copy.cc: Likewise. * tree-ssa-dce.cc: Likewise. * tree-ssa-dom.cc: Likewise. * tree-ssa-dse.cc: Likewise. * tree-ssa-forwprop.cc: Likewise. * tree-ssa-ifcombine.cc: Likewise. * tree-ssa-loop-ch.cc: Likewise. * tree-ssa-loop-im.cc: Likewise. * tree-ssa-loop-ivcanon.cc: Likewise. * tree-ssa-loop-prefetch.cc: Likewise. * tree-ssa-loop-split.cc: Likewise. * tree-ssa-loop-unswitch.cc: Likewise. * tree-ssa-loop.cc: Likewise. * tree-ssa-math-opts.cc: Likewise. * tree-ssa-phiopt.cc: Likewise. * tree-ssa-phiprop.cc: Likewise. * tree-ssa-pre.cc: Likewise. * tree-ssa-reassoc.cc: Likewise. * tree-ssa-sccvn.cc: Likewise. * tree-ssa-sink.cc: Likewise. * tree-ssa-strlen.cc: Likewise. * tree-ssa-structalias.cc: Likewise. * tree-ssa-uncprop.cc: Likewise. * tree-ssa-uninit.cc: Likewise. * tree-ssanames.cc: Likewise. * tree-stdarg.cc: Likewise. * tree-switch-conversion.cc: Likewise. * tree-tailcall.cc: Likewise. * tree-vect-generic.cc: Likewise. * tree-vectorizer.cc: Likewise. * tree-vrp.cc: Likewise. * tsan.cc: Likewise. * ubsan.cc: Likewise. * var-tracking.cc: Likewise. * vtable-verify.cc: Likewise. * web.cc: Likewise. Signed-off-by: David Malcolm <dmalcolm@redhat.com>
2022-06-27 23:00:33 +02:00
bool gate (function *) final override
{
x86: Add UNSPECV_PATCHABLE_AREA Currently patchable area is at the wrong place. It is placed immediately after function label, before both .cfi_startproc and ENDBR. This patch adds UNSPECV_PATCHABLE_AREA for pseudo patchable area instruction and changes ENDBR insertion pass to also insert patchable area instruction. TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY is defined to avoid placing patchable area before .cfi_startproc and ENDBR. gcc/ PR target/93492 * config/i386/i386-features.c (rest_of_insert_endbranch): Renamed to ... (rest_of_insert_endbr_and_patchable_area): Change return type to void. Add need_endbr and patchable_area_size arguments. Don't call timevar_push nor timevar_pop. Replace endbr_queued_at_entrance with insn_queued_at_entrance. Insert UNSPECV_PATCHABLE_AREA for patchable area. (pass_data_insert_endbranch): Renamed to ... (pass_data_insert_endbr_and_patchable_area): This. Change pass name to endbr_and_patchable_area. (pass_insert_endbranch): Renamed to ... (pass_insert_endbr_and_patchable_area): This. Add need_endbr and patchable_area_size;. (pass_insert_endbr_and_patchable_area::gate): Set and check need_endbr and patchable_area_size. (pass_insert_endbr_and_patchable_area::execute): Call timevar_push and timevar_pop. Pass need_endbr and patchable_area_size to rest_of_insert_endbr_and_patchable_area. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386-passes.def: Replace pass_insert_endbranch with pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (ix86_output_patchable_area): New. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386.c (ix86_asm_output_function_label): Set function_label_emitted to true. (ix86_print_patchable_function_entry): New function. (ix86_output_patchable_area): Likewise. (x86_function_profiler): Replace endbr_queued_at_entrance with insn_queued_at_entrance. Generate ENDBR only for TYPE_ENDBR. Call ix86_output_patchable_area to generate patchable area if needed. (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): New. * config/i386/i386.h (queued_insn_type): New. (machine_function): Add function_label_emitted. Replace endbr_queued_at_entrance with insn_queued_at_entrance. * config/i386/i386.md (UNSPECV_PATCHABLE_AREA): New. (patchable_area): New. gcc/testsuite/ PR target/93492 * gcc.target/i386/pr93492-1.c: New test. * gcc.target/i386/pr93492-2.c: Likewise. * gcc.target/i386/pr93492-3.c: Likewise. * gcc.target/i386/pr93492-4.c: Likewise. * gcc.target/i386/pr93492-5.c: Likewise.
2020-02-03 19:22:57 +01:00
need_endbr = (flag_cf_protection & CF_BRANCH) != 0;
patchable_area_size = crtl->patch_area_size - crtl->patch_area_entry;
return need_endbr || patchable_area_size;
}
Add 'final' and 'override' to opt_pass vfunc impls gcc/ChangeLog: * adjust-alignment.cc: Add "final" and "override" to opt_pass vfunc implementations, removing redundant "virtual" as appropriate. * asan.cc: Likewise. * auto-inc-dec.cc: Likewise. * auto-profile.cc: Likewise. * bb-reorder.cc: Likewise. * cfgcleanup.cc: Likewise. * cfgexpand.cc: Likewise. * cfgrtl.cc: Likewise. * cgraphbuild.cc: Likewise. * combine-stack-adj.cc: Likewise. * combine.cc: Likewise. * compare-elim.cc: Likewise. * config/i386/i386-features.cc: Likewise. * coroutine-passes.cc: Likewise. * cprop.cc: Likewise. * cse.cc: Likewise. * dce.cc: Likewise. * df-core.cc: Likewise. * dse.cc: Likewise. * dwarf2cfi.cc: Likewise. * early-remat.cc: Likewise. * except.cc: Likewise. * final.cc: Likewise. * function.cc: Likewise. * fwprop.cc: Likewise. * gcse.cc: Likewise. * gimple-harden-conditionals.cc: Likewise. * gimple-if-to-switch.cc: Likewise. * gimple-isel.cc: Likewise. * gimple-laddress.cc: Likewise. * gimple-loop-interchange.cc: Likewise. * gimple-loop-jam.cc: Likewise. * gimple-loop-versioning.cc: Likewise. * gimple-low.cc: Likewise. * gimple-ssa-backprop.cc: Likewise. * gimple-ssa-evrp.cc: Likewise. * gimple-ssa-isolate-paths.cc: Likewise. * gimple-ssa-nonnull-compare.cc: Likewise. * gimple-ssa-split-paths.cc: Likewise. * gimple-ssa-store-merging.cc: Likewise. * gimple-ssa-strength-reduction.cc: Likewise. * gimple-ssa-warn-access.cc: Likewise. * gimple-ssa-warn-alloca.cc: Likewise. * gimple-ssa-warn-restrict.cc: Likewise. * gimple-warn-recursion.cc: Likewise. * graphite.cc: Likewise. * ifcvt.cc: Likewise. * init-regs.cc: Likewise. * ipa-comdats.cc: Likewise. * ipa-cp.cc: Likewise. * ipa-devirt.cc: Likewise. * ipa-fnsummary.cc: Likewise. * ipa-free-lang-data.cc: Likewise. * ipa-icf.cc: Likewise. * ipa-inline.cc: Likewise. * ipa-modref.cc: Likewise. * ipa-profile.cc: Likewise. * ipa-pure-const.cc: Likewise. * ipa-reference.cc: Likewise. * ipa-split.cc: Likewise. * ipa-sra.cc: Likewise. * ipa-visibility.cc: Likewise. * ipa.cc: Likewise. * ira.cc: Likewise. * jump.cc: Likewise. * loop-init.cc: Likewise. * lower-subreg.cc: Likewise. * mode-switching.cc: Likewise. * modulo-sched.cc: Likewise. * multiple_target.cc: Likewise. * omp-expand.cc: Likewise. * omp-low.cc: Likewise. * omp-oacc-kernels-decompose.cc: Likewise. * omp-oacc-neuter-broadcast.cc: Likewise. * omp-offload.cc: Likewise. * omp-simd-clone.cc: Likewise. * passes.cc: Likewise. * postreload-gcse.cc: Likewise. * postreload.cc: Likewise. * predict.cc: Likewise. * recog.cc: Likewise. * ree.cc: Likewise. * reg-stack.cc: Likewise. * regcprop.cc: Likewise. * reginfo.cc: Likewise. * regrename.cc: Likewise. * reorg.cc: Likewise. * sancov.cc: Likewise. * sanopt.cc: Likewise. * sched-rgn.cc: Likewise. * stack-ptr-mod.cc: Likewise. * store-motion.cc: Likewise. * tracer.cc: Likewise. * trans-mem.cc: Likewise. * tree-call-cdce.cc: Likewise. * tree-cfg.cc: Likewise. * tree-cfgcleanup.cc: Likewise. * tree-complex.cc: Likewise. * tree-eh.cc: Likewise. * tree-emutls.cc: Likewise. * tree-if-conv.cc: Likewise. * tree-into-ssa.cc: Likewise. * tree-loop-distribution.cc: Likewise. * tree-nrv.cc: Likewise. * tree-object-size.cc: Likewise. * tree-parloops.cc: Likewise. * tree-predcom.cc: Likewise. * tree-profile.cc: Likewise. * tree-sra.cc: Likewise. * tree-ssa-ccp.cc: Likewise. * tree-ssa-copy.cc: Likewise. * tree-ssa-dce.cc: Likewise. * tree-ssa-dom.cc: Likewise. * tree-ssa-dse.cc: Likewise. * tree-ssa-forwprop.cc: Likewise. * tree-ssa-ifcombine.cc: Likewise. * tree-ssa-loop-ch.cc: Likewise. * tree-ssa-loop-im.cc: Likewise. * tree-ssa-loop-ivcanon.cc: Likewise. * tree-ssa-loop-prefetch.cc: Likewise. * tree-ssa-loop-split.cc: Likewise. * tree-ssa-loop-unswitch.cc: Likewise. * tree-ssa-loop.cc: Likewise. * tree-ssa-math-opts.cc: Likewise. * tree-ssa-phiopt.cc: Likewise. * tree-ssa-phiprop.cc: Likewise. * tree-ssa-pre.cc: Likewise. * tree-ssa-reassoc.cc: Likewise. * tree-ssa-sccvn.cc: Likewise. * tree-ssa-sink.cc: Likewise. * tree-ssa-strlen.cc: Likewise. * tree-ssa-structalias.cc: Likewise. * tree-ssa-uncprop.cc: Likewise. * tree-ssa-uninit.cc: Likewise. * tree-ssanames.cc: Likewise. * tree-stdarg.cc: Likewise. * tree-switch-conversion.cc: Likewise. * tree-tailcall.cc: Likewise. * tree-vect-generic.cc: Likewise. * tree-vectorizer.cc: Likewise. * tree-vrp.cc: Likewise. * tsan.cc: Likewise. * ubsan.cc: Likewise. * var-tracking.cc: Likewise. * vtable-verify.cc: Likewise. * web.cc: Likewise. Signed-off-by: David Malcolm <dmalcolm@redhat.com>
2022-06-27 23:00:33 +02:00
unsigned int execute (function *) final override
{
x86: Add UNSPECV_PATCHABLE_AREA Currently patchable area is at the wrong place. It is placed immediately after function label, before both .cfi_startproc and ENDBR. This patch adds UNSPECV_PATCHABLE_AREA for pseudo patchable area instruction and changes ENDBR insertion pass to also insert patchable area instruction. TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY is defined to avoid placing patchable area before .cfi_startproc and ENDBR. gcc/ PR target/93492 * config/i386/i386-features.c (rest_of_insert_endbranch): Renamed to ... (rest_of_insert_endbr_and_patchable_area): Change return type to void. Add need_endbr and patchable_area_size arguments. Don't call timevar_push nor timevar_pop. Replace endbr_queued_at_entrance with insn_queued_at_entrance. Insert UNSPECV_PATCHABLE_AREA for patchable area. (pass_data_insert_endbranch): Renamed to ... (pass_data_insert_endbr_and_patchable_area): This. Change pass name to endbr_and_patchable_area. (pass_insert_endbranch): Renamed to ... (pass_insert_endbr_and_patchable_area): This. Add need_endbr and patchable_area_size;. (pass_insert_endbr_and_patchable_area::gate): Set and check need_endbr and patchable_area_size. (pass_insert_endbr_and_patchable_area::execute): Call timevar_push and timevar_pop. Pass need_endbr and patchable_area_size to rest_of_insert_endbr_and_patchable_area. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386-passes.def: Replace pass_insert_endbranch with pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (ix86_output_patchable_area): New. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386.c (ix86_asm_output_function_label): Set function_label_emitted to true. (ix86_print_patchable_function_entry): New function. (ix86_output_patchable_area): Likewise. (x86_function_profiler): Replace endbr_queued_at_entrance with insn_queued_at_entrance. Generate ENDBR only for TYPE_ENDBR. Call ix86_output_patchable_area to generate patchable area if needed. (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): New. * config/i386/i386.h (queued_insn_type): New. (machine_function): Add function_label_emitted. Replace endbr_queued_at_entrance with insn_queued_at_entrance. * config/i386/i386.md (UNSPECV_PATCHABLE_AREA): New. (patchable_area): New. gcc/testsuite/ PR target/93492 * gcc.target/i386/pr93492-1.c: New test. * gcc.target/i386/pr93492-2.c: Likewise. * gcc.target/i386/pr93492-3.c: Likewise. * gcc.target/i386/pr93492-4.c: Likewise. * gcc.target/i386/pr93492-5.c: Likewise.
2020-02-03 19:22:57 +01:00
timevar_push (TV_MACH_DEP);
rest_of_insert_endbr_and_patchable_area (need_endbr,
patchable_area_size);
timevar_pop (TV_MACH_DEP);
return 0;
}
x86: Add UNSPECV_PATCHABLE_AREA Currently patchable area is at the wrong place. It is placed immediately after function label, before both .cfi_startproc and ENDBR. This patch adds UNSPECV_PATCHABLE_AREA for pseudo patchable area instruction and changes ENDBR insertion pass to also insert patchable area instruction. TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY is defined to avoid placing patchable area before .cfi_startproc and ENDBR. gcc/ PR target/93492 * config/i386/i386-features.c (rest_of_insert_endbranch): Renamed to ... (rest_of_insert_endbr_and_patchable_area): Change return type to void. Add need_endbr and patchable_area_size arguments. Don't call timevar_push nor timevar_pop. Replace endbr_queued_at_entrance with insn_queued_at_entrance. Insert UNSPECV_PATCHABLE_AREA for patchable area. (pass_data_insert_endbranch): Renamed to ... (pass_data_insert_endbr_and_patchable_area): This. Change pass name to endbr_and_patchable_area. (pass_insert_endbranch): Renamed to ... (pass_insert_endbr_and_patchable_area): This. Add need_endbr and patchable_area_size;. (pass_insert_endbr_and_patchable_area::gate): Set and check need_endbr and patchable_area_size. (pass_insert_endbr_and_patchable_area::execute): Call timevar_push and timevar_pop. Pass need_endbr and patchable_area_size to rest_of_insert_endbr_and_patchable_area. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386-passes.def: Replace pass_insert_endbranch with pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (ix86_output_patchable_area): New. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386.c (ix86_asm_output_function_label): Set function_label_emitted to true. (ix86_print_patchable_function_entry): New function. (ix86_output_patchable_area): Likewise. (x86_function_profiler): Replace endbr_queued_at_entrance with insn_queued_at_entrance. Generate ENDBR only for TYPE_ENDBR. Call ix86_output_patchable_area to generate patchable area if needed. (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): New. * config/i386/i386.h (queued_insn_type): New. (machine_function): Add function_label_emitted. Replace endbr_queued_at_entrance with insn_queued_at_entrance. * config/i386/i386.md (UNSPECV_PATCHABLE_AREA): New. (patchable_area): New. gcc/testsuite/ PR target/93492 * gcc.target/i386/pr93492-1.c: New test. * gcc.target/i386/pr93492-2.c: Likewise. * gcc.target/i386/pr93492-3.c: Likewise. * gcc.target/i386/pr93492-4.c: Likewise. * gcc.target/i386/pr93492-5.c: Likewise.
2020-02-03 19:22:57 +01:00
private:
bool need_endbr;
unsigned int patchable_area_size;
}; // class pass_insert_endbr_and_patchable_area
} // anon namespace
rtl_opt_pass *
x86: Add UNSPECV_PATCHABLE_AREA Currently patchable area is at the wrong place. It is placed immediately after function label, before both .cfi_startproc and ENDBR. This patch adds UNSPECV_PATCHABLE_AREA for pseudo patchable area instruction and changes ENDBR insertion pass to also insert patchable area instruction. TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY is defined to avoid placing patchable area before .cfi_startproc and ENDBR. gcc/ PR target/93492 * config/i386/i386-features.c (rest_of_insert_endbranch): Renamed to ... (rest_of_insert_endbr_and_patchable_area): Change return type to void. Add need_endbr and patchable_area_size arguments. Don't call timevar_push nor timevar_pop. Replace endbr_queued_at_entrance with insn_queued_at_entrance. Insert UNSPECV_PATCHABLE_AREA for patchable area. (pass_data_insert_endbranch): Renamed to ... (pass_data_insert_endbr_and_patchable_area): This. Change pass name to endbr_and_patchable_area. (pass_insert_endbranch): Renamed to ... (pass_insert_endbr_and_patchable_area): This. Add need_endbr and patchable_area_size;. (pass_insert_endbr_and_patchable_area::gate): Set and check need_endbr and patchable_area_size. (pass_insert_endbr_and_patchable_area::execute): Call timevar_push and timevar_pop. Pass need_endbr and patchable_area_size to rest_of_insert_endbr_and_patchable_area. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386-passes.def: Replace pass_insert_endbranch with pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (ix86_output_patchable_area): New. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386.c (ix86_asm_output_function_label): Set function_label_emitted to true. (ix86_print_patchable_function_entry): New function. (ix86_output_patchable_area): Likewise. (x86_function_profiler): Replace endbr_queued_at_entrance with insn_queued_at_entrance. Generate ENDBR only for TYPE_ENDBR. Call ix86_output_patchable_area to generate patchable area if needed. (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): New. * config/i386/i386.h (queued_insn_type): New. (machine_function): Add function_label_emitted. Replace endbr_queued_at_entrance with insn_queued_at_entrance. * config/i386/i386.md (UNSPECV_PATCHABLE_AREA): New. (patchable_area): New. gcc/testsuite/ PR target/93492 * gcc.target/i386/pr93492-1.c: New test. * gcc.target/i386/pr93492-2.c: Likewise. * gcc.target/i386/pr93492-3.c: Likewise. * gcc.target/i386/pr93492-4.c: Likewise. * gcc.target/i386/pr93492-5.c: Likewise.
2020-02-03 19:22:57 +01:00
make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
{
x86: Add UNSPECV_PATCHABLE_AREA Currently patchable area is at the wrong place. It is placed immediately after function label, before both .cfi_startproc and ENDBR. This patch adds UNSPECV_PATCHABLE_AREA for pseudo patchable area instruction and changes ENDBR insertion pass to also insert patchable area instruction. TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY is defined to avoid placing patchable area before .cfi_startproc and ENDBR. gcc/ PR target/93492 * config/i386/i386-features.c (rest_of_insert_endbranch): Renamed to ... (rest_of_insert_endbr_and_patchable_area): Change return type to void. Add need_endbr and patchable_area_size arguments. Don't call timevar_push nor timevar_pop. Replace endbr_queued_at_entrance with insn_queued_at_entrance. Insert UNSPECV_PATCHABLE_AREA for patchable area. (pass_data_insert_endbranch): Renamed to ... (pass_data_insert_endbr_and_patchable_area): This. Change pass name to endbr_and_patchable_area. (pass_insert_endbranch): Renamed to ... (pass_insert_endbr_and_patchable_area): This. Add need_endbr and patchable_area_size;. (pass_insert_endbr_and_patchable_area::gate): Set and check need_endbr and patchable_area_size. (pass_insert_endbr_and_patchable_area::execute): Call timevar_push and timevar_pop. Pass need_endbr and patchable_area_size to rest_of_insert_endbr_and_patchable_area. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386-passes.def: Replace pass_insert_endbranch with pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (ix86_output_patchable_area): New. (make_pass_insert_endbranch): Renamed to ... (make_pass_insert_endbr_and_patchable_area): This. * config/i386/i386.c (ix86_asm_output_function_label): Set function_label_emitted to true. (ix86_print_patchable_function_entry): New function. (ix86_output_patchable_area): Likewise. (x86_function_profiler): Replace endbr_queued_at_entrance with insn_queued_at_entrance. Generate ENDBR only for TYPE_ENDBR. Call ix86_output_patchable_area to generate patchable area if needed. (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): New. * config/i386/i386.h (queued_insn_type): New. (machine_function): Add function_label_emitted. Replace endbr_queued_at_entrance with insn_queued_at_entrance. * config/i386/i386.md (UNSPECV_PATCHABLE_AREA): New. (patchable_area): New. gcc/testsuite/ PR target/93492 * gcc.target/i386/pr93492-1.c: New test. * gcc.target/i386/pr93492-2.c: Likewise. * gcc.target/i386/pr93492-3.c: Likewise. * gcc.target/i386/pr93492-4.c: Likewise. * gcc.target/i386/pr93492-5.c: Likewise.
2020-02-03 19:22:57 +01:00
return new pass_insert_endbr_and_patchable_area (ctxt);
}
/* At entry of the nearest common dominator for basic blocks with
conversions/rcp/sqrt/rsqrt/round, generate a single
vxorps %xmmN, %xmmN, %xmmN
for all
vcvtss2sd op, %xmmN, %xmmX
vcvtsd2ss op, %xmmN, %xmmX
vcvtsi2ss op, %xmmN, %xmmX
vcvtsi2sd op, %xmmN, %xmmX
NB: We want to generate only a single vxorps to cover the whole
function. The LCM algorithm isn't appropriate here since it may
place a vxorps inside the loop. */
static unsigned int
remove_partial_avx_dependency (void)
{
timevar_push (TV_MACH_DEP);
bitmap_obstack_initialize (NULL);
bitmap convert_bbs = BITMAP_ALLOC (NULL);
basic_block bb;
rtx_insn *insn, *set_insn;
rtx set;
rtx v4sf_const0 = NULL_RTX;
auto_vec<rtx_insn *> control_flow_insns;
/* We create invalid RTL initially so defer rescans. */
df_set_flags (DF_DEFER_INSN_RESCAN);
FOR_EACH_BB_FN (bb, cfun)
{
FOR_BB_INSNS (bb, insn)
{
if (!NONDEBUG_INSN_P (insn))
continue;
set = single_set (insn);
if (!set)
continue;
if (get_attr_avx_partial_xmm_update (insn)
!= AVX_PARTIAL_XMM_UPDATE_TRUE)
continue;
/* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF,
SI -> SF, SI -> DF, DI -> SF, DI -> DF, sqrt, rsqrt, rcp,
round, to vec_dup and vec_merge with subreg. */
rtx src = SET_SRC (set);
rtx dest = SET_DEST (set);
machine_mode dest_mode = GET_MODE (dest);
bool convert_p = false;
switch (GET_CODE (src))
{
case FLOAT:
case FLOAT_EXTEND:
case FLOAT_TRUNCATE:
case UNSIGNED_FLOAT:
convert_p = true;
break;
default:
break;
}
/* Only hanlde conversion here. */
machine_mode src_mode
= convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode;
switch (src_mode)
{
case E_SFmode:
case E_DFmode:
x86: Add TARGET_SSE_PARTIAL_REG_[FP_]CONVERTS_DEPENDENCY 1. Replace TARGET_SSE_PARTIAL_REG_DEPENDENCY with TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY in SSE FP to FP splitters. 2. Replace TARGET_SSE_PARTIAL_REG_DEPENDENCY with TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY in SSE INT to FP splitters. 3. Also check TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY and TARGET_SSE_PARTIAL_REG_DEPENDENCY when handling avx_partial_xmm_update attribute. Don't convert AVX partial XMM register update if there is no partial SSE register dependency for SSE conversion. gcc/ * config/i386/i386-features.c (remove_partial_avx_dependency): Also check TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY and and TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY before generating vxorps. * config/i386/i386.h (TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY): New. (TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY): Likewise. * config/i386/i386.md (SSE FP to FP splitters): Replace TARGET_SSE_PARTIAL_REG_DEPENDENCY with TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY. (SSE INT to FP splitter): Replace TARGET_SSE_PARTIAL_REG_DEPENDENCY with TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY. * config/i386/x86-tune.def (X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY): New. (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY): Likewise. gcc/testsuite/ * gcc.target/i386/avx-covert-1.c: New file. * gcc.target/i386/avx-fp-covert-1.c: Likewise. * gcc.target/i386/avx-int-covert-1.c: Likewise. * gcc.target/i386/sse-covert-1.c: Likewise. * gcc.target/i386/sse-fp-covert-1.c: Likewise. * gcc.target/i386/sse-int-covert-1.c: Likewise.
2021-09-15 08:18:21 +02:00
if (TARGET_USE_VECTOR_FP_CONVERTS
|| !TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY)
continue;
break;
case E_SImode:
case E_DImode:
x86: Add TARGET_SSE_PARTIAL_REG_[FP_]CONVERTS_DEPENDENCY 1. Replace TARGET_SSE_PARTIAL_REG_DEPENDENCY with TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY in SSE FP to FP splitters. 2. Replace TARGET_SSE_PARTIAL_REG_DEPENDENCY with TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY in SSE INT to FP splitters. 3. Also check TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY and TARGET_SSE_PARTIAL_REG_DEPENDENCY when handling avx_partial_xmm_update attribute. Don't convert AVX partial XMM register update if there is no partial SSE register dependency for SSE conversion. gcc/ * config/i386/i386-features.c (remove_partial_avx_dependency): Also check TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY and and TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY before generating vxorps. * config/i386/i386.h (TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY): New. (TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY): Likewise. * config/i386/i386.md (SSE FP to FP splitters): Replace TARGET_SSE_PARTIAL_REG_DEPENDENCY with TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY. (SSE INT to FP splitter): Replace TARGET_SSE_PARTIAL_REG_DEPENDENCY with TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY. * config/i386/x86-tune.def (X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY): New. (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY): Likewise. gcc/testsuite/ * gcc.target/i386/avx-covert-1.c: New file. * gcc.target/i386/avx-fp-covert-1.c: Likewise. * gcc.target/i386/avx-int-covert-1.c: Likewise. * gcc.target/i386/sse-covert-1.c: Likewise. * gcc.target/i386/sse-fp-covert-1.c: Likewise. * gcc.target/i386/sse-int-covert-1.c: Likewise.
2021-09-15 08:18:21 +02:00
if (TARGET_USE_VECTOR_CONVERTS
|| !TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY)
continue;
break;
case E_VOIDmode:
gcc_assert (!convert_p);
break;
default:
gcc_unreachable ();
}
if (!v4sf_const0)
v4sf_const0 = gen_reg_rtx (V4SFmode);
rtx zero;
machine_mode dest_vecmode;
switch (dest_mode)
{
case E_HFmode:
dest_vecmode = V8HFmode;
zero = gen_rtx_SUBREG (V8HFmode, v4sf_const0, 0);
break;
case E_SFmode:
dest_vecmode = V4SFmode;
zero = v4sf_const0;
break;
case E_DFmode:
dest_vecmode = V2DFmode;
zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
break;
default:
gcc_unreachable ();
}
/* Change source to vector mode. */
src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src);
src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero,
GEN_INT (HOST_WIDE_INT_1U));
/* Change destination to vector mode. */
rtx vec = gen_reg_rtx (dest_vecmode);
/* Generate an XMM vector SET. */
set = gen_rtx_SET (vec, src);
set_insn = emit_insn_before (set, insn);
df_insn_rescan (set_insn);
if (cfun->can_throw_non_call_exceptions)
{
/* Handle REG_EH_REGION note. */
rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
if (note)
{
control_flow_insns.safe_push (set_insn);
add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0));
}
}
src = gen_rtx_SUBREG (dest_mode, vec, 0);
set = gen_rtx_SET (dest, src);
/* Drop possible dead definitions. */
PATTERN (insn) = set;
INSN_CODE (insn) = -1;
recog_memoized (insn);
df_insn_rescan (insn);
bitmap_set_bit (convert_bbs, bb->index);
}
}
if (v4sf_const0)
{
/* (Re-)discover loops so that bb->loop_father can be used in the
analysis below. */
calculate_dominance_info (CDI_DOMINATORS);
loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
/* Generate a vxorps at entry of the nearest dominator for basic
Fix up duplicated duplicated words mostly in comments In the r10-7197-gbae7b38cf8a21e068ad5c0bab089dedb78af3346 commit I've noticed duplicated word in a message, which lead me to grep for those and we have a tons of them. I've used grep -v 'long long\|optab optab\|template template\|double double' *.[chS] */*.[chS] *.def config/*/* 2>/dev/null | grep ' \([a-zA-Z]\+\) \1 ' Note, the command will not detect the doubled words at the start or end of line or when one of the words is at the end of line and the next one at the start of another one. Some of it is fairly obvious, e.g. all the "the the" cases which is something I've posted and committed patch for already e.g. in 2016, other cases are often valid, e.g. "that that" seems to look mostly ok to me. Some cases are quite hard to figure out, I've left out some of them from the patch (e.g. "and and" in some cases isn't talking about bitwise/logical and and so looks incorrect, but in other cases it is talking about those operations). In most cases the right solution seems to be to remove one of the duplicated words, but not always. I think most important are the ones with user visible messages (in the patch 3 of the first 4 hunks), the rest is just comments (and internal documentation; for that see the doc/tm.texi changes). 2020-03-17 Jakub Jelinek <jakub@redhat.com> * lra-spills.c (remove_pseudos): Fix up duplicated word issue in a dump message. * tree-sra.c (create_access_replacement): Fix up duplicated word issue in a comment. * read-rtl-function.c (find_param_by_name, function_reader::parse_enum_value, function_reader::get_insn_by_uid): Likewise. * spellcheck.c (get_edit_distance_cutoff): Likewise. * tree-data-ref.c (create_ifn_alias_checks): Likewise. * tree.def (SWITCH_EXPR): Likewise. * selftest.c (assert_str_contains): Likewise. * ipa-param-manipulation.h (class ipa_param_body_adjustments): Likewise. * tree-ssa-math-opts.c (convert_expand_mult_copysign): Likewise. * tree-ssa-loop-split.c (find_vdef_in_loop): Likewise. * langhooks.h (struct lang_hooks_for_decls): Likewise. * ipa-prop.h (struct ipa_param_descriptor): Likewise. * tree-ssa-strlen.c (handle_builtin_string_cmp, handle_store): Likewise. * tree-ssa-dom.c (simplify_stmt_for_jump_threading): Likewise. * tree-ssa-reassoc.c (reassociate_bb): Likewise. * tree.c (component_ref_size): Likewise. * hsa-common.c (hsa_init_compilation_unit_data): Likewise. * gimple-ssa-sprintf.c (get_string_length, format_string, format_directive): Likewise. * omp-grid.c (grid_process_kernel_body_copy): Likewise. * input.c (string_concat_db::get_string_concatenation, test_lexer_string_locations_ucn4): Likewise. * cfgexpand.c (pass_expand::execute): Likewise. * gimple-ssa-warn-restrict.c (builtin_memref::offset_out_of_bounds, maybe_diag_overlap): Likewise. * rtl.c (RTX_CODE_HWINT_P_1): Likewise. * shrink-wrap.c (spread_components): Likewise. * tree-ssa-dse.c (initialize_ao_ref_for_dse, valid_ao_ref_for_dse): Likewise. * tree-call-cdce.c (shrink_wrap_one_built_in_call_with_conds): Likewise. * dwarf2out.c (dwarf2out_early_finish): Likewise. * gimple-ssa-store-merging.c: Likewise. * ira-costs.c (record_operand_costs): Likewise. * tree-vect-loop.c (vectorizable_reduction): Likewise. * target.def (dispatch): Likewise. (validate_dims, gen_ccmp_first): Fix up duplicated word issue in documentation text. * doc/tm.texi: Regenerated. * config/i386/x86-tune.def (X86_TUNE_PARTIAL_FLAG_REG_STALL): Fix up duplicated word issue in a comment. * config/i386/i386.c (ix86_test_loading_unspec): Likewise. * config/i386/i386-features.c (remove_partial_avx_dependency): Likewise. * config/msp430/msp430.c (msp430_select_section): Likewise. * config/gcn/gcn-run.c (load_image): Likewise. * config/aarch64/aarch64-sve.md (sve_ld1r<mode>): Likewise. * config/aarch64/aarch64.c (aarch64_gen_adjusted_ldpstp): Likewise. * config/aarch64/falkor-tag-collision-avoidance.c (single_dest_per_chain): Likewise. * config/nvptx/nvptx.c (nvptx_record_fndecl): Likewise. * config/fr30/fr30.c (fr30_arg_partial_bytes): Likewise. * config/rs6000/rs6000-string.c (expand_cmp_vec_sequence): Likewise. * config/rs6000/rs6000-p8swap.c (replace_swapped_load_constant): Likewise. * config/rs6000/rs6000-c.c (rs6000_target_modify_macros): Likewise. * config/rs6000/rs6000.c (rs6000_option_override_internal): Likewise. * config/rs6000/rs6000-logue.c (rs6000_emit_probe_stack_range_stack_clash): Likewise. * config/nds32/nds32-md-auxiliary.c (nds32_split_ashiftdi3): Likewise. Fix various other issues in the comment. c-family/ * c-common.c (resolve_overloaded_builtin): Fix up duplicated word issue in a diagnostic message. cp/ * pt.c (tsubst): Fix up duplicated word issue in a diagnostic message. (lookup_template_class_1, tsubst_expr): Fix up duplicated word issue in a comment. * parser.c (cp_parser_statement, cp_parser_linkage_specification, cp_parser_placeholder_type_specifier, cp_parser_constraint_requires_parens): Likewise. * name-lookup.c (suggest_alternative_in_explicit_scope): Likewise. fortran/ * array.c (gfc_check_iter_variable): Fix up duplicated word issue in a comment. * arith.c (gfc_arith_concat): Likewise. * resolve.c (gfc_resolve_ref): Likewise. * frontend-passes.c (matmul_lhs_realloc): Likewise. * module.c (gfc_match_submodule, load_needed): Likewise. * trans-expr.c (gfc_init_se): Likewise.
2020-03-17 13:52:19 +01:00
blocks with conversions, which is in the fake loop that
contains the whole function, so that there is only a single
vxorps in the whole function. */
bb = nearest_common_dominator_for_set (CDI_DOMINATORS,
convert_bbs);
while (bb->loop_father->latch
!= EXIT_BLOCK_PTR_FOR_FN (cfun))
bb = get_immediate_dominator (CDI_DOMINATORS,
bb->loop_father->header);
set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode));
insn = BB_HEAD (bb);
while (insn && !NONDEBUG_INSN_P (insn))
{
if (insn == BB_END (bb))
{
insn = NULL;
break;
}
insn = NEXT_INSN (insn);
}
if (insn == BB_HEAD (bb))
set_insn = emit_insn_before (set, insn);
else
set_insn = emit_insn_after (set,
insn ? PREV_INSN (insn) : BB_END (bb));
df_insn_rescan (set_insn);
loop_optimizer_finalize ();
if (!control_flow_insns.is_empty ())
{
free_dominance_info (CDI_DOMINATORS);
unsigned int i;
FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
if (control_flow_insn_p (insn))
{
/* Split the block after insn. There will be a fallthru
edge, which is OK so we keep it. We have to create
the exception edges ourselves. */
bb = BLOCK_FOR_INSN (insn);
split_block (bb, insn);
rtl_make_eh_edge (NULL, bb, BB_END (bb));
}
}
}
df_process_deferred_rescans ();
df_clear_flags (DF_DEFER_INSN_RESCAN);
bitmap_obstack_release (NULL);
BITMAP_FREE (convert_bbs);
timevar_pop (TV_MACH_DEP);
return 0;
}
namespace {
const pass_data pass_data_remove_partial_avx_dependency =
{
RTL_PASS, /* type */
"rpad", /* name */
OPTGROUP_NONE, /* optinfo_flags */
TV_MACH_DEP, /* tv_id */
0, /* properties_required */
0, /* properties_provided */
0, /* properties_destroyed */
0, /* todo_flags_start */
0, /* todo_flags_finish */
};
class pass_remove_partial_avx_dependency : public rtl_opt_pass
{
public:
pass_remove_partial_avx_dependency (gcc::context *ctxt)
: rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt)
{}
/* opt_pass methods: */
Add 'final' and 'override' to opt_pass vfunc impls gcc/ChangeLog: * adjust-alignment.cc: Add "final" and "override" to opt_pass vfunc implementations, removing redundant "virtual" as appropriate. * asan.cc: Likewise. * auto-inc-dec.cc: Likewise. * auto-profile.cc: Likewise. * bb-reorder.cc: Likewise. * cfgcleanup.cc: Likewise. * cfgexpand.cc: Likewise. * cfgrtl.cc: Likewise. * cgraphbuild.cc: Likewise. * combine-stack-adj.cc: Likewise. * combine.cc: Likewise. * compare-elim.cc: Likewise. * config/i386/i386-features.cc: Likewise. * coroutine-passes.cc: Likewise. * cprop.cc: Likewise. * cse.cc: Likewise. * dce.cc: Likewise. * df-core.cc: Likewise. * dse.cc: Likewise. * dwarf2cfi.cc: Likewise. * early-remat.cc: Likewise. * except.cc: Likewise. * final.cc: Likewise. * function.cc: Likewise. * fwprop.cc: Likewise. * gcse.cc: Likewise. * gimple-harden-conditionals.cc: Likewise. * gimple-if-to-switch.cc: Likewise. * gimple-isel.cc: Likewise. * gimple-laddress.cc: Likewise. * gimple-loop-interchange.cc: Likewise. * gimple-loop-jam.cc: Likewise. * gimple-loop-versioning.cc: Likewise. * gimple-low.cc: Likewise. * gimple-ssa-backprop.cc: Likewise. * gimple-ssa-evrp.cc: Likewise. * gimple-ssa-isolate-paths.cc: Likewise. * gimple-ssa-nonnull-compare.cc: Likewise. * gimple-ssa-split-paths.cc: Likewise. * gimple-ssa-store-merging.cc: Likewise. * gimple-ssa-strength-reduction.cc: Likewise. * gimple-ssa-warn-access.cc: Likewise. * gimple-ssa-warn-alloca.cc: Likewise. * gimple-ssa-warn-restrict.cc: Likewise. * gimple-warn-recursion.cc: Likewise. * graphite.cc: Likewise. * ifcvt.cc: Likewise. * init-regs.cc: Likewise. * ipa-comdats.cc: Likewise. * ipa-cp.cc: Likewise. * ipa-devirt.cc: Likewise. * ipa-fnsummary.cc: Likewise. * ipa-free-lang-data.cc: Likewise. * ipa-icf.cc: Likewise. * ipa-inline.cc: Likewise. * ipa-modref.cc: Likewise. * ipa-profile.cc: Likewise. * ipa-pure-const.cc: Likewise. * ipa-reference.cc: Likewise. * ipa-split.cc: Likewise. * ipa-sra.cc: Likewise. * ipa-visibility.cc: Likewise. * ipa.cc: Likewise. * ira.cc: Likewise. * jump.cc: Likewise. * loop-init.cc: Likewise. * lower-subreg.cc: Likewise. * mode-switching.cc: Likewise. * modulo-sched.cc: Likewise. * multiple_target.cc: Likewise. * omp-expand.cc: Likewise. * omp-low.cc: Likewise. * omp-oacc-kernels-decompose.cc: Likewise. * omp-oacc-neuter-broadcast.cc: Likewise. * omp-offload.cc: Likewise. * omp-simd-clone.cc: Likewise. * passes.cc: Likewise. * postreload-gcse.cc: Likewise. * postreload.cc: Likewise. * predict.cc: Likewise. * recog.cc: Likewise. * ree.cc: Likewise. * reg-stack.cc: Likewise. * regcprop.cc: Likewise. * reginfo.cc: Likewise. * regrename.cc: Likewise. * reorg.cc: Likewise. * sancov.cc: Likewise. * sanopt.cc: Likewise. * sched-rgn.cc: Likewise. * stack-ptr-mod.cc: Likewise. * store-motion.cc: Likewise. * tracer.cc: Likewise. * trans-mem.cc: Likewise. * tree-call-cdce.cc: Likewise. * tree-cfg.cc: Likewise. * tree-cfgcleanup.cc: Likewise. * tree-complex.cc: Likewise. * tree-eh.cc: Likewise. * tree-emutls.cc: Likewise. * tree-if-conv.cc: Likewise. * tree-into-ssa.cc: Likewise. * tree-loop-distribution.cc: Likewise. * tree-nrv.cc: Likewise. * tree-object-size.cc: Likewise. * tree-parloops.cc: Likewise. * tree-predcom.cc: Likewise. * tree-profile.cc: Likewise. * tree-sra.cc: Likewise. * tree-ssa-ccp.cc: Likewise. * tree-ssa-copy.cc: Likewise. * tree-ssa-dce.cc: Likewise. * tree-ssa-dom.cc: Likewise. * tree-ssa-dse.cc: Likewise. * tree-ssa-forwprop.cc: Likewise. * tree-ssa-ifcombine.cc: Likewise. * tree-ssa-loop-ch.cc: Likewise. * tree-ssa-loop-im.cc: Likewise. * tree-ssa-loop-ivcanon.cc: Likewise. * tree-ssa-loop-prefetch.cc: Likewise. * tree-ssa-loop-split.cc: Likewise. * tree-ssa-loop-unswitch.cc: Likewise. * tree-ssa-loop.cc: Likewise. * tree-ssa-math-opts.cc: Likewise. * tree-ssa-phiopt.cc: Likewise. * tree-ssa-phiprop.cc: Likewise. * tree-ssa-pre.cc: Likewise. * tree-ssa-reassoc.cc: Likewise. * tree-ssa-sccvn.cc: Likewise. * tree-ssa-sink.cc: Likewise. * tree-ssa-strlen.cc: Likewise. * tree-ssa-structalias.cc: Likewise. * tree-ssa-uncprop.cc: Likewise. * tree-ssa-uninit.cc: Likewise. * tree-ssanames.cc: Likewise. * tree-stdarg.cc: Likewise. * tree-switch-conversion.cc: Likewise. * tree-tailcall.cc: Likewise. * tree-vect-generic.cc: Likewise. * tree-vectorizer.cc: Likewise. * tree-vrp.cc: Likewise. * tsan.cc: Likewise. * ubsan.cc: Likewise. * var-tracking.cc: Likewise. * vtable-verify.cc: Likewise. * web.cc: Likewise. Signed-off-by: David Malcolm <dmalcolm@redhat.com>
2022-06-27 23:00:33 +02:00
bool gate (function *) final override
{
Remove pass_cpb which is related to enable avx512 embedded broadcast from constant pool. By optimizing vector movement to broadcast in ix86_expand_vector_move during pass_expand, pass_reload/LRA can automatically generate an avx512 embedded broadcast, pass_cpb is not needed. Considering that in the absence of avx512f, broadcast from memory is still slightly faster than loading the entire memory, so always enable broadcast. benchmark: https://gitlab.com/x86-benchmarks/microbenchmark/-/tree/vaddps/broadcast The performance diff strategy : cycles memory : 1046611188 memory : 1255420817 memory : 1044720793 memory : 1253414145 average : 1097868397 broadcast : 1044430688 broadcast : 1044477630 broadcast : 1253554603 broadcast : 1044561934 average : 1096756213 But however broadcast has larger size. the size diff size broadcast.o text data bss dec hex filename 137 0 0 137 89 broadcast.o size memory.o text data bss dec hex filename 115 0 0 115 73 memory.o gcc/ChangeLog: * config/i386/i386-expand.c (ix86_broadcast_from_integer_constant): Rename to .. (ix86_broadcast_from_constant): .. this, and extend it to handle float mode. (ix86_expand_vector_move): Extend to float mode. * config/i386/i386-features.c (replace_constant_pool_with_broadcast): Remove. (remove_partial_avx_dependency_gate): Ditto. (constant_pool_broadcast): Ditto. (class pass_constant_pool_broadcast): Ditto. (make_pass_constant_pool_broadcast): Ditto. (remove_partial_avx_dependency): Adjust gate. * config/i386/i386-passes.def: Remove pass_constant_pool_broadcast. * config/i386/i386-protos.h (make_pass_constant_pool_broadcast): Remove. gcc/testsuite/ChangeLog: * gcc.target/i386/fuse-caller-save-xmm.c: Adjust testcase.
2021-07-13 12:22:03 +02:00
return (TARGET_AVX
&& TARGET_SSE_PARTIAL_REG_DEPENDENCY
&& TARGET_SSE_MATH
&& optimize
&& optimize_function_for_speed_p (cfun));
}
Add 'final' and 'override' to opt_pass vfunc impls gcc/ChangeLog: * adjust-alignment.cc: Add "final" and "override" to opt_pass vfunc implementations, removing redundant "virtual" as appropriate. * asan.cc: Likewise. * auto-inc-dec.cc: Likewise. * auto-profile.cc: Likewise. * bb-reorder.cc: Likewise. * cfgcleanup.cc: Likewise. * cfgexpand.cc: Likewise. * cfgrtl.cc: Likewise. * cgraphbuild.cc: Likewise. * combine-stack-adj.cc: Likewise. * combine.cc: Likewise. * compare-elim.cc: Likewise. * config/i386/i386-features.cc: Likewise. * coroutine-passes.cc: Likewise. * cprop.cc: Likewise. * cse.cc: Likewise. * dce.cc: Likewise. * df-core.cc: Likewise. * dse.cc: Likewise. * dwarf2cfi.cc: Likewise. * early-remat.cc: Likewise. * except.cc: Likewise. * final.cc: Likewise. * function.cc: Likewise. * fwprop.cc: Likewise. * gcse.cc: Likewise. * gimple-harden-conditionals.cc: Likewise. * gimple-if-to-switch.cc: Likewise. * gimple-isel.cc: Likewise. * gimple-laddress.cc: Likewise. * gimple-loop-interchange.cc: Likewise. * gimple-loop-jam.cc: Likewise. * gimple-loop-versioning.cc: Likewise. * gimple-low.cc: Likewise. * gimple-ssa-backprop.cc: Likewise. * gimple-ssa-evrp.cc: Likewise. * gimple-ssa-isolate-paths.cc: Likewise. * gimple-ssa-nonnull-compare.cc: Likewise. * gimple-ssa-split-paths.cc: Likewise. * gimple-ssa-store-merging.cc: Likewise. * gimple-ssa-strength-reduction.cc: Likewise. * gimple-ssa-warn-access.cc: Likewise. * gimple-ssa-warn-alloca.cc: Likewise. * gimple-ssa-warn-restrict.cc: Likewise. * gimple-warn-recursion.cc: Likewise. * graphite.cc: Likewise. * ifcvt.cc: Likewise. * init-regs.cc: Likewise. * ipa-comdats.cc: Likewise. * ipa-cp.cc: Likewise. * ipa-devirt.cc: Likewise. * ipa-fnsummary.cc: Likewise. * ipa-free-lang-data.cc: Likewise. * ipa-icf.cc: Likewise. * ipa-inline.cc: Likewise. * ipa-modref.cc: Likewise. * ipa-profile.cc: Likewise. * ipa-pure-const.cc: Likewise. * ipa-reference.cc: Likewise. * ipa-split.cc: Likewise. * ipa-sra.cc: Likewise. * ipa-visibility.cc: Likewise. * ipa.cc: Likewise. * ira.cc: Likewise. * jump.cc: Likewise. * loop-init.cc: Likewise. * lower-subreg.cc: Likewise. * mode-switching.cc: Likewise. * modulo-sched.cc: Likewise. * multiple_target.cc: Likewise. * omp-expand.cc: Likewise. * omp-low.cc: Likewise. * omp-oacc-kernels-decompose.cc: Likewise. * omp-oacc-neuter-broadcast.cc: Likewise. * omp-offload.cc: Likewise. * omp-simd-clone.cc: Likewise. * passes.cc: Likewise. * postreload-gcse.cc: Likewise. * postreload.cc: Likewise. * predict.cc: Likewise. * recog.cc: Likewise. * ree.cc: Likewise. * reg-stack.cc: Likewise. * regcprop.cc: Likewise. * reginfo.cc: Likewise. * regrename.cc: Likewise. * reorg.cc: Likewise. * sancov.cc: Likewise. * sanopt.cc: Likewise. * sched-rgn.cc: Likewise. * stack-ptr-mod.cc: Likewise. * store-motion.cc: Likewise. * tracer.cc: Likewise. * trans-mem.cc: Likewise. * tree-call-cdce.cc: Likewise. * tree-cfg.cc: Likewise. * tree-cfgcleanup.cc: Likewise. * tree-complex.cc: Likewise. * tree-eh.cc: Likewise. * tree-emutls.cc: Likewise. * tree-if-conv.cc: Likewise. * tree-into-ssa.cc: Likewise. * tree-loop-distribution.cc: Likewise. * tree-nrv.cc: Likewise. * tree-object-size.cc: Likewise. * tree-parloops.cc: Likewise. * tree-predcom.cc: Likewise. * tree-profile.cc: Likewise. * tree-sra.cc: Likewise. * tree-ssa-ccp.cc: Likewise. * tree-ssa-copy.cc: Likewise. * tree-ssa-dce.cc: Likewise. * tree-ssa-dom.cc: Likewise. * tree-ssa-dse.cc: Likewise. * tree-ssa-forwprop.cc: Likewise. * tree-ssa-ifcombine.cc: Likewise. * tree-ssa-loop-ch.cc: Likewise. * tree-ssa-loop-im.cc: Likewise. * tree-ssa-loop-ivcanon.cc: Likewise. * tree-ssa-loop-prefetch.cc: Likewise. * tree-ssa-loop-split.cc: Likewise. * tree-ssa-loop-unswitch.cc: Likewise. * tree-ssa-loop.cc: Likewise. * tree-ssa-math-opts.cc: Likewise. * tree-ssa-phiopt.cc: Likewise. * tree-ssa-phiprop.cc: Likewise. * tree-ssa-pre.cc: Likewise. * tree-ssa-reassoc.cc: Likewise. * tree-ssa-sccvn.cc: Likewise. * tree-ssa-sink.cc: Likewise. * tree-ssa-strlen.cc: Likewise. * tree-ssa-structalias.cc: Likewise. * tree-ssa-uncprop.cc: Likewise. * tree-ssa-uninit.cc: Likewise. * tree-ssanames.cc: Likewise. * tree-stdarg.cc: Likewise. * tree-switch-conversion.cc: Likewise. * tree-tailcall.cc: Likewise. * tree-vect-generic.cc: Likewise. * tree-vectorizer.cc: Likewise. * tree-vrp.cc: Likewise. * tsan.cc: Likewise. * ubsan.cc: Likewise. * var-tracking.cc: Likewise. * vtable-verify.cc: Likewise. * web.cc: Likewise. Signed-off-by: David Malcolm <dmalcolm@redhat.com>
2022-06-27 23:00:33 +02:00
unsigned int execute (function *) final override
{
return remove_partial_avx_dependency ();
}
}; // class pass_rpad
} // anon namespace
rtl_opt_pass *
make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
{
return new pass_remove_partial_avx_dependency (ctxt);
}
/* This compares the priority of target features in function DECL1
and DECL2. It returns positive value if DECL1 is higher priority,
negative value if DECL2 is higher priority and 0 if they are the
same. */
int
ix86_compare_version_priority (tree decl1, tree decl2)
{
unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
return (int)priority1 - (int)priority2;
}
/* V1 and V2 point to function versions with different priorities
based on the target ISA. This function compares their priorities. */
static int
feature_compare (const void *v1, const void *v2)
{
typedef struct _function_version_info
{
tree version_decl;
tree predicate_chain;
unsigned int dispatch_priority;
} function_version_info;
const function_version_info c1 = *(const function_version_info *)v1;
const function_version_info c2 = *(const function_version_info *)v2;
return (c2.dispatch_priority - c1.dispatch_priority);
}
/* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
to return a pointer to VERSION_DECL if the outcome of the expression
formed by PREDICATE_CHAIN is true. This function will be called during
version dispatch to decide which function version to execute. It returns
the basic block at the end, to which more conditions can be added. */
static basic_block
add_condition_to_bb (tree function_decl, tree version_decl,
tree predicate_chain, basic_block new_bb)
{
gimple *return_stmt;
tree convert_expr, result_var;
gimple *convert_stmt;
gimple *call_cond_stmt;
gimple *if_else_stmt;
basic_block bb1, bb2, bb3;
edge e12, e23;
tree cond_var, and_expr_var = NULL_TREE;
gimple_seq gseq;
tree predicate_decl, predicate_arg;
push_cfun (DECL_STRUCT_FUNCTION (function_decl));
gcc_assert (new_bb != NULL);
gseq = bb_seq (new_bb);
convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
build_fold_addr_expr (version_decl));
result_var = create_tmp_var (ptr_type_node);
convert_stmt = gimple_build_assign (result_var, convert_expr);
return_stmt = gimple_build_return (result_var);
if (predicate_chain == NULL_TREE)
{
gimple_seq_add_stmt (&gseq, convert_stmt);
gimple_seq_add_stmt (&gseq, return_stmt);
set_bb_seq (new_bb, gseq);
gimple_set_bb (convert_stmt, new_bb);
gimple_set_bb (return_stmt, new_bb);
pop_cfun ();
return new_bb;
}
while (predicate_chain != NULL)
{
cond_var = create_tmp_var (integer_type_node);
predicate_decl = TREE_PURPOSE (predicate_chain);
predicate_arg = TREE_VALUE (predicate_chain);
call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
gimple_call_set_lhs (call_cond_stmt, cond_var);
gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
gimple_set_bb (call_cond_stmt, new_bb);
gimple_seq_add_stmt (&gseq, call_cond_stmt);
predicate_chain = TREE_CHAIN (predicate_chain);
if (and_expr_var == NULL)
and_expr_var = cond_var;
else
{
gimple *assign_stmt;
/* Use MIN_EXPR to check if any integer is zero?.
and_expr_var = min_expr <cond_var, and_expr_var> */
assign_stmt = gimple_build_assign (and_expr_var,
build2 (MIN_EXPR, integer_type_node,
cond_var, and_expr_var));
gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
gimple_set_bb (assign_stmt, new_bb);
gimple_seq_add_stmt (&gseq, assign_stmt);
}
}
if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
integer_zero_node,
NULL_TREE, NULL_TREE);
gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
gimple_set_bb (if_else_stmt, new_bb);
gimple_seq_add_stmt (&gseq, if_else_stmt);
gimple_seq_add_stmt (&gseq, convert_stmt);
gimple_seq_add_stmt (&gseq, return_stmt);
set_bb_seq (new_bb, gseq);
bb1 = new_bb;
e12 = split_block (bb1, if_else_stmt);
bb2 = e12->dest;
e12->flags &= ~EDGE_FALLTHRU;
e12->flags |= EDGE_TRUE_VALUE;
e23 = split_block (bb2, return_stmt);
gimple_set_bb (convert_stmt, bb2);
gimple_set_bb (return_stmt, bb2);
bb3 = e23->dest;
make_edge (bb1, bb3, EDGE_FALSE_VALUE);
remove_edge (e23);
make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
pop_cfun ();
return bb3;
}
/* This function generates the dispatch function for
multi-versioned functions. DISPATCH_DECL is the function which will
contain the dispatch logic. FNDECLS are the function choices for
dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
in DISPATCH_DECL in which the dispatch code is generated. */
static int
dispatch_function_versions (tree dispatch_decl,
void *fndecls_p,
basic_block *empty_bb)
{
tree default_decl;
gimple *ifunc_cpu_init_stmt;
gimple_seq gseq;
int ix;
tree ele;
vec<tree> *fndecls;
unsigned int num_versions = 0;
unsigned int actual_versions = 0;
unsigned int i;
struct _function_version_info
{
tree version_decl;
tree predicate_chain;
unsigned int dispatch_priority;
}*function_version_info;
gcc_assert (dispatch_decl != NULL
&& fndecls_p != NULL
&& empty_bb != NULL);
/*fndecls_p is actually a vector. */
fndecls = static_cast<vec<tree> *> (fndecls_p);
/* At least one more version other than the default. */
num_versions = fndecls->length ();
gcc_assert (num_versions >= 2);
function_version_info = (struct _function_version_info *)
XNEWVEC (struct _function_version_info, (num_versions - 1));
/* The first version in the vector is the default decl. */
default_decl = (*fndecls)[0];
push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
gseq = bb_seq (*empty_bb);
/* Function version dispatch is via IFUNC. IFUNC resolvers fire before
constructors, so explicity call __builtin_cpu_init here. */
ifunc_cpu_init_stmt
= gimple_build_call_vec (get_ix86_builtin (IX86_BUILTIN_CPU_INIT), vNULL);
gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
set_bb_seq (*empty_bb, gseq);
pop_cfun ();
for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
{
tree version_decl = ele;
tree predicate_chain = NULL_TREE;
unsigned int priority;
/* Get attribute string, parse it and find the right predicate decl.
The predicate function could be a lengthy combination of many
features, like arch-type and various isa-variants. */
priority = get_builtin_code_for_version (version_decl,
&predicate_chain);
if (predicate_chain == NULL_TREE)
continue;
function_version_info [actual_versions].version_decl = version_decl;
function_version_info [actual_versions].predicate_chain
= predicate_chain;
function_version_info [actual_versions].dispatch_priority = priority;
actual_versions++;
}
/* Sort the versions according to descending order of dispatch priority. The
priority is based on the ISA. This is not a perfect solution. There
could still be ambiguity. If more than one function version is suitable
to execute, which one should be dispatched? In future, allow the user
to specify a dispatch priority next to the version. */
qsort (function_version_info, actual_versions,
sizeof (struct _function_version_info), feature_compare);
for (i = 0; i < actual_versions; ++i)
*empty_bb = add_condition_to_bb (dispatch_decl,
function_version_info[i].version_decl,
function_version_info[i].predicate_chain,
*empty_bb);
/* dispatch default version at the end. */
*empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
NULL, *empty_bb);
free (function_version_info);
return 0;
}
/* This function changes the assembler name for functions that are
versions. If DECL is a function version and has a "target"
attribute, it appends the attribute string to its assembler name. */
static tree
ix86_mangle_function_version_assembler_name (tree decl, tree id)
{
tree version_attr;
const char *orig_name, *version_string;
char *attr_str, *assembler_name;
if (DECL_DECLARED_INLINE_P (decl)
&& lookup_attribute ("gnu_inline",
DECL_ATTRIBUTES (decl)))
error_at (DECL_SOURCE_LOCATION (decl),
trans.c (check_inlining_for_nested_subprog): Quote reserved names. gcc/ada/ChangeLog: * gcc-interface/trans.c (check_inlining_for_nested_subprog): Quote reserved names. gcc/brig/ChangeLog: * brigfrontend/brig-control-handler.cc (brig_directive_control_handler::operator): Remove trailing newline from a diagnostic. * brigfrontend/brig-module-handler.cc (brig_directive_module_handler::operator): Remove a duplicated space from a diagnostic. gcc/c/ChangeLog: * c-decl.c (start_decl): Quote keywords, operators, and types in diagnostics. (finish_decl): Same. * c-parser.c (c_parser_asm_statement): Same. (c_parser_conditional_expression): Same. (c_parser_transaction_cancel): Same. * c-typeck.c (c_common_type): Same. (build_conditional_expr): Same. (digest_init): Same. (process_init_element): Same. (build_binary_op): Same. gcc/c-family/ChangeLog: * c-attribs.c (handle_no_sanitize_attribute): Quote identifiers, keywords, operators, and types in diagnostics. (handle_scalar_storage_order_attribute): Same. (handle_mode_attribute): Same. (handle_visibility_attribute): Same. (handle_assume_aligned_attribute): Same. (handle_no_split_stack_attribute): Same. * c-common.c (shorten_compare): Same. (c_common_truthvalue_conversion): Same. (cb_get_source_date_epoch): Same. * c-lex.c (cb_def_pragma): Quote keywords, operators, and types in diagnostics. (interpret_float): Same. * c-omp.c (c_finish_omp_for): Same. * c-opts.c (c_common_post_options): Same. * c-pch.c (c_common_pch_pragma): Same. * c-pragma.c (pop_alignment): Same. (handle_pragma_pack): Same. (apply_pragma_weak): Same. (handle_pragma_weak): Same. (handle_pragma_scalar_storage_order): Same. (handle_pragma_redefine_extname): Same. (add_to_renaming_pragma_list): Same. (maybe_apply_renaming_pragma): Same. (push_visibility): Same. (handle_pragma_visibility): Same. (handle_pragma_optimize): Same. (handle_pragma_message): Same. * c-warn.c (warn_for_omitted_condop): Same. (lvalue_error): Same. gcc/cp/ChangeLog: * call.c (print_z_candidate): Wrap diagnostic text in a gettext macro. Adjust. (print_z_candidates): Same. (build_conditional_expr_1): Quote keywords, operators, and types in diagnostics. (build_op_delete_call): Same. (maybe_print_user_conv_context): Wrap diagnostic text in a gettext macro. (convert_like_real): Same. (convert_arg_to_ellipsis): Quote keywords, operators, and types in diagnostics. (build_over_call): Same. (joust): Break up an overlong line. Wrap diagnostic text in a gettext macro. * constexpr.c (cxx_eval_check_shift_p): Spell out >= in English. (cxx_eval_constant_expression): Quote keywords, operators, and types in diagnostics. (potential_constant_expression_1): Same. * cp-gimplify.c (cp_genericize_r): Same. * cvt.c (maybe_warn_nodiscard): Quote keywords, operators, and types in diagnostics. (type_promotes_to): Same. * decl.c (check_previous_goto_1): Same. (check_goto): Same. (start_decl): Same. (cp_finish_decl): Avoid parenthesizing a sentence for consistency. (grok_op_properties): Quote keywords, operators, and types in diagnostics. * decl2.c (grokfield): Same. (coerce_delete_type): Same. * except.c (is_admissible_throw_operand_or_catch_parameter): Same. * friend.c (do_friend): Quote C++ tokens. * init.c (build_new_1): Quote keywords, operators, and types in diagnostics. (build_vec_delete_1): Same. (build_delete): Same. * lex.c (parse_strconst_pragma): Same. (handle_pragma_implementation): Same. (unqualified_fn_lookup_error): Same. * mangle.c (write_type): Same. * method.c (defaulted_late_check): Avoid two consecutive punctuators. * name-lookup.c (cp_binding_level_debug): Remove a trailing newline. (pop_everything): Same. * parser.c (cp_lexer_start_debugging): Quote a macro name. in a diagnostic (cp_lexer_stop_debugging): Same. (cp_parser_userdef_numeric_literal): Quote a C++ header name in a diagnostic. (cp_parser_nested_name_specifier_opt): Quote keywords, operators, and types in diagnostics. (cp_parser_question_colon_clause): Same. (cp_parser_asm_definition): Same. (cp_parser_init_declarator): Same. (cp_parser_template_declaration_after_parameters): Avoid capitalizing a sentence in a diagnostic. (cp_parser_omp_declare_reduction): Quote keywords, operators, and types in diagnostics. (cp_parser_transaction): Same. * pt.c (maybe_process_partial_specialization): Replace second call to permerror with inform for consistency with other uses. (expand_integer_pack): Quote keywords, operators, and types in diagnostics. * rtti.c (get_typeid): Quote keywords, operators, and types in diagnostics. (build_dynamic_cast_1): Same. * semantics.c (finish_asm_stmt): Same. (finish_label_decl): Same. (finish_bases): Same. (finish_offsetof): Same. (cp_check_omp_declare_reduction): Same. (finish_decltype_type): Same. * tree.c (handle_init_priority_attribute): Same. Add detail to diagnostics. (maybe_warn_zero_as_null_pointer_constant): Same. * typeck.c (cp_build_binary_op): Quote keywords, operators, and types in diagnostics. (cp_build_unary_op): Same. (check_for_casting_away_constness): Same. (build_static_cast): Same. (build_const_cast_1): Same. (maybe_warn_about_returning_address_of_local): Same. (check_return_expr): Same. * typeck2.c (abstract_virtuals_error_sfinae): Same. (digest_init_r): Replace a tab with spaces in a diagnostic. (build_functional_cast): Quote keywords, operators, and types in diagnostics. gcc/d/ChangeLog: * d-builtins.cc (d_init_builtins): Quote keywords, operators, and types in diagnostics. * d-codegen.cc (get_array_length): Same. Replace can't with cannot. * d-convert.cc (convert_expr): Same. * d-frontend.cc (getTypeInfoType): Quote an option name in a diagnostic. * d-lang.cc (d_handle_option): Same. (d_parse_file): Same. * decl.cc: Remove a trailing period from a diagnostic. * expr.cc: Use a directive for an apostrophe. * toir.cc: Quote keywords, operators, and types in diagnostics. * typeinfo.cc (build_typeinfo): Quote an option name in a diagnostic. gcc/fortran/ChangeLog: * gfortranspec.c (append_arg): Spell out the word "argument." gcc/ChangeLog: * config/i386/i386-expand.c (get_element_number): Quote keywords and other internal names in diagnostics. Adjust other diagnostic formatting issues noted by -Wformat-diag. * config/i386/i386-features.c (ix86_mangle_function_version_assembler_name): Same. * config/i386/i386-options.c (ix86_handle_abi_attribute): Same. * config/i386/i386.c (ix86_function_type_abi): Same. (ix86_function_ms_hook_prologue): Same. (classify_argument): Same. (ix86_expand_prologue): Same. (ix86_md_asm_adjust): Same. (ix86_memmodel_check): Same. gcc/ChangeLog: * builtins.c (expand_builtin_atomic_always_lock_free): Quote identifiers, keywords, operators, and types in diagnostics. Correct quoting, spelling, and sentence capitalization issues. (expand_builtin_atomic_is_lock_free): Same. (fold_builtin_next_arg): Same. * cfgexpand.c (expand_one_var): Same. (tree_conflicts_with_clobbers_p): Same. (expand_asm_stmt): Same. (verify_loop_structure): Same. * cgraphunit.c (process_function_and_variable_attributes): Same. * collect-utils.c (collect_execute): Same. * collect2.c (maybe_run_lto_and_relink): Same. (is_lto_object_file): Same. (scan_prog_file): Same. * convert.c (convert_to_real_1): Same. * dwarf2out.c (dwarf2out_begin_prologue): Same. * except.c (verify_eh_tree): Same. * gcc.c (execute): Same. (eval_spec_function): Same. (run_attempt): Same. (driver::set_up_specs): Same. (compare_debug_auxbase_opt_spec_function): Same. * gcov-tool.c (unlink_gcda_file): Same. (do_merge): Same. (do_rewrite): Same. * gcse.c (gcse_or_cprop_is_too_expensive): Same. * gimplify.c (gimplify_asm_expr): Same. (gimplify_adjust_omp_clauses): Same. * hsa-gen.c (gen_hsa_addr_insns): Same. (gen_hsa_insns_for_load): Same. (gen_hsa_cmp_insn_from_gimple): Same. (gen_hsa_insns_for_operation_assignment): Same. (gen_get_level): Same. (gen_hsa_alloca): Same. (omp_simple_builtin::generate): Same. (gen_hsa_atomic_for_builtin): Same. (gen_hsa_insns_for_call): Same. * input.c (dump_location_info): Same. * ipa-devirt.c (compare_virtual_tables): Same. * ira.c (ira_setup_eliminable_regset): Same. * lra-assigns.c (lra_assign): Same. * lra-constraints.c (lra_constraints): Same. * lto-streamer-in.c (lto_input_mode_table): Same. * lto-wrapper.c (get_options_from_collect_gcc_options): Same. (merge_and_complain): Same. (compile_offload_image): Same. (compile_images_for_offload_targets): Same. (debug_objcopy): Same. (run_gcc): Same. (main): Same. * opts.c (print_specific_help): Same. (parse_no_sanitize_attribute): Same. (print_help): Same. (handle_param): Same. * plugin.c (add_new_plugin): Same. (parse_plugin_arg_opt): Same. (try_init_one_plugin): Same. * print-rtl.c (debug_bb_n_slim): Quote identifiers, keywords, operators, and types in diagnostics. Correct quoting and spelling issues. * read-rtl-function.c (parse_edge_flag_token): Same. (function_reader::parse_enum_value): Same. * reg-stack.c (check_asm_stack_operands): Same. * regcprop.c (validate_value_data): Same. * sched-rgn.c (make_pass_sched_fusion): Same. * stmt.c (check_unique_operand_names): Same. * targhooks.c (default_target_option_pragma_parse): Same. * tlink.c (recompile_files): Same. * toplev.c (process_options): Same. (do_compile): Same. * trans-mem.c (diagnose_tm_1): Same. (ipa_tm_scan_irr_block): Same. (ipa_tm_diagnose_transaction): Same. * tree-cfg.c (verify_address): Same. Use get_tree_code_name to format a tree code name in a diagnostic. (verify_types_in_gimple_min_lval): Same. (verify_types_in_gimple_reference): Same. (verify_gimple_call): Same. (verify_gimple_assign_unary): Same. (verify_gimple_assign_binary): Same. (verify_gimple_assign_ternary): Same. (verify_gimple_assign_single): Same. (verify_gimple_switch): Same. (verify_gimple_label): Same. (verify_gimple_phi): Same. (verify_gimple_in_seq): Same. (verify_eh_throw_stmt_node): Same. (collect_subblocks): Same. (gimple_verify_flow_info): Same. (do_warn_unused_result): Same. * tree-inline.c (expand_call_inline): Same. * tree-into-ssa.c (update_ssa): Same. * tree.c (tree_int_cst_elt_check_failed): Same. (tree_vec_elt_check_failed): Same. (omp_clause_operand_check_failed): Same. (verify_type_variant): Same. (verify_type): Same. * value-prof.c (verify_histograms): Same. * varasm.c (assemble_start_function): Same. gcc/lto/ChangeLog: * lto-dump.c (lto_main): Same. * lto.c (stream_out): Same. gcc/objc/ChangeLog: * objc-act.c (objc_begin_catch_clause): Quote keywords and options in diagnostics. (objc_build_throw_stmt): Same. (objc_finish_message_expr): Same. (get_super_receiver): Same. * objc-next-runtime-abi-01.c (objc_next_runtime_abi_01_init): Spell out "less than" in English./ * objc-next-runtime-abi-02.c (objc_next_runtime_abi_02_init): Spell out "greater" in English. gcc/testsuite/ChangeLog: * c-c++-common/Wbool-operation-1.c: Adjust text of expected diagnostics. * c-c++-common/Wvarargs-2.c: Same. * c-c++-common/Wvarargs.c: Same. * c-c++-common/pr51768.c: Same. * c-c++-common/tm/inline-asm.c: Same. * c-c++-common/tm/safe-1.c: Same. * g++.dg/asm-qual-1.C: Same. * g++.dg/asm-qual-3.C: Same. * g++.dg/conversion/dynamic1.C: Same. * g++.dg/cpp0x/constexpr-89599.C: Same. * g++.dg/cpp0x/constexpr-cast.C: Same. * g++.dg/cpp0x/constexpr-shift1.C: Same. * g++.dg/cpp0x/lambda/lambda-conv11.C: Same. * g++.dg/cpp0x/nullptr04.C: Same. * g++.dg/cpp0x/static_assert12.C: Same. * g++.dg/cpp0x/static_assert8.C: Same. * g++.dg/cpp1y/lambda-conv1.C: Same. * g++.dg/cpp1y/pr79393-3.C: Same. * g++.dg/cpp1y/static_assert1.C: Same. * g++.dg/cpp1z/constexpr-if4.C: Same. * g++.dg/cpp1z/constexpr-if5.C: Same. * g++.dg/cpp1z/constexpr-if9.C: Same. * g++.dg/eh/goto2.C: Same. * g++.dg/eh/goto3.C: Same. * g++.dg/expr/static_cast8.C: Same. * g++.dg/ext/flexary5.C: Same. * g++.dg/ext/utf-array-short-wchar.C: Same. * g++.dg/ext/utf-array.C: Same. * g++.dg/ext/utf8-2.C: Same. * g++.dg/gomp/loop-4.C: Same. * g++.dg/gomp/macro-4.C: Same. * g++.dg/gomp/udr-1.C: Same. * g++.dg/init/initializer-string-too-long.C: Same. * g++.dg/other/offsetof9.C: Same. * g++.dg/ubsan/pr63956.C: Same. * g++.dg/warn/Wbool-operation-1.C: Same. * g++.dg/warn/Wtype-limits-Wextra.C: Same. * g++.dg/warn/Wtype-limits.C: Same. * g++.dg/wrappers/pr88680.C: Same. * g++.old-deja/g++.mike/eh55.C: Same. * gcc.dg/Wsign-compare-1.c: Same. * gcc.dg/Wtype-limits-Wextra.c: Same. * gcc.dg/Wtype-limits.c: Same. * gcc.dg/Wunknownprag.c: Same. * gcc.dg/Wunsuffixed-float-constants-1.c: Same. * gcc.dg/asm-6.c: Same. * gcc.dg/asm-qual-1.c: Same. * gcc.dg/cast-1.c: Same. * gcc.dg/cast-2.c: Same. * gcc.dg/cast-3.c: Same. * gcc.dg/cpp/source_date_epoch-2.c: Same. * gcc.dg/debug/pr85252.c: Same. * gcc.dg/dfp/cast-bad.c: Same. * gcc.dg/format/gcc_diag-1.c: Same. * gcc.dg/format/gcc_diag-11.c: Same.New test. * gcc.dg/gcc_diag-11.c: Same.New test. * gcc.dg/gnu-cond-expr-2.c: Same. * gcc.dg/gnu-cond-expr-3.c: Same. * gcc.dg/gomp/macro-4.c: Same. * gcc.dg/init-bad-1.c: Same. * gcc.dg/init-bad-2.c: Same. * gcc.dg/init-bad-3.c: Same. * gcc.dg/pr27528.c: Same. * gcc.dg/pr48552-1.c: Same. * gcc.dg/pr48552-2.c: Same. * gcc.dg/pr59846.c: Same. * gcc.dg/pr61096-1.c: Same. * gcc.dg/pr8788-1.c: Same. * gcc.dg/pr90082.c: Same. * gcc.dg/simd-2.c: Same. * gcc.dg/spellcheck-params-2.c: Same. * gcc.dg/spellcheck-params.c: Same. * gcc.dg/strlenopt-49.c: Same. * gcc.dg/tm/pr52141.c: Same. * gcc.dg/torture/pr51106-1.c: Same. * gcc.dg/torture/pr51106-2.c: Same. * gcc.dg/utf-array-short-wchar.c: Same. * gcc.dg/utf-array.c: Same. * gcc.dg/utf8-2.c: Same. * gcc.dg/warn-sprintf-no-nul.c: Same. * gcc.target/i386/asm-flag-0.c: Same. * gcc.target/i386/inline_error.c: Same. * gcc.target/i386/pr30848.c: Same. * gcc.target/i386/pr39082-1.c: Same. * gcc.target/i386/pr39678.c: Same. * gcc.target/i386/pr57756.c: Same. * gcc.target/i386/pr68843-1.c: Same. * gcc.target/i386/pr79804.c: Same. * gcc.target/i386/pr82673.c: Same. * obj-c++.dg/class-protocol-1.mm: Same. * obj-c++.dg/exceptions-3.mm: Same. * obj-c++.dg/exceptions-4.mm: Same. * obj-c++.dg/exceptions-5.mm: Same. * obj-c++.dg/exceptions-6.mm: Same. * obj-c++.dg/method-12.mm: Same. * obj-c++.dg/method-13.mm: Same. * obj-c++.dg/method-6.mm: Same. * obj-c++.dg/method-7.mm: Same. * obj-c++.dg/method-9.mm: Same. * obj-c++.dg/method-lookup-1.mm: Same. * obj-c++.dg/proto-lossage-4.mm: Same. * obj-c++.dg/protocol-qualifier-2.mm: Same. * objc.dg/call-super-2.m: Same. * objc.dg/class-protocol-1.m: Same. * objc.dg/desig-init-1.m: Same. * objc.dg/exceptions-3.m: Same. * objc.dg/exceptions-4.m: Same. * objc.dg/exceptions-5.m: Same. * objc.dg/exceptions-6.m: Same. * objc.dg/method-19.m: Same. * objc.dg/method-2.m: Same. * objc.dg/method-5.m: Same. * objc.dg/method-6.m: Same. * objc.dg/method-7.m: Same. * objc.dg/method-lookup-1.m: Same. * objc.dg/proto-hier-1.m: Same. * objc.dg/proto-lossage-4.m: Same. From-SVN: r271338
2019-05-17 19:55:43 +02:00
"function versions cannot be marked as %<gnu_inline%>,"
" bodies have to be generated");
if (DECL_VIRTUAL_P (decl)
|| DECL_VINDEX (decl))
sorry ("virtual function multiversioning not supported");
version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
/* target attribute string cannot be NULL. */
gcc_assert (version_attr != NULL_TREE);
orig_name = IDENTIFIER_POINTER (id);
version_string
= TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
if (strcmp (version_string, "default") == 0)
return id;
attr_str = sorted_attr_string (TREE_VALUE (version_attr));
assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
sprintf (assembler_name, "%s.%s", orig_name, attr_str);
/* Allow assembler name to be modified if already set. */
if (DECL_ASSEMBLER_NAME_SET_P (decl))
SET_DECL_RTL (decl, NULL);
tree ret = get_identifier (assembler_name);
XDELETEVEC (attr_str);
XDELETEVEC (assembler_name);
return ret;
}
tree
ix86_mangle_decl_assembler_name (tree decl, tree id)
{
/* For function version, add the target suffix to the assembler name. */
if (TREE_CODE (decl) == FUNCTION_DECL
&& DECL_FUNCTION_VERSIONED (decl))
id = ix86_mangle_function_version_assembler_name (decl, id);
#ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
#endif
return id;
}
/* Make a dispatcher declaration for the multi-versioned function DECL.
Calls to DECL function will be replaced with calls to the dispatcher
by the front-end. Returns the decl of the dispatcher function. */
tree
ix86_get_function_versions_dispatcher (void *decl)
{
tree fn = (tree) decl;
struct cgraph_node *node = NULL;
struct cgraph_node *default_node = NULL;
struct cgraph_function_version_info *node_v = NULL;
struct cgraph_function_version_info *first_v = NULL;
tree dispatch_decl = NULL;
struct cgraph_function_version_info *default_version_info = NULL;
gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
node = cgraph_node::get (fn);
gcc_assert (node != NULL);
node_v = node->function_version ();
gcc_assert (node_v != NULL);
if (node_v->dispatcher_resolver != NULL)
return node_v->dispatcher_resolver;
/* Find the default version and make it the first node. */
first_v = node_v;
/* Go to the beginning of the chain. */
while (first_v->prev != NULL)
first_v = first_v->prev;
default_version_info = first_v;
while (default_version_info != NULL)
{
if (is_function_default_version
(default_version_info->this_node->decl))
break;
default_version_info = default_version_info->next;
}
/* If there is no default node, just return NULL. */
if (default_version_info == NULL)
return NULL;
/* Make default info the first node. */
if (first_v != default_version_info)
{
default_version_info->prev->next = default_version_info->next;
if (default_version_info->next)
default_version_info->next->prev = default_version_info->prev;
first_v->prev = default_version_info;
default_version_info->next = first_v;
default_version_info->prev = NULL;
}
default_node = default_version_info->this_node;
#if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
if (targetm.has_ifunc_p ())
{
struct cgraph_function_version_info *it_v = NULL;
struct cgraph_node *dispatcher_node = NULL;
struct cgraph_function_version_info *dispatcher_version_info = NULL;
/* Right now, the dispatching is done via ifunc. */
dispatch_decl = make_dispatcher_decl (default_node->decl);
dispatcher_node = cgraph_node::get_create (dispatch_decl);
gcc_assert (dispatcher_node != NULL);
dispatcher_node->dispatcher_function = 1;
dispatcher_version_info
= dispatcher_node->insert_new_function_version ();
dispatcher_version_info->next = default_version_info;
dispatcher_node->definition = 1;
/* Set the dispatcher for all the versions. */
it_v = default_version_info;
while (it_v != NULL)
{
it_v->dispatcher_resolver = dispatch_decl;
it_v = it_v->next;
}
}
else
#endif
{
error_at (DECL_SOURCE_LOCATION (default_node->decl),
c-decl.c (start_decl): Adjust quoting and hyphenation in diagnostics. gcc/c/ChangeLog: * c-decl.c (start_decl): Adjust quoting and hyphenation in diagnostics. (finish_decl): Same. (finish_enum): Same. (start_function): Same. (declspecs_add_type): Same. * c-parser.c (warn_for_abs): Same. * c-typeck.c (build_binary_op): Same. gcc/c-family/ChangeLog: * c-attribs.c (handle_mode_attribute): Adjust quoting and hyphenation. (handle_alias_ifunc_attribute): Same. (handle_copy_attribute): Same. (handle_weakref_attribute): Same. (handle_nonnull_attribute): Same. * c-warn.c (warn_for_sign_compare): Same. (warn_for_restrict): Same. * c.opt: Same. gcc/cp/ChangeLog: * call.c (build_conditional_expr_1): Adjust quoting and hyphenation. (convert_like_real): Same. (convert_arg_to_ellipsis): Same. * constexpr.c (diag_array_subscript): Same. * constraint.cc (diagnose_trait_expression): Same. * cvt.c (ocp_convert): Same. * decl.c (start_decl): Same. (check_for_uninitialized_const_var): Same. (grokfndecl): Same. (check_special_function_return_type): Same. (finish_enum_value_list): Same. (start_preparsed_function): Same. * parser.c (cp_parser_decl_specifier_seq): Same. * typeck.c (cp_build_binary_op): Same. (build_static_cast_1): Same. gcc/lto/ChangeLog: * lto-common.c (lto_file_finalize): Adjust quoting and hyphenation. gcc/objc/ChangeLog: * objc-act.c (objc_build_setter_call): Adjust quoting and hyphenation. * objc-encoding.c (encode_gnu_bitfield): Same. gcc/ChangeLog: * config/i386/i386-features.c (ix86_get_function_versions_dispatcher): Adjust quoting and hyphenation. * convert.c (convert_to_real_1): Same. * gcc.c (driver_wrong_lang_callback): Same. (driver::handle_unrecognized_options): Same. * gimple-ssa-nonnull-compare.c (do_warn_nonnull_compare): Same. * opts-common.c (cmdline_handle_error): Same. (read_cmdline_option): Same. * opts-global.c (complain_wrong_lang): Same. (print_ignored_options): Same. (handle_common_deferred_options): Same. * pretty-print.h: Same. * print-rtl.c (debug_bb_n_slim): Same. * sched-rgn.c (make_pass_sched_fusion): Same. * tree-cfg.c (verify_gimple_assign_unary): Same. (verify_gimple_label): Same. * tree-ssa-operands.c (verify_ssa_operands): Same. * varasm.c (do_assemble_alias): Same. (assemble_alias): Same. From-SVN: r271971
2019-06-05 20:30:48 +02:00
"multiversioning needs %<ifunc%> which is not supported "
"on this target");
}
return dispatch_decl;
}
/* Make the resolver function decl to dispatch the versions of
a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
ifunc alias that will point to the created resolver. Create an
empty basic block in the resolver and store the pointer in
EMPTY_BB. Return the decl of the resolver function. */
static tree
make_resolver_func (const tree default_decl,
const tree ifunc_alias_decl,
basic_block *empty_bb)
{
tree decl, type, t;
/* Create resolver function name based on default_decl. */
tree decl_name = clone_function_name (default_decl, "resolver");
const char *resolver_name = IDENTIFIER_POINTER (decl_name);
/* The resolver function should return a (void *). */
type = build_function_type_list (ptr_type_node, NULL_TREE);
decl = build_fn_decl (resolver_name, type);
SET_DECL_ASSEMBLER_NAME (decl, decl_name);
DECL_NAME (decl) = decl_name;
TREE_USED (decl) = 1;
DECL_ARTIFICIAL (decl) = 1;
DECL_IGNORED_P (decl) = 1;
TREE_PUBLIC (decl) = 0;
DECL_UNINLINABLE (decl) = 1;
/* Resolver is not external, body is generated. */
DECL_EXTERNAL (decl) = 0;
DECL_EXTERNAL (ifunc_alias_decl) = 0;
DECL_CONTEXT (decl) = NULL_TREE;
DECL_INITIAL (decl) = make_node (BLOCK);
DECL_STATIC_CONSTRUCTOR (decl) = 0;
if (DECL_COMDAT_GROUP (default_decl)
|| TREE_PUBLIC (default_decl))
{
/* In this case, each translation unit with a call to this
versioned function will put out a resolver. Ensure it
is comdat to keep just one copy. */
DECL_COMDAT (decl) = 1;
make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
}
else
TREE_PUBLIC (ifunc_alias_decl) = 0;
/* Build result decl and add to function_decl. */
t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
DECL_CONTEXT (t) = decl;
DECL_ARTIFICIAL (t) = 1;
DECL_IGNORED_P (t) = 1;
DECL_RESULT (decl) = t;
gimplify_function_tree (decl);
push_cfun (DECL_STRUCT_FUNCTION (decl));
*empty_bb = init_lowered_empty_function (decl, false,
profile_count::uninitialized ());
cgraph_node::add_new_function (decl, true);
symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
pop_cfun ();
gcc_assert (ifunc_alias_decl != NULL);
/* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
DECL_ATTRIBUTES (ifunc_alias_decl)
= make_attribute ("ifunc", resolver_name,
DECL_ATTRIBUTES (ifunc_alias_decl));
/* Create the alias for dispatch to resolver here. */
cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
return decl;
}
/* Generate the dispatching code body to dispatch multi-versioned function
DECL. The target hook is called to process the "target" attributes and
provide the code to dispatch the right function at run-time. NODE points
to the dispatcher decl whose body will be created. */
tree
ix86_generate_version_dispatcher_body (void *node_p)
{
tree resolver_decl;
basic_block empty_bb;
tree default_ver_decl;
struct cgraph_node *versn;
struct cgraph_node *node;
struct cgraph_function_version_info *node_version_info = NULL;
struct cgraph_function_version_info *versn_info = NULL;
node = (cgraph_node *)node_p;
node_version_info = node->function_version ();
gcc_assert (node->dispatcher_function
&& node_version_info != NULL);
if (node_version_info->dispatcher_resolver)
return node_version_info->dispatcher_resolver;
/* The first version in the chain corresponds to the default version. */
default_ver_decl = node_version_info->next->this_node->decl;
/* node is going to be an alias, so remove the finalized bit. */
node->definition = false;
resolver_decl = make_resolver_func (default_ver_decl,
node->decl, &empty_bb);
node_version_info->dispatcher_resolver = resolver_decl;
push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
auto_vec<tree, 2> fn_ver_vec;
for (versn_info = node_version_info->next; versn_info;
versn_info = versn_info->next)
{
versn = versn_info->this_node;
/* Check for virtual functions here again, as by this time it should
have been determined if this function needs a vtable index or
not. This happens for methods in derived classes that override
virtual methods in base classes but are not explicitly marked as
virtual. */
if (DECL_VINDEX (versn->decl))
sorry ("virtual function multiversioning not supported");
fn_ver_vec.safe_push (versn->decl);
}
dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
cgraph_edge::rebuild_edges ();
pop_cfun ();
return resolver_decl;
}