Diffstat (limited to 'contrib/gcc/config/ia64/ia64.c')
-rw-r--r-- | contrib/gcc/config/ia64/ia64.c | 5224
1 file changed, 2836 insertions, 2388 deletions
diff --git a/contrib/gcc/config/ia64/ia64.c b/contrib/gcc/config/ia64/ia64.c index 12f3204..19c5e92 100644 --- a/contrib/gcc/config/ia64/ia64.c +++ b/contrib/gcc/config/ia64/ia64.c @@ -1,27 +1,30 @@ /* Definitions of target machine for GNU compiler. - Copyright (C) 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc. + Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004 + Free Software Foundation, Inc. Contributed by James E. Wilson <wilson@cygnus.com> and - David Mosberger <davidm@hpl.hp.com>. + David Mosberger <davidm@hpl.hp.com>. -This file is part of GNU CC. +This file is part of GCC. -GNU CC is free software; you can redistribute it and/or modify +GCC is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. -GNU CC is distributed in the hope that it will be useful, +GCC is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with GNU CC; see the file COPYING. If not, write to +along with GCC; see the file COPYING. If not, write to the Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "config.h" #include "system.h" +#include "coretypes.h" +#include "tm.h" #include "rtl.h" #include "tree.h" #include "regs.h" @@ -45,7 +48,9 @@ Boston, MA 02111-1307, USA. */ #include "target.h" #include "target-def.h" #include "tm_p.h" +#include "hashtab.h" #include "langhooks.h" +#include "cfglayout.h" /* This is used for communication between ASM_OUTPUT_LABEL and ASM_OUTPUT_LABELREF. */ @@ -102,6 +107,12 @@ int ia64_tls_size = 22; /* String used with the -mtls-size= option. */ const char *ia64_tls_size_string; +/* Which cpu are we scheduling for. */ +enum processor_type ia64_tune; + +/* String used with the -tune= option. */ +const char *ia64_tune_string; + /* Determines whether we run our final scheduling pass or not. We always avoid the normal second scheduling pass. */ static int ia64_flag_schedule_insns2; @@ -111,6 +122,10 @@ static int ia64_flag_schedule_insns2; unsigned int ia64_section_threshold; +/* The following variable is used by the DFA insn scheduler. The value is + TRUE if we do insn bundling instead of insn scheduling. */ +int bundling_p = 0; + /* Structure to be filled in by ia64_compute_frame_size with register save masks and offsets for the current function. */ @@ -122,7 +137,7 @@ struct ia64_frame_info HOST_WIDE_INT spill_size; /* size of the gr/br/fr spill area. */ HOST_WIDE_INT extra_spill_size; /* size of spill area for others. */ HARD_REG_SET mask; /* mask of saved registers. */ - unsigned int gr_used_mask; /* mask of registers in use as gr spill + unsigned int gr_used_mask; /* mask of registers in use as gr spill registers or long-term scratches. */ int n_spilled; /* number of spilled registers. */ int reg_fp; /* register for fp. */ @@ -144,84 +159,118 @@ struct ia64_frame_info /* Current frame information calculated by ia64_compute_frame_size. 
*/ static struct ia64_frame_info current_frame_info; -static rtx gen_tls_get_addr PARAMS ((void)); -static rtx gen_thread_pointer PARAMS ((void)); -static int find_gr_spill PARAMS ((int)); -static int next_scratch_gr_reg PARAMS ((void)); -static void mark_reg_gr_used_mask PARAMS ((rtx, void *)); -static void ia64_compute_frame_size PARAMS ((HOST_WIDE_INT)); -static void setup_spill_pointers PARAMS ((int, rtx, HOST_WIDE_INT)); -static void finish_spill_pointers PARAMS ((void)); -static rtx spill_restore_mem PARAMS ((rtx, HOST_WIDE_INT)); -static void do_spill PARAMS ((rtx (*)(rtx, rtx, rtx), rtx, HOST_WIDE_INT, rtx)); -static void do_restore PARAMS ((rtx (*)(rtx, rtx, rtx), rtx, HOST_WIDE_INT)); -static rtx gen_movdi_x PARAMS ((rtx, rtx, rtx)); -static rtx gen_fr_spill_x PARAMS ((rtx, rtx, rtx)); -static rtx gen_fr_restore_x PARAMS ((rtx, rtx, rtx)); - -static enum machine_mode hfa_element_mode PARAMS ((tree, int)); -static void fix_range PARAMS ((const char *)); -static struct machine_function * ia64_init_machine_status PARAMS ((void)); -static void emit_insn_group_barriers PARAMS ((FILE *, rtx)); -static void emit_all_insn_group_barriers PARAMS ((FILE *, rtx)); -static void emit_predicate_relation_info PARAMS ((void)); -static bool ia64_in_small_data_p PARAMS ((tree)); -static void ia64_encode_section_info PARAMS ((tree, int)); -static const char *ia64_strip_name_encoding PARAMS ((const char *)); -static void process_epilogue PARAMS ((void)); -static int process_set PARAMS ((FILE *, rtx)); - -static rtx ia64_expand_fetch_and_op PARAMS ((optab, enum machine_mode, - tree, rtx)); -static rtx ia64_expand_op_and_fetch PARAMS ((optab, enum machine_mode, - tree, rtx)); -static rtx ia64_expand_compare_and_swap PARAMS ((enum machine_mode, - enum machine_mode, - int, tree, rtx)); -static rtx ia64_expand_lock_test_and_set PARAMS ((enum machine_mode, - tree, rtx)); -static rtx ia64_expand_lock_release PARAMS ((enum machine_mode, tree, rtx)); -static bool ia64_assemble_integer PARAMS ((rtx, unsigned int, int)); -static void ia64_output_function_prologue PARAMS ((FILE *, HOST_WIDE_INT)); -static void ia64_output_function_epilogue PARAMS ((FILE *, HOST_WIDE_INT)); -static void ia64_output_function_end_prologue PARAMS ((FILE *)); - -static int ia64_issue_rate PARAMS ((void)); -static int ia64_adjust_cost PARAMS ((rtx, rtx, rtx, int)); -static void ia64_sched_init PARAMS ((FILE *, int, int)); -static void ia64_sched_finish PARAMS ((FILE *, int)); -static int ia64_internal_sched_reorder PARAMS ((FILE *, int, rtx *, - int *, int, int)); -static int ia64_sched_reorder PARAMS ((FILE *, int, rtx *, int *, int)); -static int ia64_sched_reorder2 PARAMS ((FILE *, int, rtx *, int *, int)); -static int ia64_variable_issue PARAMS ((FILE *, int, rtx, int)); - -static void ia64_output_mi_thunk PARAMS ((FILE *, tree, HOST_WIDE_INT, - HOST_WIDE_INT, tree)); - -static void ia64_select_rtx_section PARAMS ((enum machine_mode, rtx, - unsigned HOST_WIDE_INT)); -static void ia64_rwreloc_select_section PARAMS ((tree, int, - unsigned HOST_WIDE_INT)) +static int ia64_use_dfa_pipeline_interface (void); +static int ia64_first_cycle_multipass_dfa_lookahead (void); +static void ia64_dependencies_evaluation_hook (rtx, rtx); +static void ia64_init_dfa_pre_cycle_insn (void); +static rtx ia64_dfa_pre_cycle_insn (void); +static int ia64_first_cycle_multipass_dfa_lookahead_guard (rtx); +static int ia64_dfa_new_cycle (FILE *, int, rtx, int, int, int *); +static rtx gen_tls_get_addr (void); +static rtx gen_thread_pointer (void); +static rtx 
ia64_expand_tls_address (enum tls_model, rtx, rtx); +static int find_gr_spill (int); +static int next_scratch_gr_reg (void); +static void mark_reg_gr_used_mask (rtx, void *); +static void ia64_compute_frame_size (HOST_WIDE_INT); +static void setup_spill_pointers (int, rtx, HOST_WIDE_INT); +static void finish_spill_pointers (void); +static rtx spill_restore_mem (rtx, HOST_WIDE_INT); +static void do_spill (rtx (*)(rtx, rtx, rtx), rtx, HOST_WIDE_INT, rtx); +static void do_restore (rtx (*)(rtx, rtx, rtx), rtx, HOST_WIDE_INT); +static rtx gen_movdi_x (rtx, rtx, rtx); +static rtx gen_fr_spill_x (rtx, rtx, rtx); +static rtx gen_fr_restore_x (rtx, rtx, rtx); + +static enum machine_mode hfa_element_mode (tree, int); +static bool ia64_function_ok_for_sibcall (tree, tree); +static bool ia64_rtx_costs (rtx, int, int, int *); +static void fix_range (const char *); +static struct machine_function * ia64_init_machine_status (void); +static void emit_insn_group_barriers (FILE *); +static void emit_all_insn_group_barriers (FILE *); +static void final_emit_insn_group_barriers (FILE *); +static void emit_predicate_relation_info (void); +static void ia64_reorg (void); +static bool ia64_in_small_data_p (tree); +static void process_epilogue (void); +static int process_set (FILE *, rtx); + +static rtx ia64_expand_fetch_and_op (optab, enum machine_mode, tree, rtx); +static rtx ia64_expand_op_and_fetch (optab, enum machine_mode, tree, rtx); +static rtx ia64_expand_compare_and_swap (enum machine_mode, enum machine_mode, + int, tree, rtx); +static rtx ia64_expand_lock_test_and_set (enum machine_mode, tree, rtx); +static rtx ia64_expand_lock_release (enum machine_mode, tree, rtx); +static bool ia64_assemble_integer (rtx, unsigned int, int); +static void ia64_output_function_prologue (FILE *, HOST_WIDE_INT); +static void ia64_output_function_epilogue (FILE *, HOST_WIDE_INT); +static void ia64_output_function_end_prologue (FILE *); + +static int ia64_issue_rate (void); +static int ia64_adjust_cost (rtx, rtx, rtx, int); +static void ia64_sched_init (FILE *, int, int); +static void ia64_sched_finish (FILE *, int); +static int ia64_dfa_sched_reorder (FILE *, int, rtx *, int *, int, int); +static int ia64_sched_reorder (FILE *, int, rtx *, int *, int); +static int ia64_sched_reorder2 (FILE *, int, rtx *, int *, int); +static int ia64_variable_issue (FILE *, int, rtx, int); + +static struct bundle_state *get_free_bundle_state (void); +static void free_bundle_state (struct bundle_state *); +static void initiate_bundle_states (void); +static void finish_bundle_states (void); +static unsigned bundle_state_hash (const void *); +static int bundle_state_eq_p (const void *, const void *); +static int insert_bundle_state (struct bundle_state *); +static void initiate_bundle_state_table (void); +static void finish_bundle_state_table (void); +static int try_issue_nops (struct bundle_state *, int); +static int try_issue_insn (struct bundle_state *, rtx); +static void issue_nops_and_insn (struct bundle_state *, int, rtx, int, int); +static int get_max_pos (state_t); +static int get_template (state_t, int); + +static rtx get_next_important_insn (rtx, rtx); +static void bundling (FILE *, int, rtx, rtx); + +static void ia64_output_mi_thunk (FILE *, tree, HOST_WIDE_INT, + HOST_WIDE_INT, tree); +static void ia64_file_start (void); + +static void ia64_select_rtx_section (enum machine_mode, rtx, + unsigned HOST_WIDE_INT); +static void ia64_rwreloc_select_section (tree, int, unsigned HOST_WIDE_INT) ATTRIBUTE_UNUSED; -static void 
ia64_rwreloc_unique_section PARAMS ((tree, int)) +static void ia64_rwreloc_unique_section (tree, int) ATTRIBUTE_UNUSED; -static void ia64_rwreloc_select_rtx_section PARAMS ((enum machine_mode, rtx, - unsigned HOST_WIDE_INT)) +static void ia64_rwreloc_select_rtx_section (enum machine_mode, rtx, + unsigned HOST_WIDE_INT) ATTRIBUTE_UNUSED; -static unsigned int ia64_rwreloc_section_type_flags - PARAMS ((tree, const char *, int)) +static unsigned int ia64_rwreloc_section_type_flags (tree, const char *, int) ATTRIBUTE_UNUSED; -static void ia64_hpux_add_extern_decl PARAMS ((const char *name)) +static void ia64_hpux_add_extern_decl (tree decl) + ATTRIBUTE_UNUSED; +static void ia64_hpux_file_end (void) + ATTRIBUTE_UNUSED; +static void ia64_hpux_init_libfuncs (void) ATTRIBUTE_UNUSED; +static void ia64_vms_init_libfuncs (void) + ATTRIBUTE_UNUSED; + +static tree ia64_handle_model_attribute (tree *, tree, tree, int, bool *); +static void ia64_encode_section_info (tree, rtx, int); +static rtx ia64_struct_value_rtx (tree, int); + /* Table of valid machine attributes. */ static const struct attribute_spec ia64_attribute_table[] = { /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler } */ { "syscall_linkage", 0, 0, false, true, true, NULL }, - { NULL, 0, 0, false, false, false, NULL } + { "model", 1, 1, true, false, false, ia64_handle_model_attribute }, + { NULL, 0, 0, false, false, false, NULL } }; /* Initialize the GCC target structure. */ @@ -260,10 +309,6 @@ static const struct attribute_spec ia64_attribute_table[] = #undef TARGET_IN_SMALL_DATA_P #define TARGET_IN_SMALL_DATA_P ia64_in_small_data_p -#undef TARGET_ENCODE_SECTION_INFO -#define TARGET_ENCODE_SECTION_INFO ia64_encode_section_info -#undef TARGET_STRIP_NAME_ENCODING -#define TARGET_STRIP_NAME_ENCODING ia64_strip_name_encoding #undef TARGET_SCHED_ADJUST_COST #define TARGET_SCHED_ADJUST_COST ia64_adjust_cost @@ -280,24 +325,58 @@ static const struct attribute_spec ia64_attribute_table[] = #undef TARGET_SCHED_REORDER2 #define TARGET_SCHED_REORDER2 ia64_sched_reorder2 -#ifdef HAVE_AS_TLS -#undef TARGET_HAVE_TLS -#define TARGET_HAVE_TLS true -#endif +#undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK +#define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK ia64_dependencies_evaluation_hook + +#undef TARGET_SCHED_USE_DFA_PIPELINE_INTERFACE +#define TARGET_SCHED_USE_DFA_PIPELINE_INTERFACE ia64_use_dfa_pipeline_interface + +#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD +#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD ia64_first_cycle_multipass_dfa_lookahead + +#undef TARGET_SCHED_INIT_DFA_PRE_CYCLE_INSN +#define TARGET_SCHED_INIT_DFA_PRE_CYCLE_INSN ia64_init_dfa_pre_cycle_insn +#undef TARGET_SCHED_DFA_PRE_CYCLE_INSN +#define TARGET_SCHED_DFA_PRE_CYCLE_INSN ia64_dfa_pre_cycle_insn + +#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD +#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD\ + ia64_first_cycle_multipass_dfa_lookahead_guard + +#undef TARGET_SCHED_DFA_NEW_CYCLE +#define TARGET_SCHED_DFA_NEW_CYCLE ia64_dfa_new_cycle + +#undef TARGET_FUNCTION_OK_FOR_SIBCALL +#define TARGET_FUNCTION_OK_FOR_SIBCALL ia64_function_ok_for_sibcall #undef TARGET_ASM_OUTPUT_MI_THUNK #define TARGET_ASM_OUTPUT_MI_THUNK ia64_output_mi_thunk #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK #define TARGET_ASM_CAN_OUTPUT_MI_THUNK hook_bool_tree_hwi_hwi_tree_true +#undef TARGET_ASM_FILE_START +#define TARGET_ASM_FILE_START ia64_file_start + +#undef TARGET_RTX_COSTS +#define TARGET_RTX_COSTS ia64_rtx_costs +#undef TARGET_ADDRESS_COST 
+#define TARGET_ADDRESS_COST hook_int_rtx_0 + +#undef TARGET_MACHINE_DEPENDENT_REORG +#define TARGET_MACHINE_DEPENDENT_REORG ia64_reorg + +#undef TARGET_ENCODE_SECTION_INFO +#define TARGET_ENCODE_SECTION_INFO ia64_encode_section_info + +#undef TARGET_STRUCT_VALUE_RTX +#define TARGET_STRUCT_VALUE_RTX ia64_struct_value_rtx + struct gcc_target targetm = TARGET_INITIALIZER; /* Return 1 if OP is a valid operand for the MEM of a CALL insn. */ int -call_operand (op, mode) - rtx op; - enum machine_mode mode; +call_operand (rtx op, enum machine_mode mode) { if (mode != GET_MODE (op) && mode != VOIDmode) return 0; @@ -309,9 +388,7 @@ call_operand (op, mode) /* Return 1 if OP refers to a symbol in the sdata section. */ int -sdata_symbolic_operand (op, mode) - rtx op; - enum machine_mode mode ATTRIBUTE_UNUSED; +sdata_symbolic_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) { switch (GET_CODE (op)) { @@ -326,10 +403,7 @@ sdata_symbolic_operand (op, mode) if (CONSTANT_POOL_ADDRESS_P (op)) return GET_MODE_SIZE (get_pool_mode (op)) <= ia64_section_threshold; else - { - const char *str = XSTR (op, 0); - return (str[0] == ENCODE_SECTION_INFO_CHAR && str[1] == 's'); - } + return SYMBOL_REF_LOCAL_P (op) && SYMBOL_REF_SMALL_P (op); default: break; @@ -338,12 +412,16 @@ sdata_symbolic_operand (op, mode) return 0; } +int +small_addr_symbolic_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) +{ + return SYMBOL_REF_SMALL_ADDR_P (op); +} + /* Return 1 if OP refers to a symbol, and is appropriate for a GOT load. */ int -got_symbolic_operand (op, mode) - rtx op; - enum machine_mode mode ATTRIBUTE_UNUSED; +got_symbolic_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) { switch (GET_CODE (op)) { @@ -374,6 +452,8 @@ got_symbolic_operand (op, mode) return (INTVAL (op) & 0x3fff) == 0; case SYMBOL_REF: + if (SYMBOL_REF_SMALL_ADDR_P (op)) + return 0; case LABEL_REF: return 1; @@ -386,9 +466,7 @@ got_symbolic_operand (op, mode) /* Return 1 if OP refers to a symbol. */ int -symbolic_operand (op, mode) - rtx op; - enum machine_mode mode ATTRIBUTE_UNUSED; +symbolic_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) { switch (GET_CODE (op)) { @@ -406,40 +484,20 @@ symbolic_operand (op, mode) /* Return tls_model if OP refers to a TLS symbol. */ int -tls_symbolic_operand (op, mode) - rtx op; - enum machine_mode mode ATTRIBUTE_UNUSED; +tls_symbolic_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) { - const char *str; - if (GET_CODE (op) != SYMBOL_REF) return 0; - str = XSTR (op, 0); - if (str[0] != ENCODE_SECTION_INFO_CHAR) - return 0; - switch (str[1]) - { - case 'G': - return TLS_MODEL_GLOBAL_DYNAMIC; - case 'L': - return TLS_MODEL_LOCAL_DYNAMIC; - case 'i': - return TLS_MODEL_INITIAL_EXEC; - case 'l': - return TLS_MODEL_LOCAL_EXEC; - } - return 0; + return SYMBOL_REF_TLS_MODEL (op); } /* Return 1 if OP refers to a function. */ int -function_operand (op, mode) - rtx op; - enum machine_mode mode ATTRIBUTE_UNUSED; +function_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) { - if (GET_CODE (op) == SYMBOL_REF && SYMBOL_REF_FLAG (op)) + if (GET_CODE (op) == SYMBOL_REF && SYMBOL_REF_FUNCTION_P (op)) return 1; else return 0; @@ -450,9 +508,7 @@ function_operand (op, mode) /* ??? This is an unsatisfying solution. Should rethink. 
*/ int -setjmp_operand (op, mode) - rtx op; - enum machine_mode mode ATTRIBUTE_UNUSED; +setjmp_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) { const char *name; int retval = 0; @@ -495,29 +551,18 @@ setjmp_operand (op, mode) return retval; } -/* Return 1 if OP is a general operand, but when pic exclude symbolic - operands. */ - -/* ??? If we drop no-pic support, can delete SYMBOL_REF, CONST, and LABEL_REF - from PREDICATE_CODES. */ +/* Return 1 if OP is a general operand, excluding tls symbolic operands. */ int -move_operand (op, mode) - rtx op; - enum machine_mode mode; +move_operand (rtx op, enum machine_mode mode) { - if (! TARGET_NO_PIC && symbolic_operand (op, mode)) - return 0; - - return general_operand (op, mode); + return general_operand (op, mode) && !tls_symbolic_operand (op, mode); } /* Return 1 if OP is a register operand that is (or could be) a GR reg. */ int -gr_register_operand (op, mode) - rtx op; - enum machine_mode mode; +gr_register_operand (rtx op, enum machine_mode mode) { if (! register_operand (op, mode)) return 0; @@ -535,9 +580,7 @@ gr_register_operand (op, mode) /* Return 1 if OP is a register operand that is (or could be) an FR reg. */ int -fr_register_operand (op, mode) - rtx op; - enum machine_mode mode; +fr_register_operand (rtx op, enum machine_mode mode) { if (! register_operand (op, mode)) return 0; @@ -555,9 +598,7 @@ fr_register_operand (op, mode) /* Return 1 if OP is a register operand that is (or could be) a GR/FR reg. */ int -grfr_register_operand (op, mode) - rtx op; - enum machine_mode mode; +grfr_register_operand (rtx op, enum machine_mode mode) { if (! register_operand (op, mode)) return 0; @@ -575,9 +616,7 @@ grfr_register_operand (op, mode) /* Return 1 if OP is a nonimmediate operand that is (or could be) a GR reg. */ int -gr_nonimmediate_operand (op, mode) - rtx op; - enum machine_mode mode; +gr_nonimmediate_operand (rtx op, enum machine_mode mode) { if (! nonimmediate_operand (op, mode)) return 0; @@ -595,9 +634,7 @@ gr_nonimmediate_operand (op, mode) /* Return 1 if OP is a nonimmediate operand that is (or could be) a FR reg. */ int -fr_nonimmediate_operand (op, mode) - rtx op; - enum machine_mode mode; +fr_nonimmediate_operand (rtx op, enum machine_mode mode) { if (! nonimmediate_operand (op, mode)) return 0; @@ -615,9 +652,7 @@ fr_nonimmediate_operand (op, mode) /* Return 1 if OP is a nonimmediate operand that is a GR/FR reg. */ int -grfr_nonimmediate_operand (op, mode) - rtx op; - enum machine_mode mode; +grfr_nonimmediate_operand (rtx op, enum machine_mode mode) { if (! nonimmediate_operand (op, mode)) return 0; @@ -635,9 +670,7 @@ grfr_nonimmediate_operand (op, mode) /* Return 1 if OP is a GR register operand, or zero. */ int -gr_reg_or_0_operand (op, mode) - rtx op; - enum machine_mode mode; +gr_reg_or_0_operand (rtx op, enum machine_mode mode) { return (op == const0_rtx || gr_register_operand (op, mode)); } @@ -645,9 +678,7 @@ gr_reg_or_0_operand (op, mode) /* Return 1 if OP is a GR register operand, or a 5 bit immediate operand. */ int -gr_reg_or_5bit_operand (op, mode) - rtx op; - enum machine_mode mode; +gr_reg_or_5bit_operand (rtx op, enum machine_mode mode) { return ((GET_CODE (op) == CONST_INT && INTVAL (op) >= 0 && INTVAL (op) < 32) || GET_CODE (op) == CONSTANT_P_RTX @@ -657,9 +688,7 @@ gr_reg_or_5bit_operand (op, mode) /* Return 1 if OP is a GR register operand, or a 6 bit immediate operand. 
*/ int -gr_reg_or_6bit_operand (op, mode) - rtx op; - enum machine_mode mode; +gr_reg_or_6bit_operand (rtx op, enum machine_mode mode) { return ((GET_CODE (op) == CONST_INT && CONST_OK_FOR_M (INTVAL (op))) || GET_CODE (op) == CONSTANT_P_RTX @@ -669,9 +698,7 @@ gr_reg_or_6bit_operand (op, mode) /* Return 1 if OP is a GR register operand, or an 8 bit immediate operand. */ int -gr_reg_or_8bit_operand (op, mode) - rtx op; - enum machine_mode mode; +gr_reg_or_8bit_operand (rtx op, enum machine_mode mode) { return ((GET_CODE (op) == CONST_INT && CONST_OK_FOR_K (INTVAL (op))) || GET_CODE (op) == CONSTANT_P_RTX @@ -681,9 +708,7 @@ gr_reg_or_8bit_operand (op, mode) /* Return 1 if OP is a GR/FR register operand, or an 8 bit immediate. */ int -grfr_reg_or_8bit_operand (op, mode) - rtx op; - enum machine_mode mode; +grfr_reg_or_8bit_operand (rtx op, enum machine_mode mode) { return ((GET_CODE (op) == CONST_INT && CONST_OK_FOR_K (INTVAL (op))) || GET_CODE (op) == CONSTANT_P_RTX @@ -694,9 +719,7 @@ grfr_reg_or_8bit_operand (op, mode) operand. */ int -gr_reg_or_8bit_adjusted_operand (op, mode) - rtx op; - enum machine_mode mode; +gr_reg_or_8bit_adjusted_operand (rtx op, enum machine_mode mode) { return ((GET_CODE (op) == CONST_INT && CONST_OK_FOR_L (INTVAL (op))) || GET_CODE (op) == CONSTANT_P_RTX @@ -709,9 +732,7 @@ gr_reg_or_8bit_adjusted_operand (op, mode) so we need the union of the immediates accepted by GT and LT. */ int -gr_reg_or_8bit_and_adjusted_operand (op, mode) - rtx op; - enum machine_mode mode; +gr_reg_or_8bit_and_adjusted_operand (rtx op, enum machine_mode mode) { return ((GET_CODE (op) == CONST_INT && CONST_OK_FOR_K (INTVAL (op)) && CONST_OK_FOR_L (INTVAL (op))) @@ -722,9 +743,7 @@ gr_reg_or_8bit_and_adjusted_operand (op, mode) /* Return 1 if OP is a register operand, or a 14 bit immediate operand. */ int -gr_reg_or_14bit_operand (op, mode) - rtx op; - enum machine_mode mode; +gr_reg_or_14bit_operand (rtx op, enum machine_mode mode) { return ((GET_CODE (op) == CONST_INT && CONST_OK_FOR_I (INTVAL (op))) || GET_CODE (op) == CONSTANT_P_RTX @@ -734,9 +753,7 @@ gr_reg_or_14bit_operand (op, mode) /* Return 1 if OP is a register operand, or a 22 bit immediate operand. */ int -gr_reg_or_22bit_operand (op, mode) - rtx op; - enum machine_mode mode; +gr_reg_or_22bit_operand (rtx op, enum machine_mode mode) { return ((GET_CODE (op) == CONST_INT && CONST_OK_FOR_J (INTVAL (op))) || GET_CODE (op) == CONSTANT_P_RTX @@ -746,9 +763,7 @@ gr_reg_or_22bit_operand (op, mode) /* Return 1 if OP is a 6 bit immediate operand. */ int -shift_count_operand (op, mode) - rtx op; - enum machine_mode mode ATTRIBUTE_UNUSED; +shift_count_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) { return ((GET_CODE (op) == CONST_INT && CONST_OK_FOR_M (INTVAL (op))) || GET_CODE (op) == CONSTANT_P_RTX); @@ -757,9 +772,7 @@ shift_count_operand (op, mode) /* Return 1 if OP is a 5 bit immediate operand. */ int -shift_32bit_count_operand (op, mode) - rtx op; - enum machine_mode mode ATTRIBUTE_UNUSED; +shift_32bit_count_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) { return ((GET_CODE (op) == CONST_INT && (INTVAL (op) >= 0 && INTVAL (op) < 32)) @@ -769,9 +782,7 @@ shift_32bit_count_operand (op, mode) /* Return 1 if OP is a 2, 4, 8, or 16 immediate operand. 
*/ int -shladd_operand (op, mode) - rtx op; - enum machine_mode mode ATTRIBUTE_UNUSED; +shladd_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) { return (GET_CODE (op) == CONST_INT && (INTVAL (op) == 2 || INTVAL (op) == 4 @@ -781,9 +792,7 @@ shladd_operand (op, mode) /* Return 1 if OP is a -16, -8, -4, -1, 1, 4, 8, or 16 immediate operand. */ int -fetchadd_operand (op, mode) - rtx op; - enum machine_mode mode ATTRIBUTE_UNUSED; +fetchadd_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) { return (GET_CODE (op) == CONST_INT && (INTVAL (op) == -16 || INTVAL (op) == -8 || @@ -795,9 +804,7 @@ fetchadd_operand (op, mode) /* Return 1 if OP is a floating-point constant zero, one, or a register. */ int -fr_reg_or_fp01_operand (op, mode) - rtx op; - enum machine_mode mode; +fr_reg_or_fp01_operand (rtx op, enum machine_mode mode) { return ((GET_CODE (op) == CONST_DOUBLE && CONST_DOUBLE_OK_FOR_G (op)) || fr_register_operand (op, mode)); @@ -807,9 +814,7 @@ fr_reg_or_fp01_operand (op, mode) POST_MODIFY with a REG as displacement. */ int -destination_operand (op, mode) - rtx op; - enum machine_mode mode; +destination_operand (rtx op, enum machine_mode mode) { if (! nonimmediate_operand (op, mode)) return 0; @@ -823,21 +828,17 @@ destination_operand (op, mode) /* Like memory_operand, but don't allow post-increments. */ int -not_postinc_memory_operand (op, mode) - rtx op; - enum machine_mode mode; +not_postinc_memory_operand (rtx op, enum machine_mode mode) { return (memory_operand (op, mode) && GET_RTX_CLASS (GET_CODE (XEXP (op, 0))) != 'a'); } -/* Return 1 if this is a comparison operator, which accepts an normal 8-bit +/* Return 1 if this is a comparison operator, which accepts a normal 8-bit signed immediate operand. */ int -normal_comparison_operator (op, mode) - register rtx op; - enum machine_mode mode; +normal_comparison_operator (register rtx op, enum machine_mode mode) { enum rtx_code code = GET_CODE (op); return ((mode == VOIDmode || GET_MODE (op) == mode) @@ -849,9 +850,7 @@ normal_comparison_operator (op, mode) signed immediate operand. */ int -adjusted_comparison_operator (op, mode) - register rtx op; - enum machine_mode mode; +adjusted_comparison_operator (register rtx op, enum machine_mode mode) { enum rtx_code code = GET_CODE (op); return ((mode == VOIDmode || GET_MODE (op) == mode) @@ -861,9 +860,7 @@ adjusted_comparison_operator (op, mode) /* Return 1 if this is a signed inequality operator. */ int -signed_inequality_operator (op, mode) - register rtx op; - enum machine_mode mode; +signed_inequality_operator (register rtx op, enum machine_mode mode) { enum rtx_code code = GET_CODE (op); return ((mode == VOIDmode || GET_MODE (op) == mode) @@ -874,9 +871,7 @@ signed_inequality_operator (op, mode) /* Return 1 if this operator is valid for predication. */ int -predicate_operator (op, mode) - register rtx op; - enum machine_mode mode; +predicate_operator (register rtx op, enum machine_mode mode) { enum rtx_code code = GET_CODE (op); return ((GET_MODE (op) == mode || mode == VOIDmode) @@ -886,9 +881,7 @@ predicate_operator (op, mode) /* Return 1 if this operator can be used in a conditional operation. */ int -condop_operator (op, mode) - register rtx op; - enum machine_mode mode; +condop_operator (register rtx op, enum machine_mode mode) { enum rtx_code code = GET_CODE (op); return ((GET_MODE (op) == mode || mode == VOIDmode) @@ -899,9 +892,7 @@ condop_operator (op, mode) /* Return 1 if this is the ar.lc register. 
*/ int -ar_lc_reg_operand (op, mode) - register rtx op; - enum machine_mode mode; +ar_lc_reg_operand (register rtx op, enum machine_mode mode) { return (GET_MODE (op) == DImode && (mode == DImode || mode == VOIDmode) @@ -912,9 +903,7 @@ ar_lc_reg_operand (op, mode) /* Return 1 if this is the ar.ccv register. */ int -ar_ccv_reg_operand (op, mode) - register rtx op; - enum machine_mode mode; +ar_ccv_reg_operand (register rtx op, enum machine_mode mode) { return ((GET_MODE (op) == mode || mode == VOIDmode) && GET_CODE (op) == REG @@ -924,9 +913,7 @@ ar_ccv_reg_operand (op, mode) /* Return 1 if this is the ar.pfs register. */ int -ar_pfs_reg_operand (op, mode) - register rtx op; - enum machine_mode mode; +ar_pfs_reg_operand (register rtx op, enum machine_mode mode) { return ((GET_MODE (op) == mode || mode == VOIDmode) && GET_CODE (op) == REG @@ -936,9 +923,7 @@ ar_pfs_reg_operand (op, mode) /* Like general_operand, but don't allow (mem (addressof)). */ int -general_tfmode_operand (op, mode) - rtx op; - enum machine_mode mode; +general_xfmode_operand (rtx op, enum machine_mode mode) { if (! general_operand (op, mode)) return 0; @@ -950,9 +935,7 @@ general_tfmode_operand (op, mode) /* Similarly. */ int -destination_tfmode_operand (op, mode) - rtx op; - enum machine_mode mode; +destination_xfmode_operand (rtx op, enum machine_mode mode) { if (! destination_operand (op, mode)) return 0; @@ -964,9 +947,7 @@ destination_tfmode_operand (op, mode) /* Similarly. */ int -tfreg_or_fp01_operand (op, mode) - rtx op; - enum machine_mode mode; +xfreg_or_fp01_operand (rtx op, enum machine_mode mode) { if (GET_CODE (op) == SUBREG) return 0; @@ -976,9 +957,7 @@ tfreg_or_fp01_operand (op, mode) /* Return 1 if OP is valid as a base register in a reg + offset address. */ int -basereg_operand (op, mode) - rtx op; - enum machine_mode mode; +basereg_operand (rtx op, enum machine_mode mode) { /* ??? Should I copy the flag_omit_frame_pointer and cse_not_expected checks from pa.c basereg_operand as well? Seems to be OK without them @@ -988,11 +967,133 @@ basereg_operand (op, mode) REG_POINTER ((GET_CODE (op) == SUBREG) ? SUBREG_REG (op) : op)); } +typedef enum + { + ADDR_AREA_NORMAL, /* normal address area */ + ADDR_AREA_SMALL /* addressable by "addl" (-2MB < addr < 2MB) */ + } +ia64_addr_area; + +static GTY(()) tree small_ident1; +static GTY(()) tree small_ident2; + +static void +init_idents (void) +{ + if (small_ident1 == 0) + { + small_ident1 = get_identifier ("small"); + small_ident2 = get_identifier ("__small__"); + } +} + +/* Retrieve the address area that has been chosen for the given decl. 
*/ + +static ia64_addr_area +ia64_get_addr_area (tree decl) +{ + tree model_attr; + + model_attr = lookup_attribute ("model", DECL_ATTRIBUTES (decl)); + if (model_attr) + { + tree id; + + init_idents (); + id = TREE_VALUE (TREE_VALUE (model_attr)); + if (id == small_ident1 || id == small_ident2) + return ADDR_AREA_SMALL; + } + return ADDR_AREA_NORMAL; +} + +static tree +ia64_handle_model_attribute (tree *node, tree name, tree args, int flags ATTRIBUTE_UNUSED, bool *no_add_attrs) +{ + ia64_addr_area addr_area = ADDR_AREA_NORMAL; + ia64_addr_area area; + tree arg, decl = *node; + + init_idents (); + arg = TREE_VALUE (args); + if (arg == small_ident1 || arg == small_ident2) + { + addr_area = ADDR_AREA_SMALL; + } + else + { + warning ("invalid argument of `%s' attribute", + IDENTIFIER_POINTER (name)); + *no_add_attrs = true; + } + + switch (TREE_CODE (decl)) + { + case VAR_DECL: + if ((DECL_CONTEXT (decl) && TREE_CODE (DECL_CONTEXT (decl)) + == FUNCTION_DECL) + && !TREE_STATIC (decl)) + { + error ("%Jan address area attribute cannot be specified for " + "local variables", decl, decl); + *no_add_attrs = true; + } + area = ia64_get_addr_area (decl); + if (area != ADDR_AREA_NORMAL && addr_area != area) + { + error ("%Jaddress area of '%s' conflicts with previous " + "declaration", decl, decl); + *no_add_attrs = true; + } + break; + + case FUNCTION_DECL: + error ("%Jaddress area attribute cannot be specified for functions", + decl, decl); + *no_add_attrs = true; + break; + + default: + warning ("`%s' attribute ignored", IDENTIFIER_POINTER (name)); + *no_add_attrs = true; + break; + } + + return NULL_TREE; +} + +static void +ia64_encode_addr_area (tree decl, rtx symbol) +{ + int flags; + + flags = SYMBOL_REF_FLAGS (symbol); + switch (ia64_get_addr_area (decl)) + { + case ADDR_AREA_NORMAL: break; + case ADDR_AREA_SMALL: flags |= SYMBOL_FLAG_SMALL_ADDR; break; + default: abort (); + } + SYMBOL_REF_FLAGS (symbol) = flags; +} + +static void +ia64_encode_section_info (tree decl, rtx rtl, int first) +{ + default_encode_section_info (decl, rtl, first); + + /* Careful not to prod global register variables. */ + if (TREE_CODE (decl) == VAR_DECL + && GET_CODE (DECL_RTL (decl)) == MEM + && GET_CODE (XEXP (DECL_RTL (decl), 0)) == SYMBOL_REF + && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))) + ia64_encode_addr_area (decl, XEXP (rtl, 0)); +} + /* Return 1 if the operands of a move are ok. */ int -ia64_move_ok (dst, src) - rtx dst, src; +ia64_move_ok (rtx dst, rtx src) { /* If we're under init_recog_no_volatile, we'll not be able to use memory_operand. So check the code directly and don't worry about @@ -1012,27 +1113,18 @@ ia64_move_ok (dst, src) return GET_CODE (src) == CONST_DOUBLE && CONST_DOUBLE_OK_FOR_G (src); } -/* Return 0 if we are doing C++ code. This optimization fails with - C++ because of GNAT c++/6685. */ - int -addp4_optimize_ok (op1, op2) - rtx op1, op2; +addp4_optimize_ok (rtx op1, rtx op2) { - - if (!strcmp (lang_hooks.name, "GNU C++")) - return 0; - return (basereg_operand (op1, GET_MODE(op1)) != basereg_operand (op2, GET_MODE(op2))); } -/* Check if OP is a mask suitible for use with SHIFT in a dep.z instruction. +/* Check if OP is a mask suitable for use with SHIFT in a dep.z instruction. Return the length of the field, or <= 0 on failure. 
*/ int -ia64_depz_field_mask (rop, rshift) - rtx rop, rshift; +ia64_depz_field_mask (rtx rop, rtx rshift) { unsigned HOST_WIDE_INT op = INTVAL (rop); unsigned HOST_WIDE_INT shift = INTVAL (rshift); @@ -1045,40 +1137,48 @@ ia64_depz_field_mask (rop, rshift) } /* Expand a symbolic constant load. */ -/* ??? Should generalize this, so that we can also support 32 bit pointers. */ void -ia64_expand_load_address (dest, src, scratch) - rtx dest, src, scratch; -{ - rtx temp; - - /* The destination could be a MEM during initial rtl generation, - which isn't a valid destination for the PIC load address patterns. */ - if (! register_operand (dest, DImode)) - if (! scratch || ! register_operand (scratch, DImode)) - temp = gen_reg_rtx (DImode); - else - temp = scratch; - else - temp = dest; - - if (tls_symbolic_operand (src, Pmode)) +ia64_expand_load_address (rtx dest, rtx src) +{ + if (tls_symbolic_operand (src, VOIDmode)) + abort (); + if (GET_CODE (dest) != REG) abort (); - if (TARGET_AUTO_PIC) - emit_insn (gen_load_gprel64 (temp, src)); - else if (GET_CODE (src) == SYMBOL_REF && SYMBOL_REF_FLAG (src)) - emit_insn (gen_load_fptr (temp, src)); - else if ((GET_MODE (src) == Pmode || GET_MODE (src) == ptr_mode) - && sdata_symbolic_operand (src, VOIDmode)) - emit_insn (gen_load_gprel (temp, src)); - else if (GET_CODE (src) == CONST - && GET_CODE (XEXP (src, 0)) == PLUS - && GET_CODE (XEXP (XEXP (src, 0), 1)) == CONST_INT - && (INTVAL (XEXP (XEXP (src, 0), 1)) & 0x1fff) != 0) - { - rtx subtarget = no_new_pseudos ? temp : gen_reg_rtx (DImode); + /* ILP32 mode still loads 64-bits of data from the GOT. This avoids + having to pointer-extend the value afterward. Other forms of address + computation below are also more natural to compute as 64-bit quantities. + If we've been given an SImode destination register, change it. */ + if (GET_MODE (dest) != Pmode) + dest = gen_rtx_REG (Pmode, REGNO (dest)); + + if (GET_CODE (src) == SYMBOL_REF && SYMBOL_REF_SMALL_ADDR_P (src)) + { + emit_insn (gen_rtx_SET (VOIDmode, dest, src)); + return; + } + else if (TARGET_AUTO_PIC) + { + emit_insn (gen_load_gprel64 (dest, src)); + return; + } + else if (GET_CODE (src) == SYMBOL_REF && SYMBOL_REF_FUNCTION_P (src)) + { + emit_insn (gen_load_fptr (dest, src)); + return; + } + else if (sdata_symbolic_operand (src, VOIDmode)) + { + emit_insn (gen_load_gprel (dest, src)); + return; + } + + if (GET_CODE (src) == CONST + && GET_CODE (XEXP (src, 0)) == PLUS + && GET_CODE (XEXP (XEXP (src, 0), 1)) == CONST_INT + && (INTVAL (XEXP (XEXP (src, 0), 1)) & 0x1fff) != 0) + { rtx sym = XEXP (XEXP (src, 0), 0); HOST_WIDE_INT ofs, hi, lo; @@ -1088,49 +1188,34 @@ ia64_expand_load_address (dest, src, scratch) lo = ((ofs & 0x3fff) ^ 0x2000) - 0x2000; hi = ofs - lo; - if (! scratch) - scratch = no_new_pseudos ? subtarget : gen_reg_rtx (DImode); - - emit_insn (gen_load_symptr (subtarget, plus_constant (sym, hi), - scratch)); - emit_insn (gen_adddi3 (temp, subtarget, GEN_INT (lo))); + ia64_expand_load_address (dest, plus_constant (sym, hi)); + emit_insn (gen_adddi3 (dest, dest, GEN_INT (lo))); } else { - rtx insn; - if (! scratch) - scratch = no_new_pseudos ? 
temp : gen_reg_rtx (DImode); + rtx tmp; - insn = emit_insn (gen_load_symptr (temp, src, scratch)); -#ifdef POINTERS_EXTEND_UNSIGNED - if (GET_MODE (temp) != GET_MODE (src)) - src = convert_memory_address (GET_MODE (temp), src); -#endif - REG_NOTES (insn) = gen_rtx_EXPR_LIST (REG_EQUAL, src, REG_NOTES (insn)); - } + tmp = gen_rtx_HIGH (Pmode, src); + tmp = gen_rtx_PLUS (Pmode, tmp, pic_offset_table_rtx); + emit_insn (gen_rtx_SET (VOIDmode, dest, tmp)); - if (temp != dest) - { - if (GET_MODE (dest) != GET_MODE (temp)) - temp = convert_to_mode (GET_MODE (dest), temp, 0); - emit_move_insn (dest, temp); + tmp = gen_rtx_LO_SUM (GET_MODE (dest), dest, src); + emit_insn (gen_rtx_SET (VOIDmode, dest, tmp)); } } static GTY(()) rtx gen_tls_tga; static rtx -gen_tls_get_addr () +gen_tls_get_addr (void) { if (!gen_tls_tga) - { - gen_tls_tga = init_one_libfunc ("__tls_get_addr"); - } + gen_tls_tga = init_one_libfunc ("__tls_get_addr"); return gen_tls_tga; } static GTY(()) rtx thread_pointer_rtx; static rtx -gen_thread_pointer () +gen_thread_pointer (void) { if (!thread_pointer_rtx) { @@ -1140,153 +1225,128 @@ gen_thread_pointer () return thread_pointer_rtx; } -rtx -ia64_expand_move (op0, op1) - rtx op0, op1; +static rtx +ia64_expand_tls_address (enum tls_model tls_kind, rtx op0, rtx op1) { - enum machine_mode mode = GET_MODE (op0); + rtx tga_op1, tga_op2, tga_ret, tga_eqv, tmp, insns; + rtx orig_op0 = op0; - if (!reload_in_progress && !reload_completed && !ia64_move_ok (op0, op1)) - op1 = force_reg (mode, op1); - - if (mode == Pmode || mode == ptr_mode) + switch (tls_kind) { - enum tls_model tls_kind; - if ((tls_kind = tls_symbolic_operand (op1, Pmode))) - { - rtx tga_op1, tga_op2, tga_ret, tga_eqv, tmp, insns; - rtx orig_op0 = op0; + case TLS_MODEL_GLOBAL_DYNAMIC: + start_sequence (); - switch (tls_kind) - { - case TLS_MODEL_GLOBAL_DYNAMIC: - start_sequence (); - - tga_op1 = gen_reg_rtx (Pmode); - emit_insn (gen_load_ltoff_dtpmod (tga_op1, op1)); - tga_op1 = gen_rtx_MEM (Pmode, tga_op1); - RTX_UNCHANGING_P (tga_op1) = 1; - - tga_op2 = gen_reg_rtx (Pmode); - emit_insn (gen_load_ltoff_dtprel (tga_op2, op1)); - tga_op2 = gen_rtx_MEM (Pmode, tga_op2); - RTX_UNCHANGING_P (tga_op2) = 1; - - tga_ret = emit_library_call_value (gen_tls_get_addr (), NULL_RTX, - LCT_CONST, Pmode, 2, tga_op1, - Pmode, tga_op2, Pmode); - - insns = get_insns (); - end_sequence (); - - if (GET_MODE (op0) != Pmode) - op0 = tga_ret; - emit_libcall_block (insns, op0, tga_ret, op1); - break; + tga_op1 = gen_reg_rtx (Pmode); + emit_insn (gen_load_ltoff_dtpmod (tga_op1, op1)); + tga_op1 = gen_rtx_MEM (Pmode, tga_op1); + RTX_UNCHANGING_P (tga_op1) = 1; - case TLS_MODEL_LOCAL_DYNAMIC: - /* ??? This isn't the completely proper way to do local-dynamic - If the call to __tls_get_addr is used only by a single symbol, - then we should (somehow) move the dtprel to the second arg - to avoid the extra add. 
*/ - start_sequence (); + tga_op2 = gen_reg_rtx (Pmode); + emit_insn (gen_load_ltoff_dtprel (tga_op2, op1)); + tga_op2 = gen_rtx_MEM (Pmode, tga_op2); + RTX_UNCHANGING_P (tga_op2) = 1; - tga_op1 = gen_reg_rtx (Pmode); - emit_insn (gen_load_ltoff_dtpmod (tga_op1, op1)); - tga_op1 = gen_rtx_MEM (Pmode, tga_op1); - RTX_UNCHANGING_P (tga_op1) = 1; + tga_ret = emit_library_call_value (gen_tls_get_addr (), NULL_RTX, + LCT_CONST, Pmode, 2, tga_op1, + Pmode, tga_op2, Pmode); - tga_op2 = const0_rtx; + insns = get_insns (); + end_sequence (); - tga_ret = emit_library_call_value (gen_tls_get_addr (), NULL_RTX, - LCT_CONST, Pmode, 2, tga_op1, - Pmode, tga_op2, Pmode); + if (GET_MODE (op0) != Pmode) + op0 = tga_ret; + emit_libcall_block (insns, op0, tga_ret, op1); + break; - insns = get_insns (); - end_sequence (); + case TLS_MODEL_LOCAL_DYNAMIC: + /* ??? This isn't the completely proper way to do local-dynamic + If the call to __tls_get_addr is used only by a single symbol, + then we should (somehow) move the dtprel to the second arg + to avoid the extra add. */ + start_sequence (); - tga_eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), - UNSPEC_LD_BASE); - tmp = gen_reg_rtx (Pmode); - emit_libcall_block (insns, tmp, tga_ret, tga_eqv); + tga_op1 = gen_reg_rtx (Pmode); + emit_insn (gen_load_ltoff_dtpmod (tga_op1, op1)); + tga_op1 = gen_rtx_MEM (Pmode, tga_op1); + RTX_UNCHANGING_P (tga_op1) = 1; - if (!register_operand (op0, Pmode)) - op0 = gen_reg_rtx (Pmode); - if (TARGET_TLS64) - { - emit_insn (gen_load_dtprel (op0, op1)); - emit_insn (gen_adddi3 (op0, tmp, op0)); - } - else - emit_insn (gen_add_dtprel (op0, tmp, op1)); - break; + tga_op2 = const0_rtx; - case TLS_MODEL_INITIAL_EXEC: - tmp = gen_reg_rtx (Pmode); - emit_insn (gen_load_ltoff_tprel (tmp, op1)); - tmp = gen_rtx_MEM (Pmode, tmp); - RTX_UNCHANGING_P (tmp) = 1; - tmp = force_reg (Pmode, tmp); + tga_ret = emit_library_call_value (gen_tls_get_addr (), NULL_RTX, + LCT_CONST, Pmode, 2, tga_op1, + Pmode, tga_op2, Pmode); - if (!register_operand (op0, Pmode)) - op0 = gen_reg_rtx (Pmode); - emit_insn (gen_adddi3 (op0, tmp, gen_thread_pointer ())); - break; + insns = get_insns (); + end_sequence (); - case TLS_MODEL_LOCAL_EXEC: - if (!register_operand (op0, Pmode)) - op0 = gen_reg_rtx (Pmode); - if (TARGET_TLS64) - { - emit_insn (gen_load_tprel (op0, op1)); - emit_insn (gen_adddi3 (op0, gen_thread_pointer (), op0)); - } - else - emit_insn (gen_add_tprel (op0, gen_thread_pointer (), op1)); - break; + tga_eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), + UNSPEC_LD_BASE); + tmp = gen_reg_rtx (Pmode); + emit_libcall_block (insns, tmp, tga_ret, tga_eqv); - default: - abort (); - } + if (!register_operand (op0, Pmode)) + op0 = gen_reg_rtx (Pmode); + if (TARGET_TLS64) + { + emit_insn (gen_load_dtprel (op0, op1)); + emit_insn (gen_adddi3 (op0, tmp, op0)); + } + else + emit_insn (gen_add_dtprel (op0, tmp, op1)); + break; - if (orig_op0 == op0) - return NULL_RTX; - if (GET_MODE (orig_op0) == Pmode) - return op0; - return gen_lowpart (GET_MODE (orig_op0), op0); + case TLS_MODEL_INITIAL_EXEC: + tmp = gen_reg_rtx (Pmode); + emit_insn (gen_load_ltoff_tprel (tmp, op1)); + tmp = gen_rtx_MEM (Pmode, tmp); + RTX_UNCHANGING_P (tmp) = 1; + tmp = force_reg (Pmode, tmp); + + if (!register_operand (op0, Pmode)) + op0 = gen_reg_rtx (Pmode); + emit_insn (gen_adddi3 (op0, tmp, gen_thread_pointer ())); + break; + + case TLS_MODEL_LOCAL_EXEC: + if (!register_operand (op0, Pmode)) + op0 = gen_reg_rtx (Pmode); + if (TARGET_TLS64) + { + emit_insn (gen_load_tprel 
(op0, op1)); + emit_insn (gen_adddi3 (op0, gen_thread_pointer (), op0)); } - else if (!TARGET_NO_PIC && - (symbolic_operand (op1, Pmode) || - symbolic_operand (op1, ptr_mode))) + else + emit_insn (gen_add_tprel (op0, gen_thread_pointer (), op1)); + break; + + default: + abort (); + } + + if (orig_op0 == op0) + return NULL_RTX; + if (GET_MODE (orig_op0) == Pmode) + return op0; + return gen_lowpart (GET_MODE (orig_op0), op0); +} + +rtx +ia64_expand_move (rtx op0, rtx op1) +{ + enum machine_mode mode = GET_MODE (op0); + + if (!reload_in_progress && !reload_completed && !ia64_move_ok (op0, op1)) + op1 = force_reg (mode, op1); + + if ((mode == Pmode || mode == ptr_mode) && symbolic_operand (op1, VOIDmode)) + { + enum tls_model tls_kind; + if ((tls_kind = tls_symbolic_operand (op1, VOIDmode))) + return ia64_expand_tls_address (tls_kind, op0, op1); + + if (!TARGET_NO_PIC && reload_completed) { - /* Before optimization starts, delay committing to any particular - type of PIC address load. If this function gets deferred, we - may acquire information that changes the value of the - sdata_symbolic_operand predicate. - - But don't delay for function pointers. Loading a function address - actually loads the address of the descriptor not the function. - If we represent these as SYMBOL_REFs, then they get cse'd with - calls, and we end up with calls to the descriptor address instead - of calls to the function address. Functions are not candidates - for sdata anyways. - - Don't delay for LABEL_REF because the splitter loses REG_LABEL - notes. Don't delay for pool addresses on general principals; - they'll never become non-local behind our back. */ - - if (rtx_equal_function_value_matters - && GET_CODE (op1) != LABEL_REF - && ! (GET_CODE (op1) == SYMBOL_REF - && (SYMBOL_REF_FLAG (op1) - || CONSTANT_POOL_ADDRESS_P (op1) - || STRING_POOL_ADDRESS_P (op1)))) - if (GET_MODE (op1) == DImode) - emit_insn (gen_movdi_symbolic (op0, op1)); - else - emit_insn (gen_movsi_symbolic (op0, op1)); - else - ia64_expand_load_address (op0, op1, NULL_RTX); + ia64_expand_load_address (op0, op1); return NULL_RTX; } } @@ -1294,102 +1354,272 @@ ia64_expand_move (op0, op1) return op1; } -/* Split a post-reload TImode reference into two DImode components. */ +/* Split a move from OP1 to OP0 conditional on COND. */ -rtx -ia64_split_timode (out, in, scratch) - rtx out[2]; - rtx in, scratch; +void +ia64_emit_cond_move (rtx op0, rtx op1, rtx cond) +{ + rtx insn, first = get_last_insn (); + + emit_move_insn (op0, op1); + + for (insn = get_last_insn (); insn != first; insn = PREV_INSN (insn)) + if (INSN_P (insn)) + PATTERN (insn) = gen_rtx_COND_EXEC (VOIDmode, copy_rtx (cond), + PATTERN (insn)); +} + +/* Split a post-reload TImode or TFmode reference into two DImode + components. This is made extra difficult by the fact that we do + not get any scratch registers to work with, because reload cannot + be prevented from giving us a scratch that overlaps the register + pair involved. So instead, when addressing memory, we tweak the + pointer register up and back down with POST_INCs. Or up and not + back down when we can get away with it. + + REVERSED is true when the loads must be done in reversed order + (high word first) for correctness. DEAD is true when the pointer + dies with the second insn we generate and therefore the second + address must not carry a postmodify. + + May return an insn which is to be emitted after the moves. 
*/ + +static rtx +ia64_split_tmode (rtx out[2], rtx in, bool reversed, bool dead) { + rtx fixup = 0; + switch (GET_CODE (in)) { case REG: - out[0] = gen_rtx_REG (DImode, REGNO (in)); - out[1] = gen_rtx_REG (DImode, REGNO (in) + 1); - return NULL_RTX; + out[reversed] = gen_rtx_REG (DImode, REGNO (in)); + out[!reversed] = gen_rtx_REG (DImode, REGNO (in) + 1); + break; + + case CONST_INT: + case CONST_DOUBLE: + /* Cannot occur reversed. */ + if (reversed) abort (); + + if (GET_MODE (in) != TFmode) + split_double (in, &out[0], &out[1]); + else + /* split_double does not understand how to split a TFmode + quantity into a pair of DImode constants. */ + { + REAL_VALUE_TYPE r; + unsigned HOST_WIDE_INT p[2]; + long l[4]; /* TFmode is 128 bits */ + + REAL_VALUE_FROM_CONST_DOUBLE (r, in); + real_to_target (l, &r, TFmode); + + if (FLOAT_WORDS_BIG_ENDIAN) + { + p[0] = (((unsigned HOST_WIDE_INT) l[0]) << 32) + l[1]; + p[1] = (((unsigned HOST_WIDE_INT) l[2]) << 32) + l[3]; + } + else + { + p[0] = (((unsigned HOST_WIDE_INT) l[3]) << 32) + l[2]; + p[1] = (((unsigned HOST_WIDE_INT) l[1]) << 32) + l[0]; + } + out[0] = GEN_INT (p[0]); + out[1] = GEN_INT (p[1]); + } + break; case MEM: { rtx base = XEXP (in, 0); + rtx offset; switch (GET_CODE (base)) { case REG: - out[0] = adjust_address (in, DImode, 0); - break; - case POST_MODIFY: - base = XEXP (base, 0); - out[0] = adjust_address (in, DImode, 0); + if (!reversed) + { + out[0] = adjust_automodify_address + (in, DImode, gen_rtx_POST_INC (Pmode, base), 0); + out[1] = adjust_automodify_address + (in, DImode, dead ? 0 : gen_rtx_POST_DEC (Pmode, base), 8); + } + else + { + /* Reversal requires a pre-increment, which can only + be done as a separate insn. */ + emit_insn (gen_adddi3 (base, base, GEN_INT (8))); + out[0] = adjust_automodify_address + (in, DImode, gen_rtx_POST_DEC (Pmode, base), 8); + out[1] = adjust_address (in, DImode, 0); + } break; - /* Since we're changing the mode, we need to change to POST_MODIFY - as well to preserve the size of the increment. Either that or - do the update in two steps, but we've already got this scratch - register handy so let's use it. */ case POST_INC: - base = XEXP (base, 0); - out[0] - = change_address (in, DImode, - gen_rtx_POST_MODIFY - (Pmode, base, plus_constant (base, 16))); + if (reversed || dead) abort (); + /* Just do the increment in two steps. */ + out[0] = adjust_automodify_address (in, DImode, 0, 0); + out[1] = adjust_automodify_address (in, DImode, 0, 8); break; + case POST_DEC: + if (reversed || dead) abort (); + /* Add 8, subtract 24. */ base = XEXP (base, 0); - out[0] - = change_address (in, DImode, - gen_rtx_POST_MODIFY - (Pmode, base, plus_constant (base, -16))); + out[0] = adjust_automodify_address + (in, DImode, gen_rtx_POST_INC (Pmode, base), 0); + out[1] = adjust_automodify_address + (in, DImode, + gen_rtx_POST_MODIFY (Pmode, base, plus_constant (base, -24)), + 8); + break; + + case POST_MODIFY: + if (reversed || dead) abort (); + /* Extract and adjust the modification. This case is + trickier than the others, because we might have an + index register, or we might have a combined offset that + doesn't fit a signed 9-bit displacement field. We can + assume the incoming expression is already legitimate. */ + offset = XEXP (base, 1); + base = XEXP (base, 0); + + out[0] = adjust_automodify_address + (in, DImode, gen_rtx_POST_INC (Pmode, base), 0); + + if (GET_CODE (XEXP (offset, 1)) == REG) + { + /* Can't adjust the postmodify to match. Emit the + original, then a separate addition insn. 
*/ + out[1] = adjust_automodify_address (in, DImode, 0, 8); + fixup = gen_adddi3 (base, base, GEN_INT (-8)); + } + else if (GET_CODE (XEXP (offset, 1)) != CONST_INT) + abort (); + else if (INTVAL (XEXP (offset, 1)) < -256 + 8) + { + /* Again the postmodify cannot be made to match, but + in this case it's more efficient to get rid of the + postmodify entirely and fix up with an add insn. */ + out[1] = adjust_automodify_address (in, DImode, base, 8); + fixup = gen_adddi3 (base, base, + GEN_INT (INTVAL (XEXP (offset, 1)) - 8)); + } + else + { + /* Combined offset still fits in the displacement field. + (We cannot overflow it at the high end.) */ + out[1] = adjust_automodify_address + (in, DImode, + gen_rtx_POST_MODIFY (Pmode, base, + gen_rtx_PLUS (Pmode, base, + GEN_INT (INTVAL (XEXP (offset, 1)) - 8))), + 8); + } break; + default: abort (); } - - if (scratch == NULL_RTX) - abort (); - out[1] = change_address (in, DImode, scratch); - return gen_adddi3 (scratch, base, GEN_INT (8)); + break; } - case CONST_INT: - case CONST_DOUBLE: - split_double (in, &out[0], &out[1]); - return NULL_RTX; - default: abort (); } + + return fixup; } -/* ??? Fixing GR->FR TFmode moves during reload is hard. You need to go +/* Split a TImode or TFmode move instruction after reload. + This is used by *movtf_internal and *movti_internal. */ +void +ia64_split_tmode_move (rtx operands[]) +{ + rtx in[2], out[2], insn; + rtx fixup[2]; + bool dead = false; + bool reversed = false; + + /* It is possible for reload to decide to overwrite a pointer with + the value it points to. In that case we have to do the loads in + the appropriate order so that the pointer is not destroyed too + early. Also we must not generate a postmodify for that second + load, or rws_access_regno will abort. */ + if (GET_CODE (operands[1]) == MEM + && reg_overlap_mentioned_p (operands[0], operands[1])) + { + rtx base = XEXP (operands[1], 0); + while (GET_CODE (base) != REG) + base = XEXP (base, 0); + + if (REGNO (base) == REGNO (operands[0])) + reversed = true; + dead = true; + } + /* Another reason to do the moves in reversed order is if the first + element of the target register pair is also the second element of + the source register pair. */ + if (GET_CODE (operands[0]) == REG && GET_CODE (operands[1]) == REG + && REGNO (operands[0]) == REGNO (operands[1]) + 1) + reversed = true; + + fixup[0] = ia64_split_tmode (in, operands[1], reversed, dead); + fixup[1] = ia64_split_tmode (out, operands[0], reversed, dead); + +#define MAYBE_ADD_REG_INC_NOTE(INSN, EXP) \ + if (GET_CODE (EXP) == MEM \ + && (GET_CODE (XEXP (EXP, 0)) == POST_MODIFY \ + || GET_CODE (XEXP (EXP, 0)) == POST_INC \ + || GET_CODE (XEXP (EXP, 0)) == POST_DEC)) \ + REG_NOTES (INSN) = gen_rtx_EXPR_LIST (REG_INC, \ + XEXP (XEXP (EXP, 0), 0), \ + REG_NOTES (INSN)) + + insn = emit_insn (gen_rtx_SET (VOIDmode, out[0], in[0])); + MAYBE_ADD_REG_INC_NOTE (insn, in[0]); + MAYBE_ADD_REG_INC_NOTE (insn, out[0]); + + insn = emit_insn (gen_rtx_SET (VOIDmode, out[1], in[1])); + MAYBE_ADD_REG_INC_NOTE (insn, in[1]); + MAYBE_ADD_REG_INC_NOTE (insn, out[1]); + + if (fixup[0]) + emit_insn (fixup[0]); + if (fixup[1]) + emit_insn (fixup[1]); + +#undef MAYBE_ADD_REG_INC_NOTE +} + +/* ??? Fixing GR->FR XFmode moves during reload is hard. You need to go through memory plus an extra GR scratch register. Except that you can either get the first from SECONDARY_MEMORY_NEEDED or the second from SECONDARY_RELOAD_CLASS, but not both. 
We got into problems in the first place by allowing a construct like - (subreg:TF (reg:TI)), which we got from a union containing a long double. + (subreg:XF (reg:TI)), which we got from a union containing a long double. This solution attempts to prevent this situation from occurring. When we see something like the above, we spill the inner register to memory. */ rtx -spill_tfmode_operand (in, force) - rtx in; - int force; +spill_xfmode_operand (rtx in, int force) { if (GET_CODE (in) == SUBREG && GET_MODE (SUBREG_REG (in)) == TImode && GET_CODE (SUBREG_REG (in)) == REG) { - rtx mem = gen_mem_addressof (SUBREG_REG (in), NULL_TREE, true); - return gen_rtx_MEM (TFmode, copy_to_reg (XEXP (mem, 0))); + rtx mem = gen_mem_addressof (SUBREG_REG (in), NULL_TREE, /*rescan=*/true); + return gen_rtx_MEM (XFmode, copy_to_reg (XEXP (mem, 0))); } else if (force && GET_CODE (in) == REG) { - rtx mem = gen_mem_addressof (in, NULL_TREE, true); - return gen_rtx_MEM (TFmode, copy_to_reg (XEXP (mem, 0))); + rtx mem = gen_mem_addressof (in, NULL_TREE, /*rescan=*/true); + return gen_rtx_MEM (XFmode, copy_to_reg (XEXP (mem, 0))); } else if (GET_CODE (in) == MEM && GET_CODE (XEXP (in, 0)) == ADDRESSOF) - return change_address (in, TFmode, copy_to_reg (XEXP (in, 0))); + return change_address (in, XFmode, copy_to_reg (XEXP (in, 0))); else return in; } @@ -1397,10 +1627,10 @@ spill_tfmode_operand (in, force) /* Emit comparison instruction if necessary, returning the expression that holds the compare result in the proper mode. */ +static GTY(()) rtx cmptf_libfunc; + rtx -ia64_expand_compare (code, mode) - enum rtx_code code; - enum machine_mode mode; +ia64_expand_compare (enum rtx_code code, enum machine_mode mode) { rtx op0 = ia64_compare_op0, op1 = ia64_compare_op1; rtx cmp; @@ -1414,6 +1644,59 @@ ia64_expand_compare (code, mode) else abort (); } + /* HPUX TFmode compare requires a library call to _U_Qfcmp, which takes a + magic number as its third argument, that indicates what to do. + The return value is an integer to be compared against zero. */ + else if (TARGET_HPUX && GET_MODE (op0) == TFmode) + { + enum qfcmp_magic { + QCMP_INV = 1, /* Raise FP_INVALID on SNaN as a side effect. */ + QCMP_UNORD = 2, + QCMP_EQ = 4, + QCMP_LT = 8, + QCMP_GT = 16 + } magic; + enum rtx_code ncode; + rtx ret, insns; + if (GET_MODE (op1) != TFmode) + abort (); + switch (code) + { + /* 1 = equal, 0 = not equal. Equality operators do + not raise FP_INVALID when given an SNaN operand. */ + case EQ: magic = QCMP_EQ; ncode = NE; break; + case NE: magic = QCMP_EQ; ncode = EQ; break; + /* isunordered() from C99. */ + case UNORDERED: magic = QCMP_UNORD; ncode = NE; break; + /* Relational operators raise FP_INVALID when given + an SNaN operand. */ + case LT: magic = QCMP_LT |QCMP_INV; ncode = NE; break; + case LE: magic = QCMP_LT|QCMP_EQ|QCMP_INV; ncode = NE; break; + case GT: magic = QCMP_GT |QCMP_INV; ncode = NE; break; + case GE: magic = QCMP_GT|QCMP_EQ|QCMP_INV; ncode = NE; break; + /* FUTURE: Implement UNEQ, UNLT, UNLE, UNGT, UNGE, LTGT. + Expanders for buneq etc. weuld have to be added to ia64.md + for this to be useful. 
*/ + default: abort (); + } + + start_sequence (); + + ret = emit_library_call_value (cmptf_libfunc, 0, LCT_CONST, DImode, 3, + op0, TFmode, op1, TFmode, + GEN_INT (magic), DImode); + cmp = gen_reg_rtx (BImode); + emit_insn (gen_rtx_SET (VOIDmode, cmp, + gen_rtx_fmt_ee (ncode, BImode, + ret, const0_rtx))); + + insns = get_insns (); + end_sequence (); + + emit_libcall_block (insns, cmp, cmp, + gen_rtx_fmt_ee (code, BImode, op0, op1)); + code = NE; + } else { cmp = gen_reg_rtx (BImode); @@ -1426,16 +1709,15 @@ ia64_expand_compare (code, mode) } /* Emit the appropriate sequence for a call. */ + void -ia64_expand_call (retval, addr, nextarg, sibcall_p) - rtx retval; - rtx addr; - rtx nextarg ATTRIBUTE_UNUSED; - int sibcall_p; +ia64_expand_call (rtx retval, rtx addr, rtx nextarg ATTRIBUTE_UNUSED, + int sibcall_p) { rtx insn, b0; addr = XEXP (addr, 0); + addr = convert_memory_address (DImode, addr); b0 = gen_rtx_REG (DImode, R_BR (0)); /* ??? Should do this for functions known to bind local too. */ @@ -1465,8 +1747,9 @@ ia64_expand_call (retval, addr, nextarg, sibcall_p) if (sibcall_p) use_reg (&CALL_INSN_FUNCTION_USAGE (insn), b0); } + void -ia64_reload_gp () +ia64_reload_gp (void) { rtx tmp; @@ -1506,10 +1789,8 @@ ia64_reload_gp () } void -ia64_split_call (retval, addr, retaddr, scratch_r, scratch_b, - noreturn_p, sibcall_p) - rtx retval, addr, retaddr, scratch_r, scratch_b; - int noreturn_p, sibcall_p; +ia64_split_call (rtx retval, rtx addr, rtx retaddr, rtx scratch_r, + rtx scratch_b, int noreturn_p, int sibcall_p) { rtx insn; bool is_desc = false; @@ -1522,8 +1803,8 @@ ia64_split_call (retval, addr, retaddr, scratch_r, scratch_b, bool addr_dead_p; /* ??? We are currently constrained to *not* use peep2, because - we can legitimiately change the global lifetime of the GP - (in the form of killing where previously live). This is + we can legitimately change the global lifetime of the GP + (in the form of killing where previously live). This is because a call through a descriptor doesn't use the previous value of the GP, while a direct call does, and we do not commit to either form until the split here. @@ -1571,9 +1852,15 @@ ia64_split_call (retval, addr, retaddr, scratch_r, scratch_b, /* Begin the assembly file. */ +static void +ia64_file_start (void) +{ + default_file_start (); + emit_safe_across_calls (); +} + void -emit_safe_across_calls (f) - FILE *f; +emit_safe_across_calls (void) { unsigned int rs, re; int out_state; @@ -1590,19 +1877,19 @@ emit_safe_across_calls (f) continue; if (out_state == 0) { - fputs ("\t.pred.safe_across_calls ", f); + fputs ("\t.pred.safe_across_calls ", asm_out_file); out_state = 1; } else - fputc (',', f); + fputc (',', asm_out_file); if (re == rs + 1) - fprintf (f, "p%u", rs); + fprintf (asm_out_file, "p%u", rs); else - fprintf (f, "p%u-p%u", rs, re - 1); + fprintf (asm_out_file, "p%u-p%u", rs, re - 1); rs = re + 1; } if (out_state) - fputc ('\n', f); + fputc ('\n', asm_out_file); } /* Helper function for ia64_compute_frame_size: find an appropriate general @@ -1611,8 +1898,7 @@ emit_safe_across_calls (f) TRY_LOCALS is true if we should attempt to locate a local regnum. */ static int -find_gr_spill (try_locals) - int try_locals; +find_gr_spill (int try_locals) { int regno; @@ -1661,7 +1947,7 @@ find_gr_spill (try_locals) static int last_scratch_gr_reg; static int -next_scratch_gr_reg () +next_scratch_gr_reg (void) { int i, regno; @@ -1686,9 +1972,7 @@ next_scratch_gr_reg () diddle_return_value. Mark REG in current_frame_info.gr_used_mask. 
*/ static void -mark_reg_gr_used_mask (reg, data) - rtx reg; - void *data ATTRIBUTE_UNUSED; +mark_reg_gr_used_mask (rtx reg, void *data ATTRIBUTE_UNUSED) { unsigned int regno = REGNO (reg); if (regno < 32) @@ -1704,8 +1988,7 @@ mark_reg_gr_used_mask (reg, data) needed for local variables. */ static void -ia64_compute_frame_size (size) - HOST_WIDE_INT size; +ia64_compute_frame_size (HOST_WIDE_INT size) { HOST_WIDE_INT total_size; HOST_WIDE_INT spill_size = 0; @@ -1767,7 +2050,7 @@ ia64_compute_frame_size (size) i = regno - OUT_REG (0) + 1; /* When -p profiling, we need one output register for the mcount argument. - Likwise for -a profiling for the bb_init_func argument. For -ax + Likewise for -a profiling for the bb_init_func argument. For -ax profiling, we need two output registers for the two bb_init_trace_func arguments. */ if (current_function_profile) @@ -1778,7 +2061,7 @@ ia64_compute_frame_size (size) current_frame_info.n_rotate_regs = 0; /* Discover which registers need spilling, and how much room that - will take. Begin with floating point and general registers, + will take. Begin with floating point and general registers, which will always wind up on the stack. */ for (regno = FR_REG (2); regno <= FR_REG (127); regno++) @@ -1809,7 +2092,7 @@ ia64_compute_frame_size (size) /* Now come all special registers that might get saved in other general registers. */ - + if (frame_pointer_needed) { current_frame_info.reg_fp = find_gr_spill (1); @@ -1971,8 +2254,7 @@ ia64_compute_frame_size (size) /* Compute the initial difference between the specified pair of registers. */ HOST_WIDE_INT -ia64_initial_elimination_offset (from, to) - int from, to; +ia64_initial_elimination_offset (int from, int to) { HOST_WIDE_INT offset; @@ -2044,10 +2326,7 @@ struct spill_fill_data static struct spill_fill_data spill_fill_data; static void -setup_spill_pointers (n_spills, init_reg, cfa_off) - int n_spills; - rtx init_reg; - HOST_WIDE_INT cfa_off; +setup_spill_pointers (int n_spills, rtx init_reg, HOST_WIDE_INT cfa_off) { int i; @@ -2073,15 +2352,13 @@ setup_spill_pointers (n_spills, init_reg, cfa_off) } static void -finish_spill_pointers () +finish_spill_pointers (void) { current_frame_info.gr_used_mask = spill_fill_data.save_gr_used_mask; } static rtx -spill_restore_mem (reg, cfa_off) - rtx reg; - HOST_WIDE_INT cfa_off; +spill_restore_mem (rtx reg, HOST_WIDE_INT cfa_off) { int iter = spill_fill_data.next_iter; HOST_WIDE_INT disp = spill_fill_data.prev_off[iter] - cfa_off; @@ -2194,10 +2471,8 @@ spill_restore_mem (reg, cfa_off) } static void -do_spill (move_fn, reg, cfa_off, frame_reg) - rtx (*move_fn) PARAMS ((rtx, rtx, rtx)); - rtx reg, frame_reg; - HOST_WIDE_INT cfa_off; +do_spill (rtx (*move_fn) (rtx, rtx, rtx), rtx reg, HOST_WIDE_INT cfa_off, + rtx frame_reg) { int iter = spill_fill_data.next_iter; rtx mem, insn; @@ -2213,7 +2488,7 @@ do_spill (move_fn, reg, cfa_off, frame_reg) RTX_FRAME_RELATED_P (insn) = 1; - /* Don't even pretend that the unwind code can intuit its way + /* Don't even pretend that the unwind code can intuit its way through a pair of interleaved post_modify iterators. Just provide the correct answer. 
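
ia64_compute_frame_size, shown in this hunk, accumulates the local, spill, and extra-spill areas and rounds the total to the 16-byte stack alignment. A sketch of just that arithmetic, assuming IA64_STACK_ALIGN rounds up to 16 bytes (the real function also assigns spill registers):

#include <stdio.h>

static long compute_frame_size (long locals, long spill, long extra_spill,
                                long outgoing_args)
{
  long total = locals + spill + extra_spill + outgoing_args;
  return (total + 15) & -16;   /* round up to 16-byte alignment */
}

int main (void)
{
  printf ("%ld\n", compute_frame_size (40, 24, 8, 16));   /* prints 96 */
  return 0;
}
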
*/ @@ -2239,10 +2514,7 @@ do_spill (move_fn, reg, cfa_off, frame_reg) } static void -do_restore (move_fn, reg, cfa_off) - rtx (*move_fn) PARAMS ((rtx, rtx, rtx)); - rtx reg; - HOST_WIDE_INT cfa_off; +do_restore (rtx (*move_fn) (rtx, rtx, rtx), rtx reg, HOST_WIDE_INT cfa_off) { int iter = spill_fill_data.next_iter; rtx insn; @@ -2254,28 +2526,22 @@ do_restore (move_fn, reg, cfa_off) /* Wrapper functions that discards the CONST_INT spill offset. These exist so that we can give gr_spill/gr_fill the offset they need and - use a consistant function interface. */ + use a consistent function interface. */ static rtx -gen_movdi_x (dest, src, offset) - rtx dest, src; - rtx offset ATTRIBUTE_UNUSED; +gen_movdi_x (rtx dest, rtx src, rtx offset ATTRIBUTE_UNUSED) { return gen_movdi (dest, src); } static rtx -gen_fr_spill_x (dest, src, offset) - rtx dest, src; - rtx offset ATTRIBUTE_UNUSED; +gen_fr_spill_x (rtx dest, rtx src, rtx offset ATTRIBUTE_UNUSED) { return gen_fr_spill (dest, src); } static rtx -gen_fr_restore_x (dest, src, offset) - rtx dest, src; - rtx offset ATTRIBUTE_UNUSED; +gen_fr_restore_x (rtx dest, rtx src, rtx offset ATTRIBUTE_UNUSED) { return gen_fr_restore (dest, src); } @@ -2303,7 +2569,7 @@ gen_fr_restore_x (dest, src, offset) adds instruction. */ void -ia64_expand_prologue () +ia64_expand_prologue (void) { rtx insn, ar_pfs_save_reg, ar_unat_save_reg; int i, epilogue_p, regno, alt_regno, cfa_off, n_varargs; @@ -2382,7 +2648,7 @@ ia64_expand_prologue () regno = next_scratch_gr_reg (); ar_pfs_save_reg = gen_rtx_REG (DImode, regno); - insn = emit_insn (gen_alloc (ar_pfs_save_reg, + insn = emit_insn (gen_alloc (ar_pfs_save_reg, GEN_INT (current_frame_info.n_input_regs), GEN_INT (current_frame_info.n_local_regs), GEN_INT (current_frame_info.n_output_regs), @@ -2412,7 +2678,7 @@ ia64_expand_prologue () else { regno = next_scratch_gr_reg (); - offset = gen_rtx_REG (DImode, regno); + offset = gen_rtx_REG (DImode, regno); emit_move_insn (offset, frame_size_rtx); } @@ -2630,7 +2896,7 @@ ia64_expand_prologue () { if (cfa_off & 15) abort (); - reg = gen_rtx_REG (TFmode, regno); + reg = gen_rtx_REG (XFmode, regno); do_spill (gen_fr_spill_x, reg, cfa_off, reg); cfa_off -= 16; } @@ -2649,8 +2915,7 @@ ia64_expand_prologue () insn to prevent such scheduling. */ void -ia64_expand_epilogue (sibcall_p) - int sibcall_p; +ia64_expand_epilogue (int sibcall_p) { rtx insn, reg, alt_reg, ar_unat_save_reg; int regno, alt_regno, cfa_off; @@ -2664,7 +2929,7 @@ ia64_expand_epilogue (sibcall_p) setup_spill_pointers (current_frame_info.n_spilled, hard_frame_pointer_rtx, 0); else - setup_spill_pointers (current_frame_info.n_spilled, stack_pointer_rtx, + setup_spill_pointers (current_frame_info.n_spilled, stack_pointer_rtx, current_frame_info.total_size); if (current_frame_info.total_size != 0) @@ -2716,7 +2981,7 @@ ia64_expand_epilogue (sibcall_p) } else ar_unat_save_reg = NULL_RTX; - + if (current_frame_info.reg_save_ar_pfs != 0) { alt_reg = gen_rtx_REG (DImode, current_frame_info.reg_save_ar_pfs); @@ -2766,7 +3031,7 @@ ia64_expand_epilogue (sibcall_p) do_restore (gen_gr_restore, reg, cfa_off); cfa_off -= 8; } - + /* Restore the branch registers. Handle B0 specially, as it may have gotten stored in some GR register. 
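
The gen_movdi_x, gen_fr_spill_x, and gen_fr_restore_x wrappers above exist only so that do_spill and do_restore can call through a single rtx (*)(rtx, rtx, rtx) function pointer; a generator that does not need the offset simply discards it. The same adapter pattern in miniature, with all names invented:

#include <stdio.h>

typedef int (*op_fn) (int a, int b, int offset);

static int add3 (int a, int b, int offset) { return a + b + offset; }

static int add2_x (int a, int b, int offset)
{
  (void) offset;               /* discarded, like gen_movdi_x's offset */
  return a + b;
}

static int apply (op_fn fn, int a, int b, int offset)
{
  return fn (a, b, offset);    /* one consistent interface for callers */
}

int main (void)
{
  printf ("%d %d\n", apply (add3, 1, 2, 10), apply (add2_x, 1, 2, 10));
  return 0;
}
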
*/ if (TEST_HARD_REG_BIT (current_frame_info.mask, BR_REG (0))) @@ -2783,7 +3048,7 @@ ia64_expand_epilogue (sibcall_p) reg = gen_rtx_REG (DImode, BR_REG (0)); emit_move_insn (reg, alt_reg); } - + for (regno = BR_REG (1); regno <= BR_REG (7); ++regno) if (TEST_HARD_REG_BIT (current_frame_info.mask, regno)) { @@ -2801,7 +3066,7 @@ ia64_expand_epilogue (sibcall_p) { if (cfa_off & 15) abort (); - reg = gen_rtx_REG (TFmode, regno); + reg = gen_rtx_REG (XFmode, regno); do_restore (gen_fr_restore_x, reg, cfa_off); cfa_off -= 16; } @@ -2867,17 +3132,17 @@ ia64_expand_epilogue (sibcall_p) if (cfun->machine->ia64_eh_epilogue_bsp) emit_insn (gen_set_bsp (cfun->machine->ia64_eh_epilogue_bsp)); - + if (! sibcall_p) emit_jump_insn (gen_return_internal (gen_rtx_REG (DImode, BR_REG (0)))); else { int fp = GR_REG (2); /* We need a throw away register here, r0 and r1 are reserved, so r2 is the - first available call clobbered register. If there was a frame_pointer - register, we may have swapped the names of r2 and HARD_FRAME_POINTER_REGNUM, + first available call clobbered register. If there was a frame_pointer + register, we may have swapped the names of r2 and HARD_FRAME_POINTER_REGNUM, so we have to make sure we're using the string "r2" when emitting - the register name for the assmbler. */ + the register name for the assembler. */ if (current_frame_info.reg_fp && current_frame_info.reg_fp == GR_REG (2)) fp = HARD_FRAME_POINTER_REGNUM; @@ -2900,7 +3165,7 @@ ia64_expand_epilogue (sibcall_p) function. */ int -ia64_direct_return () +ia64_direct_return (void) { if (reload_completed && ! frame_pointer_needed) { @@ -2921,9 +3186,7 @@ ia64_direct_return () during early compilation. */ rtx -ia64_return_addr_rtx (count, frame) - HOST_WIDE_INT count; - rtx frame ATTRIBUTE_UNUSED; +ia64_return_addr_rtx (HOST_WIDE_INT count, rtx frame ATTRIBUTE_UNUSED) { if (count != 0) return NULL; @@ -2934,8 +3197,7 @@ ia64_return_addr_rtx (count, frame) address is saved. */ void -ia64_split_return_addr_rtx (dest) - rtx dest; +ia64_split_return_addr_rtx (rtx dest) { rtx src; @@ -2984,9 +3246,7 @@ ia64_split_return_addr_rtx (dest) } int -ia64_hard_regno_rename_ok (from, to) - int from; - int to; +ia64_hard_regno_rename_ok (int from, int to) { /* Don't clobber any of the registers we reserved for the prologue. */ if (to == current_frame_info.reg_fp @@ -3020,18 +3280,15 @@ ia64_hard_regno_rename_ok (from, to) aligned objects and detect the cases when @fptr is needed. */ static bool -ia64_assemble_integer (x, size, aligned_p) - rtx x; - unsigned int size; - int aligned_p; +ia64_assemble_integer (rtx x, unsigned int size, int aligned_p) { - if (size == (TARGET_ILP32 ? 4 : 8) + if (size == POINTER_SIZE / BITS_PER_UNIT && aligned_p && !(TARGET_NO_PIC || TARGET_AUTO_PIC) && GET_CODE (x) == SYMBOL_REF - && SYMBOL_REF_FLAG (x)) + && SYMBOL_REF_FUNCTION_P (x)) { - if (TARGET_ILP32) + if (POINTER_SIZE == 32) fputs ("\tdata4\t@fptr(", asm_out_file); else fputs ("\tdata8\t@fptr(", asm_out_file); @@ -3045,9 +3302,7 @@ ia64_assemble_integer (x, size, aligned_p) /* Emit the function prologue. 
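
The ia64_assemble_integer change just above keys the directive off POINTER_SIZE rather than TARGET_ILP32, emitting a data4 or data8 @fptr relocation for function symbols. Reduced to its output decision, with a hypothetical emitter standing in for the asm_out_file machinery:

#include <stdio.h>

/* Emit a function-pointer reference through @fptr at the current
   pointer width; SYM and the printing are illustrative.  */
static void emit_fptr (const char *sym, int pointer_size_bits)
{
  fputs (pointer_size_bits == 32 ? "\tdata4\t@fptr(" : "\tdata8\t@fptr(",
         stdout);
  printf ("%s)\n", sym);
}

int main (void)
{
  emit_fptr ("my_function", 64);
  return 0;
}
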
*/ static void -ia64_output_function_prologue (file, size) - FILE *file; - HOST_WIDE_INT size ATTRIBUTE_UNUSED; +ia64_output_function_prologue (FILE *file, HOST_WIDE_INT size ATTRIBUTE_UNUSED) { int mask, grsave, grsave_prev; @@ -3097,7 +3352,7 @@ ia64_output_function_prologue (file, size) grsave = current_frame_info.reg_save_pr; } - if (mask) + if (mask && TARGET_GNU_AS) fprintf (file, "\t.prologue %d, %d\n", mask, ia64_dbx_register_number (grsave)); else @@ -3114,8 +3369,7 @@ ia64_output_function_prologue (file, size) /* Emit the .body directive at the scheduled end of the prologue. */ static void -ia64_output_function_end_prologue (file) - FILE *file; +ia64_output_function_end_prologue (FILE *file) { if (!flag_unwind_tables && (!flag_exceptions || USING_SJLJ_EXCEPTIONS)) return; @@ -3126,9 +3380,8 @@ ia64_output_function_end_prologue (file) /* Emit the function epilogue. */ static void -ia64_output_function_epilogue (file, size) - FILE *file ATTRIBUTE_UNUSED; - HOST_WIDE_INT size ATTRIBUTE_UNUSED; +ia64_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED, + HOST_WIDE_INT size ATTRIBUTE_UNUSED) { int i; @@ -3153,8 +3406,7 @@ ia64_output_function_epilogue (file, size) } int -ia64_dbx_register_number (regno) - int regno; +ia64_dbx_register_number (int regno) { /* In ia64_expand_prologue we quite literally renamed the frame pointer from its home at loc79 to something inside the register frame. We @@ -3179,11 +3431,24 @@ ia64_dbx_register_number (regno) } void -ia64_initialize_trampoline (addr, fnaddr, static_chain) - rtx addr, fnaddr, static_chain; +ia64_initialize_trampoline (rtx addr, rtx fnaddr, rtx static_chain) { rtx addr_reg, eight = GEN_INT (8); + /* The Intel assembler requires that the global __ia64_trampoline symbol + be declared explicitly */ + if (!TARGET_GNU_AS) + { + static bool declared_ia64_trampoline = false; + + if (!declared_ia64_trampoline) + { + declared_ia64_trampoline = true; + (*targetm.asm_out.globalize_label) (asm_out_file, + "__ia64_trampoline"); + } + } + /* Load up our iterator. */ addr_reg = gen_reg_rtx (Pmode); emit_move_insn (addr_reg, addr); @@ -3212,12 +3477,9 @@ ia64_initialize_trampoline (addr, fnaddr, static_chain) We generate the actual spill instructions during prologue generation. */ void -ia64_setup_incoming_varargs (cum, int_mode, type, pretend_size, second_time) - CUMULATIVE_ARGS cum; - int int_mode; - tree type; - int * pretend_size; - int second_time ATTRIBUTE_UNUSED; +ia64_setup_incoming_varargs (CUMULATIVE_ARGS cum, int int_mode, tree type, + int * pretend_size, + int second_time ATTRIBUTE_UNUSED) { /* Skip the current argument. */ ia64_function_arg_advance (&cum, int_mode, type, 1); @@ -3239,9 +3501,7 @@ ia64_setup_incoming_varargs (cum, int_mode, type, pretend_size, second_time) SFmode). 128-bit quad-precision floats are excluded. */ static enum machine_mode -hfa_element_mode (type, nested) - tree type; - int nested; +hfa_element_mode (tree type, int nested) { enum machine_mode element_mode = VOIDmode; enum machine_mode mode; @@ -3263,16 +3523,15 @@ hfa_element_mode (type, nested) types though. 
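
The trampoline hunk above globalizes the (real) __ia64_trampoline symbol exactly once when the Intel assembler is in use, guarded by a function-local static flag. The emit-once pattern in isolation, with printf standing in for targetm.asm_out.globalize_label:

#include <stdio.h>

static void declare_trampoline_once (void)
{
  static int declared = 0;
  if (!declared)
    {
      declared = 1;
      printf ("\t.global __ia64_trampoline\n");
    }
}

int main (void)
{
  declare_trampoline_once ();
  declare_trampoline_once ();  /* second call emits nothing */
  return 0;
}
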
*/ case COMPLEX_TYPE: if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_COMPLEX_FLOAT - && (TYPE_MODE (type) != TCmode || INTEL_EXTENDED_IEEE_FORMAT)) - return mode_for_size (GET_MODE_UNIT_SIZE (TYPE_MODE (type)) - * BITS_PER_UNIT, MODE_FLOAT, 0); + && TYPE_MODE (type) != TCmode) + return GET_MODE_INNER (TYPE_MODE (type)); else return VOIDmode; case REAL_TYPE: /* We want to return VOIDmode for raw REAL_TYPEs, but the actual mode if this is contained within an aggregate. */ - if (nested && (TYPE_MODE (type) != TFmode || INTEL_EXTENDED_IEEE_FORMAT)) + if (nested && TYPE_MODE (type) != TFmode) return TYPE_MODE (type); else return VOIDmode; @@ -3315,40 +3574,62 @@ hfa_element_mode (type, nested) return VOIDmode; } +/* Return the number of words required to hold a quantity of TYPE and MODE + when passed as an argument. */ +static int +ia64_function_arg_words (tree type, enum machine_mode mode) +{ + int words; + + if (mode == BLKmode) + words = int_size_in_bytes (type); + else + words = GET_MODE_SIZE (mode); + + return (words + UNITS_PER_WORD - 1) / UNITS_PER_WORD; /* round up */ +} + +/* Return the number of registers that should be skipped so the current + argument (described by TYPE and WORDS) will be properly aligned. + + Integer and float arguments larger than 8 bytes start at the next + even boundary. Aggregates larger than 8 bytes start at the next + even boundary if the aggregate has 16 byte alignment. Note that + in the 32-bit ABI, TImode and TFmode have only 8-byte alignment + but are still to be aligned in registers. + + ??? The ABI does not specify how to handle aggregates with + alignment from 9 to 15 bytes, or greater than 16. We handle them + all as if they had 16 byte alignment. Such aggregates can occur + only if gcc extensions are used. */ +static int +ia64_function_arg_offset (CUMULATIVE_ARGS *cum, tree type, int words) +{ + if ((cum->words & 1) == 0) + return 0; + + if (type + && TREE_CODE (type) != INTEGER_TYPE + && TREE_CODE (type) != REAL_TYPE) + return TYPE_ALIGN (type) > 8 * BITS_PER_UNIT; + else + return words > 1; +} + /* Return rtx for register where argument is passed, or zero if it is passed on the stack. */ - /* ??? 128-bit quad-precision floats are always passed in general registers. */ rtx -ia64_function_arg (cum, mode, type, named, incoming) - CUMULATIVE_ARGS *cum; - enum machine_mode mode; - tree type; - int named; - int incoming; +ia64_function_arg (CUMULATIVE_ARGS *cum, enum machine_mode mode, tree type, + int named, int incoming) { int basereg = (incoming ? GR_ARG_FIRST : AR_ARG_FIRST); - int words = (((mode == BLKmode ? int_size_in_bytes (type) - : GET_MODE_SIZE (mode)) + UNITS_PER_WORD - 1) - / UNITS_PER_WORD); - int offset = 0; + int words = ia64_function_arg_words (type, mode); + int offset = ia64_function_arg_offset (cum, type, words); enum machine_mode hfa_mode = VOIDmode; - /* Integer and float arguments larger than 8 bytes start at the next even - boundary. Aggregates larger than 8 bytes start at the next even boundary - if the aggregate has 16 byte alignment. Net effect is that types with - alignment greater than 8 start at the next even boundary. */ - /* ??? The ABI does not specify how to handle aggregates with alignment from - 9 to 15 bytes, or greater than 16. We handle them all as if they had - 16 byte alignment. Such aggregates can occur only if gcc extensions are - used. */ - if ((type ? (TYPE_ALIGN (type) > 8 * BITS_PER_UNIT) - : (words > 1)) - && (cum->words & 1)) - offset = 1; - /* If all argument slots are used, then it must go on the stack. 
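
The two new helpers above factor out arithmetic that was previously duplicated across ia64_function_arg, ia64_function_arg_partial_nregs, and ia64_function_arg_advance. A standalone rendering of that arithmetic for scalar (non-aggregate) arguments, with the aggregate-alignment branch omitted:

#include <stdio.h>

#define UNITS_PER_WORD 8

/* words = ceiling of the byte size over 8-byte words.  */
static int arg_words (int size_in_bytes)
{
  return (size_in_bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
}

/* One pad slot when a multi-word scalar would start in an odd slot.  */
static int arg_offset (int slots_used, int words)
{
  if ((slots_used & 1) == 0)
    return 0;                  /* already on an even slot boundary */
  return words > 1;
}

int main (void)
{
  int words = arg_words (16);  /* a 16-byte scalar occupies 2 words */
  printf ("words=%d pad=%d\n", words, arg_offset (3, words));  /* 2 1 */
  return 0;
}
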
*/ if (cum->words + offset >= MAX_ARGUMENT_SLOTS) return 0; @@ -3408,6 +3689,7 @@ ia64_function_arg (cum, mode, type, named, incoming) for (; offset < byte_size && int_regs < MAX_ARGUMENT_SLOTS; i++) { enum machine_mode gr_mode = DImode; + unsigned int gr_size; /* If we have an odd 4 byte hunk because we ran out of FR regs, then this goes in a GR reg left adjusted/little endian, right @@ -3421,22 +3703,25 @@ ia64_function_arg (cum, mode, type, named, incoming) adjusted/little endian. */ else if (byte_size - offset == 4) gr_mode = SImode; - /* Complex floats need to have float mode. */ - if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT) - gr_mode = hfa_mode; loc[i] = gen_rtx_EXPR_LIST (VOIDmode, gen_rtx_REG (gr_mode, (basereg + int_regs)), GEN_INT (offset)); - offset += GET_MODE_SIZE (gr_mode); - int_regs += GET_MODE_SIZE (gr_mode) <= UNITS_PER_WORD - ? 1 : GET_MODE_SIZE (gr_mode) / UNITS_PER_WORD; + + gr_size = GET_MODE_SIZE (gr_mode); + offset += gr_size; + if (gr_size == UNITS_PER_WORD + || (gr_size < UNITS_PER_WORD && offset % UNITS_PER_WORD == 0)) + int_regs++; + else if (gr_size > UNITS_PER_WORD) + int_regs += gr_size / UNITS_PER_WORD; } - /* If we ended up using just one location, just return that one loc. */ + /* If we ended up using just one location, just return that one loc, but + change the mode back to the argument mode. */ if (i == 1) - return XEXP (loc[0], 0); + return gen_rtx_REG (mode, REGNO (XEXP (loc[0], 0))); else return gen_rtx_PARALLEL (mode, gen_rtvec_v (i, loc)); } @@ -3444,8 +3729,8 @@ ia64_function_arg (cum, mode, type, named, incoming) /* Integral and aggregates go in general registers. If we have run out of FR registers, then FP values must also go in general registers. This can happen when we have a SFmode HFA. */ - else if (((mode == TFmode) && ! INTEL_EXTENDED_IEEE_FORMAT) - || (! FLOAT_MODE_P (mode) || cum->fp_regs == MAX_ARGUMENT_SLOTS)) + else if (mode == TFmode || mode == TCmode + || (! FLOAT_MODE_P (mode) || cum->fp_regs == MAX_ARGUMENT_SLOTS)) { int byte_size = ((mode == BLKmode) ? int_size_in_bytes (type) : GET_MODE_SIZE (mode)); @@ -3467,24 +3752,37 @@ ia64_function_arg (cum, mode, type, named, incoming) } /* If there is a prototype, then FP values go in a FR register when - named, and in a GR registeer when unnamed. */ + named, and in a GR register when unnamed. */ else if (cum->prototype) { - if (! named) - return gen_rtx_REG (mode, basereg + cum->words + offset); - else + if (named) return gen_rtx_REG (mode, FR_ARG_FIRST + cum->fp_regs); + /* In big-endian mode, an anonymous SFmode value must be represented + as (parallel:SF [(expr_list (reg:DI n) (const_int 0))]) to force + the value into the high half of the general register. */ + else if (BYTES_BIG_ENDIAN && mode == SFmode) + return gen_rtx_PARALLEL (mode, + gen_rtvec (1, + gen_rtx_EXPR_LIST (VOIDmode, + gen_rtx_REG (DImode, basereg + cum->words + offset), + const0_rtx))); + else + return gen_rtx_REG (mode, basereg + cum->words + offset); } /* If there is no prototype, then FP values go in both FR and GR registers. */ else { + /* See comment above. */ + enum machine_mode inner_mode = + (BYTES_BIG_ENDIAN && mode == SFmode) ? DImode : mode; + rtx fp_reg = gen_rtx_EXPR_LIST (VOIDmode, gen_rtx_REG (mode, (FR_ARG_FIRST + cum->fp_regs)), const0_rtx); rtx gr_reg = gen_rtx_EXPR_LIST (VOIDmode, - gen_rtx_REG (mode, + gen_rtx_REG (inner_mode, (basereg + cum->words + offset)), const0_rtx); @@ -3498,23 +3796,11 @@ ia64_function_arg (cum, mode, type, named, incoming) in memory. 
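
For a homogeneous FP aggregate, the code above builds a PARALLEL pairing each element with a floating-point argument register and its byte offset. The sketch below prints the pairing an HFA of three floats would receive; the register base is symbolic, merely standing in for FR_ARG_FIRST:

#include <stdio.h>

int main (void)
{
  int byte_size = 12;          /* struct { float x, y, z; } */
  int elem_size = 4;
  int fp_reg_base = 8;         /* symbolic stand-in for FR_ARG_FIRST */
  int offset, i = 0;

  for (offset = 0; offset < byte_size; offset += elem_size, i++)
    printf ("elem %d: (reg f%d, offset %d)\n", i, fp_reg_base + i, offset);
  return 0;
}
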
*/ int -ia64_function_arg_partial_nregs (cum, mode, type, named) - CUMULATIVE_ARGS *cum; - enum machine_mode mode; - tree type; - int named ATTRIBUTE_UNUSED; -{ - int words = (((mode == BLKmode ? int_size_in_bytes (type) - : GET_MODE_SIZE (mode)) + UNITS_PER_WORD - 1) - / UNITS_PER_WORD); - int offset = 0; - - /* Arguments with alignment larger than 8 bytes start at the next even - boundary. */ - if ((type ? (TYPE_ALIGN (type) > 8 * BITS_PER_UNIT) - : (words > 1)) - && (cum->words & 1)) - offset = 1; +ia64_function_arg_partial_nregs (CUMULATIVE_ARGS *cum, enum machine_mode mode, + tree type, int named ATTRIBUTE_UNUSED) +{ + int words = ia64_function_arg_words (type, mode); + int offset = ia64_function_arg_offset (cum, type, words); /* If all argument slots are used, then it must go on the stack. */ if (cum->words + offset >= MAX_ARGUMENT_SLOTS) @@ -3535,29 +3821,17 @@ ia64_function_arg_partial_nregs (cum, mode, type, named) ia64_function_arg. */ void -ia64_function_arg_advance (cum, mode, type, named) - CUMULATIVE_ARGS *cum; - enum machine_mode mode; - tree type; - int named; -{ - int words = (((mode == BLKmode ? int_size_in_bytes (type) - : GET_MODE_SIZE (mode)) + UNITS_PER_WORD - 1) - / UNITS_PER_WORD); - int offset = 0; +ia64_function_arg_advance (CUMULATIVE_ARGS *cum, enum machine_mode mode, + tree type, int named) +{ + int words = ia64_function_arg_words (type, mode); + int offset = ia64_function_arg_offset (cum, type, words); enum machine_mode hfa_mode = VOIDmode; /* If all arg slots are already full, then there is nothing to do. */ if (cum->words >= MAX_ARGUMENT_SLOTS) return; - /* Arguments with alignment larger than 8 bytes start at the next even - boundary. */ - if ((type ? (TYPE_ALIGN (type) > 8 * BITS_PER_UNIT) - : (words > 1)) - && (cum->words & 1)) - offset = 1; - cum->words += words + offset; /* Check for and handle homogeneous FP aggregates. */ @@ -3607,7 +3881,7 @@ ia64_function_arg_advance (cum, mode, type, named) cum->int_regs = cum->words; /* If there is a prototype, then FP values go in a FR register when - named, and in a GR registeer when unnamed. */ + named, and in a GR register when unnamed. */ else if (cum->prototype) { if (! named) @@ -3619,7 +3893,7 @@ ia64_function_arg_advance (cum, mode, type, named) /* If there is no prototype, then FP values go in both FR and GR registers. */ else - { + { /* ??? Complex types should not reach here. */ cum->fp_regs += (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT ? 2 : 1); cum->int_regs = cum->words; @@ -3630,34 +3904,49 @@ ia64_function_arg_advance (cum, mode, type, named) /* ??? At present this is a GCC extension to the IA-64 ABI. */ int -ia64_function_arg_pass_by_reference (cum, mode, type, named) - CUMULATIVE_ARGS *cum ATTRIBUTE_UNUSED; - enum machine_mode mode ATTRIBUTE_UNUSED; - tree type; - int named ATTRIBUTE_UNUSED; +ia64_function_arg_pass_by_reference (CUMULATIVE_ARGS *cum ATTRIBUTE_UNUSED, + enum machine_mode mode ATTRIBUTE_UNUSED, + tree type, int named ATTRIBUTE_UNUSED) { return type && TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST; } + +/* True if it is OK to do sibling call optimization for the specified + call expression EXP. DECL will be the called function, or NULL if + this is an indirect call. */ +static bool +ia64_function_ok_for_sibcall (tree decl, tree exp ATTRIBUTE_UNUSED) +{ + /* We must always return with our current GP. This means we can + only sibcall to functions defined in the current module. */ + return decl && (*targetm.binds_local_p) (decl); +} /* Implement va_arg. 
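
ia64_function_arg_partial_nregs, at the top of this hunk, answers one question: when an argument straddles the end of the eight argument slots, how many of its words still travel in registers. The same computation in isolation:

#include <stdio.h>

#define MAX_ARGUMENT_SLOTS 8

static int partial_nregs (int used, int offset, int words)
{
  if (used + offset >= MAX_ARGUMENT_SLOTS)
    return 0;                                   /* all on the stack */
  if (used + offset + words <= MAX_ARGUMENT_SLOTS)
    return 0;                                   /* all in registers */
  return MAX_ARGUMENT_SLOTS - (used + offset);  /* the register part */
}

int main (void)
{
  /* 6 slots used, 4-word argument: 2 words in registers, 2 on stack.  */
  printf ("%d\n", partial_nregs (6, 0, 4));
  return 0;
}
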
*/ rtx -ia64_va_arg (valist, type) - tree valist, type; +ia64_va_arg (tree valist, tree type) { tree t; /* Variable sized types are passed by reference. */ if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST) { - rtx addr = std_expand_builtin_va_arg (valist, build_pointer_type (type)); - return gen_rtx_MEM (ptr_mode, force_reg (Pmode, addr)); + rtx addr = force_reg (ptr_mode, + std_expand_builtin_va_arg (valist, build_pointer_type (type))); +#ifdef POINTERS_EXTEND_UNSIGNED + addr = convert_memory_address (Pmode, addr); +#endif + return gen_rtx_MEM (ptr_mode, addr); } - /* Arguments with alignment larger than 8 bytes start at the next even - boundary. */ - if (TYPE_ALIGN (type) > 8 * BITS_PER_UNIT) + /* Aggregate arguments with alignment larger than 8 bytes start at + the next even boundary. Integer and floating point arguments + do so if they are larger than 8 bytes, whether or not they are + also aligned larger than 8 bytes. */ + if ((TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == INTEGER_TYPE) + ? int_size_in_bytes (type) > 8 : TYPE_ALIGN (type) > 8 * BITS_PER_UNIT) { t = build (PLUS_EXPR, TREE_TYPE (valist), valist, build_int_2 (2 * UNITS_PER_WORD - 1, 0)); @@ -3675,8 +3964,7 @@ ia64_va_arg (valist, type) in a register. */ int -ia64_return_in_memory (valtype) - tree valtype; +ia64_return_in_memory (tree valtype) { enum machine_mode mode; enum machine_mode hfa_mode; @@ -3712,9 +4000,7 @@ ia64_return_in_memory (valtype) /* Return rtx for register that holds the function return value. */ rtx -ia64_function_value (valtype, func) - tree valtype; - tree func ATTRIBUTE_UNUSED; +ia64_function_value (tree valtype, tree func ATTRIBUTE_UNUSED) { enum machine_mode mode; enum machine_mode hfa_mode; @@ -3747,8 +4033,7 @@ ia64_function_value (valtype, func) else return gen_rtx_PARALLEL (mode, gen_rtvec_v (i, loc)); } - else if (FLOAT_TYPE_P (valtype) && - ((mode != TFmode) || INTEL_EXTENDED_IEEE_FORMAT)) + else if (FLOAT_TYPE_P (valtype) && mode != TFmode && mode != TCmode) return gen_rtx_REG (mode, FR_ARG_FIRST); else { @@ -3777,15 +4062,27 @@ ia64_function_value (valtype, func) } } +/* This is called from dwarf2out.c via ASM_OUTPUT_DWARF_DTPREL. + We need to emit DTP-relative relocations. */ + +void +ia64_output_dwarf_dtprel (FILE *file, int size, rtx x) +{ + if (size != 8) + abort (); + fputs ("\tdata8.ua\t@dtprel(", file); + output_addr_const (file, x); + fputs (")", file); +} + /* Print a memory address as an operand to reference that memory location. */ /* ??? Do we need this? It gets used only for 'a' operands. We could perhaps also call this from ia64_print_operand for memory addresses. */ void -ia64_print_operand_address (stream, address) - FILE * stream ATTRIBUTE_UNUSED; - rtx address ATTRIBUTE_UNUSED; +ia64_print_operand_address (FILE * stream ATTRIBUTE_UNUSED, + rtx address ATTRIBUTE_UNUSED) { } @@ -3810,10 +4107,7 @@ ia64_print_operand_address (stream, address) r Print register name, or constant 0 as r0. HP compatibility for Linux kernel. */ void -ia64_print_operand (file, x, code) - FILE * file; - rtx x; - int code; +ia64_print_operand (FILE * file, rtx x, int code) { const char *str; @@ -3921,9 +4215,7 @@ ia64_print_operand (file, x, code) break; } - putc (',', file); - putc (' ', file); - fprintf (file, HOST_WIDE_INT_PRINT_DEC, value); + fprintf (file, ", " HOST_WIDE_INT_PRINT_DEC, value); return; } @@ -3974,7 +4266,7 @@ ia64_print_operand (file, x, code) case '+': { const char *which; - + /* For conditional branches, returns or calls, substitute sptk, dptk, dpnt, or spnt for %s. 
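
Variable-sized types are passed by reference, so the va_arg code at the top of this hunk fetches a pointer from the va area (extending it to Pmode when POINTERS_EXTEND_UNSIGNED is defined) and dereferences it. The convention acted out with ordinary stdarg and an invented struct:

#include <stdarg.h>
#include <stdio.h>
#include <string.h>

struct big { char data[100]; };

static void show (int n, ...)
{
  va_list ap;
  va_start (ap, n);
  {
    struct big *p = va_arg (ap, struct big *);  /* fetch the pointer */
    printf ("%s\n", p->data);                   /* then use the object */
  }
  va_end (ap);
}

int main (void)
{
  struct big b;
  strcpy (b.data, "passed by reference");
  show (1, &b);
  return 0;
}
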
*/ x = find_reg_note (current_output_insn, REG_BR_PROB, 0); @@ -4047,13 +4339,88 @@ ia64_print_operand (file, x, code) return; } -/* Calulate the cost of moving data from a register in class FROM to +/* Compute a (partial) cost for rtx X. Return true if the complete + cost has been computed, and false if subexpressions should be + scanned. In either case, *TOTAL contains the cost result. */ +/* ??? This is incomplete. */ + +static bool +ia64_rtx_costs (rtx x, int code, int outer_code, int *total) +{ + switch (code) + { + case CONST_INT: + switch (outer_code) + { + case SET: + *total = CONST_OK_FOR_J (INTVAL (x)) ? 0 : COSTS_N_INSNS (1); + return true; + case PLUS: + if (CONST_OK_FOR_I (INTVAL (x))) + *total = 0; + else if (CONST_OK_FOR_J (INTVAL (x))) + *total = 1; + else + *total = COSTS_N_INSNS (1); + return true; + default: + if (CONST_OK_FOR_K (INTVAL (x)) || CONST_OK_FOR_L (INTVAL (x))) + *total = 0; + else + *total = COSTS_N_INSNS (1); + return true; + } + + case CONST_DOUBLE: + *total = COSTS_N_INSNS (1); + return true; + + case CONST: + case SYMBOL_REF: + case LABEL_REF: + *total = COSTS_N_INSNS (3); + return true; + + case MULT: + /* For multiplies wider than HImode, we have to go to the FPU, + which normally involves copies. Plus there's the latency + of the multiply itself, and the latency of the instructions to + transfer integer regs to FP regs. */ + /* ??? Check for FP mode. */ + if (GET_MODE_SIZE (GET_MODE (x)) > 2) + *total = COSTS_N_INSNS (10); + else + *total = COSTS_N_INSNS (2); + return true; + + case PLUS: + case MINUS: + case ASHIFT: + case ASHIFTRT: + case LSHIFTRT: + *total = COSTS_N_INSNS (1); + return true; + + case DIV: + case UDIV: + case MOD: + case UMOD: + /* We make divide expensive, so that divide-by-constant will be + optimized to a multiply. */ + *total = COSTS_N_INSNS (60); + return true; + + default: + return false; + } +} + +/* Calculate the cost of moving data from a register in class FROM to one in class TO, using MODE. */ int -ia64_register_move_cost (mode, from, to) - enum machine_mode mode; - enum reg_class from, to; +ia64_register_move_cost (enum machine_mode mode, enum reg_class from, + enum reg_class to) { /* ADDL_REGS is the same as GR_REGS for movement purposes. */ if (to == ADDL_REGS) @@ -4069,11 +4436,11 @@ ia64_register_move_cost (mode, from, to) to = from, from = tmp; } - /* Moving from FR<->GR in TFmode must be more expensive than 2, + /* Moving from FR<->GR in XFmode must be more expensive than 2, so that we get secondary memory reloads. Between FR_REGS, we have to make this at least as expensive as MEMORY_MOVE_COST to avoid spectacularly poor register class preferencing. */ - if (mode == TFmode) + if (mode == XFmode) { if (to != GR_REGS || from != GR_REGS) return MEMORY_MOVE_COST (mode, to, 0); @@ -4125,10 +4492,8 @@ ia64_register_move_cost (mode, from, to) is required. */ enum reg_class -ia64_secondary_reload_class (class, mode, x) - enum reg_class class; - enum machine_mode mode ATTRIBUTE_UNUSED; - rtx x; +ia64_secondary_reload_class (enum reg_class class, + enum machine_mode mode ATTRIBUTE_UNUSED, rtx x) { int regno = -1; @@ -4162,10 +4527,10 @@ ia64_secondary_reload_class (class, mode, x) break; case FR_REGS: - /* Need to go through general regsters to get to other class regs. */ + /* Need to go through general registers to get to other class regs. */ if (regno >= 0 && ! (FR_REGNO_P (regno) || GENERAL_REGNO_P (regno))) return GR_REGS; - + /* This can happen when a paradoxical subreg is an operand to the muldi3 pattern. */ /* ??? 
This shouldn't be necessary after instruction scheduling is @@ -4206,28 +4571,19 @@ ia64_secondary_reload_class (class, mode, x) return GR_REGS; break; - case GR_REGS: - /* Since we have no offsettable memory addresses, we need a temporary - to hold the address of the second word. */ - if (mode == TImode) - return GR_REGS; - break; - default: break; } return NO_REGS; } + /* Emit text to declare externally defined variables and functions, because the Intel assembler does not support undefined externals. */ void -ia64_asm_output_external (file, decl, name) - FILE *file; - tree decl; - const char *name; +ia64_asm_output_external (FILE *file, tree decl, const char *name) { int save_referenced; @@ -4237,7 +4593,7 @@ ia64_asm_output_external (file, decl, name) if (TARGET_GNU_AS && (!TARGET_HPUX_LD || TREE_CODE (decl) != FUNCTION_DECL - || strstr(name, "__builtin_") == name)) + || strstr (name, "__builtin_") == name)) return; /* ??? The Intel assembler creates a reference that needs to be satisfied by @@ -4253,7 +4609,7 @@ ia64_asm_output_external (file, decl, name) return; if (TARGET_HPUX_LD) - ia64_hpux_add_extern_decl (name); + ia64_hpux_add_extern_decl (decl); else { /* assemble_name will set TREE_SYMBOL_REFERENCED, so we must save and @@ -4269,8 +4625,7 @@ ia64_asm_output_external (file, decl, name) /* Parse the -mfixed-range= option string. */ static void -fix_range (const_str) - const char *const_str; +fix_range (const char *const_str) { int i, first, last; char *str, *dash, *comma; @@ -4333,7 +4688,7 @@ fix_range (const_str) } static struct machine_function * -ia64_init_machine_status () +ia64_init_machine_status (void) { return ggc_alloc_cleared (sizeof (struct machine_function)); } @@ -4341,8 +4696,25 @@ ia64_init_machine_status () /* Handle TARGET_OPTIONS switches. */ void -ia64_override_options () +ia64_override_options (void) { + static struct pta + { + const char *const name; /* processor name or nickname. */ + const enum processor_type processor; + } + const processor_alias_table[] = + { + {"itanium", PROCESSOR_ITANIUM}, + {"itanium1", PROCESSOR_ITANIUM}, + {"merced", PROCESSOR_ITANIUM}, + {"itanium2", PROCESSOR_ITANIUM2}, + {"mckinley", PROCESSOR_ITANIUM2}, + }; + + int const pta_size = ARRAY_SIZE (processor_alias_table); + int i; + if (TARGET_AUTO_PIC) target_flags |= MASK_CONST_GP; @@ -4358,6 +4730,18 @@ ia64_override_options () target_flags &= ~MASK_INLINE_INT_DIV_THR; } + if (TARGET_INLINE_SQRT_LAT && TARGET_INLINE_SQRT_THR) + { + warning ("cannot optimize square root for both latency and throughput"); + target_flags &= ~MASK_INLINE_SQRT_THR; + } + + if (TARGET_INLINE_SQRT_LAT) + { + warning ("not yet implemented: latency-optimized inline square root"); + target_flags &= ~MASK_INLINE_SQRT_LAT; + } + if (ia64_fixed_range_string) fix_range (ia64_fixed_range_string); @@ -4371,35 +4755,32 @@ ia64_override_options () ia64_tls_size = tmp; } + if (!ia64_tune_string) + ia64_tune_string = "itanium2"; + + for (i = 0; i < pta_size; i++) + if (! strcmp (ia64_tune_string, processor_alias_table[i].name)) + { + ia64_tune = processor_alias_table[i].processor; + break; + } + + if (i == pta_size) + error ("bad value (%s) for -tune= switch", ia64_tune_string); + ia64_flag_schedule_insns2 = flag_schedule_insns_after_reload; flag_schedule_insns_after_reload = 0; ia64_section_threshold = g_switch_set ? g_switch_value : IA64_DEFAULT_GVALUE; init_machine_status = ia64_init_machine_status; - - /* Tell the compiler which flavor of TFmode we're using. 
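
fix_range, rewritten above, walks the -mfixed-range= string as dash-separated register pairs, with pairs separated by commas. A stripped-down walk over the same syntax; the real function additionally maps names to hard register numbers and marks them in fixed_regs and call_used_regs:

#include <stdio.h>
#include <string.h>

int main (void)
{
  char str[] = "f32-f127,r4-r7";   /* sample option value */
  char *p = str;

  while (*p)
    {
      char *dash = strchr (p, '-');
      char *comma = strchr (p, ',');
      if (!dash)
        {
          fprintf (stderr,
                   "value of -mfixed-range must have form REG1-REG2\n");
          return 1;
        }
      *dash = '\0';                /* split the pair */
      if (comma)
        *comma = '\0';             /* split off the rest of the list */
      printf ("fix %s through %s\n", p, dash + 1);
      p = comma ? comma + 1 : dash + 1 + strlen (dash + 1);
    }
  return 0;
}
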
*/ - if (INTEL_EXTENDED_IEEE_FORMAT) - real_format_for_mode[TFmode - QFmode] = &ieee_extended_intel_128_format; } -static enum attr_itanium_requires_unit0 ia64_safe_itanium_requires_unit0 PARAMS((rtx)); -static enum attr_itanium_class ia64_safe_itanium_class PARAMS((rtx)); -static enum attr_type ia64_safe_type PARAMS((rtx)); - -static enum attr_itanium_requires_unit0 -ia64_safe_itanium_requires_unit0 (insn) - rtx insn; -{ - if (recog_memoized (insn) >= 0) - return get_attr_itanium_requires_unit0 (insn); - else - return ITANIUM_REQUIRES_UNIT0_NO; -} +static enum attr_itanium_class ia64_safe_itanium_class (rtx); +static enum attr_type ia64_safe_type (rtx); static enum attr_itanium_class -ia64_safe_itanium_class (insn) - rtx insn; +ia64_safe_itanium_class (rtx insn) { if (recog_memoized (insn) >= 0) return get_attr_itanium_class (insn); @@ -4408,8 +4789,7 @@ ia64_safe_itanium_class (insn) } static enum attr_type -ia64_safe_type (insn) - rtx insn; +ia64_safe_type (rtx insn) { if (recog_memoized (insn) >= 0) return get_attr_type (insn); @@ -4487,26 +4867,21 @@ struct reg_flags unsigned int is_sibcall : 1; /* Is this a sibling or normal call? */ }; -static void rws_update PARAMS ((struct reg_write_state *, int, - struct reg_flags, int)); -static int rws_access_regno PARAMS ((int, struct reg_flags, int)); -static int rws_access_reg PARAMS ((rtx, struct reg_flags, int)); -static void update_set_flags PARAMS ((rtx, struct reg_flags *, int *, rtx *)); -static int set_src_needs_barrier PARAMS ((rtx, struct reg_flags, int, rtx)); -static int rtx_needs_barrier PARAMS ((rtx, struct reg_flags, int)); -static void init_insn_group_barriers PARAMS ((void)); -static int group_barrier_needed_p PARAMS ((rtx)); -static int safe_group_barrier_needed_p PARAMS ((rtx)); +static void rws_update (struct reg_write_state *, int, struct reg_flags, int); +static int rws_access_regno (int, struct reg_flags, int); +static int rws_access_reg (rtx, struct reg_flags, int); +static void update_set_flags (rtx, struct reg_flags *, int *, rtx *); +static int set_src_needs_barrier (rtx, struct reg_flags, int, rtx); +static int rtx_needs_barrier (rtx, struct reg_flags, int); +static void init_insn_group_barriers (void); +static int group_barrier_needed_p (rtx); +static int safe_group_barrier_needed_p (rtx); /* Update *RWS for REGNO, which is being written by the current instruction, with predicate PRED, and associated register flags in FLAGS. */ static void -rws_update (rws, regno, flags, pred) - struct reg_write_state *rws; - int regno; - struct reg_flags flags; - int pred; +rws_update (struct reg_write_state *rws, int regno, struct reg_flags flags, int pred) { if (pred) rws[regno].write_count++; @@ -4524,10 +4899,7 @@ rws_update (rws, regno, flags, pred) a dependency with an earlier instruction in the same group. */ static int -rws_access_regno (regno, flags, pred) - int regno; - struct reg_flags flags; - int pred; +rws_access_regno (int regno, struct reg_flags flags, int pred) { int need_barrier = 0; @@ -4562,7 +4934,7 @@ rws_access_regno (regno, flags, pred) /* ??? This assumes that P and P+1 are always complementary predicates for P even. 
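
The new processor_alias_table in ia64_override_options resolves -mtune= with a linear name scan, defaulting to itanium2. The same lookup as a standalone program; the alias set is copied from the hunk, and the diagnostics are simplified:

#include <stdio.h>
#include <string.h>

enum processor_type { PROCESSOR_ITANIUM, PROCESSOR_ITANIUM2 };

static const struct { const char *name; enum processor_type proc; } pta[] =
{
  {"itanium",  PROCESSOR_ITANIUM},
  {"itanium1", PROCESSOR_ITANIUM},
  {"merced",   PROCESSOR_ITANIUM},
  {"itanium2", PROCESSOR_ITANIUM2},
  {"mckinley", PROCESSOR_ITANIUM2},
};

int main (int argc, char **argv)
{
  const char *tune = argc > 1 ? argv[1] : "itanium2";   /* the default */
  size_t i;

  for (i = 0; i < sizeof pta / sizeof pta[0]; i++)
    if (!strcmp (tune, pta[i].name))
      {
        printf ("tuning for processor %d\n", (int) pta[i].proc);
        return 0;
      }
  fprintf (stderr, "bad value (%s) for -tune= switch\n", tune);
  return 1;
}
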
*/ if (flags.is_and && rws_sum[regno].written_by_and) - ; + ; else if (flags.is_or && rws_sum[regno].written_by_or) ; else if ((rws_sum[regno].first_pred ^ 1) != pred) @@ -4645,10 +5017,7 @@ rws_access_regno (regno, flags, pred) } static int -rws_access_reg (reg, flags, pred) - rtx reg; - struct reg_flags flags; - int pred; +rws_access_reg (rtx reg, struct reg_flags flags, int pred) { int regno = REGNO (reg); int n = HARD_REGNO_NREGS (REGNO (reg), GET_MODE (reg)); @@ -4668,11 +5037,7 @@ rws_access_reg (reg, flags, pred) the condition, stored in *PFLAGS, *PPRED and *PCOND. */ static void -update_set_flags (x, pflags, ppred, pcond) - rtx x; - struct reg_flags *pflags; - int *ppred; - rtx *pcond; +update_set_flags (rtx x, struct reg_flags *pflags, int *ppred, rtx *pcond) { rtx src = SET_SRC (x); @@ -4686,7 +5051,7 @@ update_set_flags (x, pflags, ppred, pcond) case IF_THEN_ELSE: if (SET_DEST (x) == pc_rtx) /* X is a conditional branch. */ - return; + return; else { int is_complemented = 0; @@ -4749,13 +5114,9 @@ update_set_flags (x, pflags, ppred, pcond) source of a given SET rtx found in X needs a barrier. FLAGS and PRED are as in rtx_needs_barrier. COND is an rtx that holds the condition for this insn. */ - + static int -set_src_needs_barrier (x, flags, pred, cond) - rtx x; - struct reg_flags flags; - int pred; - rtx cond; +set_src_needs_barrier (rtx x, struct reg_flags flags, int pred, rtx cond) { int need_barrier = 0; rtx dst; @@ -4790,15 +5151,12 @@ set_src_needs_barrier (x, flags, pred, cond) return need_barrier; } -/* Handle an access to rtx X of type FLAGS using predicate register PRED. - Return 1 is this access creates a dependency with an earlier instruction - in the same group. */ +/* Handle an access to rtx X of type FLAGS using predicate register + PRED. Return 1 if this access creates a dependency with an earlier + instruction in the same group. */ static int -rtx_needs_barrier (x, flags, pred) - rtx x; - struct reg_flags flags; - int pred; +rtx_needs_barrier (rtx x, struct reg_flags flags, int pred) { int i, j; int is_complemented = 0; @@ -4814,7 +5172,7 @@ rtx_needs_barrier (x, flags, pred) switch (GET_CODE (x)) { - case SET: + case SET: update_set_flags (x, &new_flags, &pred, &cond); need_barrier = set_src_needs_barrier (x, new_flags, pred, cond); if (GET_CODE (SET_SRC (x)) != CALL) @@ -4984,7 +5342,7 @@ rtx_needs_barrier (x, flags, pred) case NEG: case NOT: case SIGN_EXTEND: case ZERO_EXTEND: case TRUNCATE: case FLOAT_EXTEND: case FLOAT_TRUNCATE: case FLOAT: case FIX: case UNSIGNED_FLOAT: case UNSIGNED_FIX: case ABS: - case SQRT: case FFS: + case SQRT: case FFS: case POPCOUNT: need_barrier = rtx_needs_barrier (XEXP (x, 0), flags, pred); break; @@ -5017,14 +5375,13 @@ rtx_needs_barrier (x, flags, pred) new_flags, pred); break; } - + case UNSPEC_FR_SPILL: case UNSPEC_FR_RESTORE: - case UNSPEC_POPCNT: - need_barrier = rtx_needs_barrier (XVECEXP (x, 0, 0), flags, pred); - break; - + case UNSPEC_GETF_EXP: + case UNSPEC_SETF_EXP: case UNSPEC_ADDP4: + case UNSPEC_FR_SQRT_RECIP_APPROX: need_barrier = rtx_needs_barrier (XVECEXP (x, 0, 0), flags, pred); break; @@ -5122,7 +5479,7 @@ rtx_needs_barrier (x, flags, pred) sequence of insns. */ static void -init_insn_group_barriers () +init_insn_group_barriers (void) { memset (rws_sum, 0, sizeof (rws_sum)); first_instruction = 1; @@ -5133,8 +5490,7 @@ init_insn_group_barriers () Return nonzero if so. 
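
Stripped of predication and the AND/OR special cases, rws_access_regno is a per-register state machine: a second write in the same instruction group, or a read after a write, requires a stop bit. A miniature version of that core:

#include <stdio.h>

#define NUM_REGS 16

static int write_count[NUM_REGS];

static int access_reg (int regno, int is_write)
{
  int need_barrier = 0;
  if (is_write)
    {
      if (write_count[regno] > 0)
        need_barrier = 1;              /* write after write */
      write_count[regno]++;
    }
  else if (write_count[regno] > 0)
    need_barrier = 1;                  /* read after write */
  return need_barrier;
}

int main (void)
{
  printf ("%d", access_reg (3, 1));    /* first write: 0 */
  printf ("%d", access_reg (3, 0));    /* read after write: 1 */
  printf ("%d\n", access_reg (3, 1));  /* write after write: 1 */
  return 0;
}
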
*/ static int -group_barrier_needed_p (insn) - rtx insn; +group_barrier_needed_p (rtx insn) { rtx pat; int need_barrier = 0; @@ -5230,7 +5586,10 @@ group_barrier_needed_p (insn) abort (); } - if (first_instruction) + if (first_instruction && INSN_P (insn) + && ia64_safe_itanium_class (insn) != ITANIUM_CLASS_IGNORE + && GET_CODE (PATTERN (insn)) != USE + && GET_CODE (PATTERN (insn)) != CLOBBER) { need_barrier = 0; first_instruction = 0; @@ -5242,8 +5601,7 @@ group_barrier_needed_p (insn) /* Like group_barrier_needed_p, but do not clobber the current state. */ static int -safe_group_barrier_needed_p (insn) - rtx insn; +safe_group_barrier_needed_p (rtx insn) { struct reg_write_state rws_saved[NUM_REGS]; int saved_first_instruction; @@ -5260,17 +5618,15 @@ safe_group_barrier_needed_p (insn) return t; } -/* INSNS is an chain of instructions. Scan the chain, and insert stop bits - as necessary to eliminate dependendencies. This function assumes that - a final instruction scheduling pass has been run which has already - inserted most of the necessary stop bits. This function only inserts - new ones at basic block boundaries, since these are invisible to the - scheduler. */ +/* Scan the current function and insert stop bits as necessary to + eliminate dependencies. This function assumes that a final + instruction scheduling pass has been run which has already + inserted most of the necessary stop bits. This function only + inserts new ones at basic block boundaries, since these are + invisible to the scheduler. */ static void -emit_insn_group_barriers (dump, insns) - FILE *dump; - rtx insns; +emit_insn_group_barriers (FILE *dump) { rtx insn; rtx last_label = 0; @@ -5278,7 +5634,7 @@ emit_insn_group_barriers (dump, insns) init_insn_group_barriers (); - for (insn = insns; insn; insn = NEXT_INSN (insn)) + for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) { if (GET_CODE (insn) == CODE_LABEL) { @@ -5326,15 +5682,13 @@ emit_insn_group_barriers (dump, insns) This function has to emit all necessary group barriers. */ static void -emit_all_insn_group_barriers (dump, insns) - FILE *dump ATTRIBUTE_UNUSED; - rtx insns; +emit_all_insn_group_barriers (FILE *dump ATTRIBUTE_UNUSED) { rtx insn; init_insn_group_barriers (); - for (insn = insns; insn; insn = NEXT_INSN (insn)) + for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) { if (GET_CODE (insn) == BARRIER) { @@ -5363,10 +5717,11 @@ emit_all_insn_group_barriers (dump, insns) } } } + -static int errata_find_address_regs PARAMS ((rtx *, void *)); -static void errata_emit_nops PARAMS ((rtx)); -static void fixup_errata PARAMS ((void)); +static int errata_find_address_regs (rtx *, void *); +static void errata_emit_nops (rtx); +static void fixup_errata (void); /* This structure is used to track some details about the previous insns groups so we can determine if it may be necessary to insert NOPs to @@ -5384,9 +5739,7 @@ static int group_idx; conditionally set in the previous group is used as an address register. It ensures that for_each_rtx returns 1 in that case. */ static int -errata_find_address_regs (xp, data) - rtx *xp; - void *data ATTRIBUTE_UNUSED; +errata_find_address_regs (rtx *xp, void *data ATTRIBUTE_UNUSED) { rtx x = *xp; if (GET_CODE (x) != MEM) @@ -5409,8 +5762,7 @@ errata_find_address_regs (xp, data) last_group and emits additional NOPs if necessary to work around an Itanium A/B step erratum. 
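
emit_insn_group_barriers, updated above, re-initializes its write state at every label, because basic-block boundaries are invisible to the scheduler, and records a stop bit wherever the state machine reports a conflict. The skeleton of that walk over a toy insn stream, with the conflict test standing in for group_barrier_needed_p:

#include <stdio.h>
#include <string.h>

enum kind { LABEL, WRITE, READ };
struct insn { enum kind kind; int reg; };

int main (void)
{
  static const struct insn stream[] =
    { {WRITE, 3}, {READ, 3}, {LABEL, 0}, {WRITE, 3} };
  int written[8];
  size_t i;

  memset (written, 0, sizeof written);
  for (i = 0; i < sizeof stream / sizeof stream[0]; i++)
    {
      if (stream[i].kind == LABEL)
        {
          memset (written, 0, sizeof written);  /* block boundary: reset */
          continue;
        }
      if (written[stream[i].reg])
        {
          printf ("stop bit before insn %lu\n", (unsigned long) i);
          memset (written, 0, sizeof written);  /* barrier opens a group */
        }
      if (stream[i].kind == WRITE)
        written[stream[i].reg] = 1;
    }
  return 0;
}
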
*/ static void -errata_emit_nops (insn) - rtx insn; +errata_emit_nops (rtx insn) { struct group *this_group = last_group + group_idx; struct group *prev_group = last_group + (group_idx ^ 1); @@ -5483,7 +5835,7 @@ errata_emit_nops (insn) /* Emit extra nops if they are required to work around hardware errata. */ static void -fixup_errata () +fixup_errata (void) { rtx insn; @@ -5508,153 +5860,106 @@ fixup_errata () } } -/* Instruction scheduling support. */ -/* Describe one bundle. */ -struct bundle -{ - /* Zero if there's no possibility of a stop in this bundle other than - at the end, otherwise the position of the optional stop bit. */ - int possible_stop; - /* The types of the three slots. */ - enum attr_type t[3]; - /* The pseudo op to be emitted into the assembler output. */ - const char *name; -}; +/* Instruction scheduling support. */ #define NR_BUNDLES 10 -/* A list of all available bundles. */ +/* A list of names of all available bundles. */ -static const struct bundle bundle[NR_BUNDLES] = +static const char *bundle_name [NR_BUNDLES] = { - { 2, { TYPE_M, TYPE_I, TYPE_I }, ".mii" }, - { 1, { TYPE_M, TYPE_M, TYPE_I }, ".mmi" }, - { 0, { TYPE_M, TYPE_F, TYPE_I }, ".mfi" }, - { 0, { TYPE_M, TYPE_M, TYPE_F }, ".mmf" }, + ".mii", + ".mmi", + ".mfi", + ".mmf", #if NR_BUNDLES == 10 - { 0, { TYPE_B, TYPE_B, TYPE_B }, ".bbb" }, - { 0, { TYPE_M, TYPE_B, TYPE_B }, ".mbb" }, + ".bbb", + ".mbb", #endif - { 0, { TYPE_M, TYPE_I, TYPE_B }, ".mib" }, - { 0, { TYPE_M, TYPE_M, TYPE_B }, ".mmb" }, - { 0, { TYPE_M, TYPE_F, TYPE_B }, ".mfb" }, - /* .mfi needs to occur earlier than .mlx, so that we only generate it if - it matches an L type insn. Otherwise we'll try to generate L type - nops. */ - { 0, { TYPE_M, TYPE_L, TYPE_X }, ".mlx" } + ".mib", + ".mmb", + ".mfb", + ".mlx" }; -/* Describe a packet of instructions. Packets consist of two bundles that - are visible to the hardware in one scheduling window. */ +/* Nonzero if we should insert stop bits into the schedule. */ -struct ia64_packet -{ - const struct bundle *t1, *t2; - /* Precomputed value of the first split issue in this packet if a cycle - starts at its beginning. */ - int first_split; - /* For convenience, the insn types are replicated here so we don't have - to go through T1 and T2 all the time. */ - enum attr_type t[6]; -}; +int ia64_final_schedule = 0; -/* An array containing all possible packets. */ -#define NR_PACKETS (NR_BUNDLES * NR_BUNDLES) -static struct ia64_packet packets[NR_PACKETS]; +/* Codes of the corresponding quieryied units: */ -/* Map attr_type to a string with the name. */ +static int _0mii_, _0mmi_, _0mfi_, _0mmf_; +static int _0bbb_, _0mbb_, _0mib_, _0mmb_, _0mfb_, _0mlx_; -static const char *const type_names[] = -{ - "UNKNOWN", "A", "I", "M", "F", "B", "L", "X", "S" -}; +static int _1mii_, _1mmi_, _1mfi_, _1mmf_; +static int _1bbb_, _1mbb_, _1mib_, _1mmb_, _1mfb_, _1mlx_; -/* Nonzero if we should insert stop bits into the schedule. 
*/ -int ia64_final_schedule = 0; +static int pos_1, pos_2, pos_3, pos_4, pos_5, pos_6; -static int itanium_split_issue PARAMS ((const struct ia64_packet *, int)); -static rtx ia64_single_set PARAMS ((rtx)); -static int insn_matches_slot PARAMS ((const struct ia64_packet *, enum attr_type, int, rtx)); -static void ia64_emit_insn_before PARAMS ((rtx, rtx)); -static void maybe_rotate PARAMS ((FILE *)); -static void finish_last_head PARAMS ((FILE *, int)); -static void rotate_one_bundle PARAMS ((FILE *)); -static void rotate_two_bundles PARAMS ((FILE *)); -static void nop_cycles_until PARAMS ((int, FILE *)); -static void cycle_end_fill_slots PARAMS ((FILE *)); -static int packet_matches_p PARAMS ((const struct ia64_packet *, int, int *)); -static int get_split PARAMS ((const struct ia64_packet *, int)); -static int find_best_insn PARAMS ((rtx *, enum attr_type *, int, - const struct ia64_packet *, int)); -static void find_best_packet PARAMS ((int *, const struct ia64_packet **, - rtx *, enum attr_type *, int)); -static int itanium_reorder PARAMS ((FILE *, rtx *, rtx *, int)); -static void dump_current_packet PARAMS ((FILE *)); -static void schedule_stop PARAMS ((FILE *)); -static rtx gen_nop_type PARAMS ((enum attr_type)); -static void ia64_emit_nops PARAMS ((void)); +/* The following variable value is an insn group barrier. */ -/* Map a bundle number to its pseudo-op. */ +static rtx dfa_stop_insn; -const char * -get_bundle_name (b) - int b; -{ - return bundle[b].name; -} +/* The following variable value is the last issued insn. */ -/* Compute the slot which will cause a split issue in packet P if the - current cycle begins at slot BEGIN. */ +static rtx last_scheduled_insn; -static int -itanium_split_issue (p, begin) - const struct ia64_packet *p; - int begin; -{ - int type_count[TYPE_S]; - int i; - int split = 6; +/* The following variable value is size of the DFA state. */ - if (begin < 3) - { - /* Always split before and after MMF. */ - if (p->t[0] == TYPE_M && p->t[1] == TYPE_M && p->t[2] == TYPE_F) - return 3; - if (p->t[3] == TYPE_M && p->t[4] == TYPE_M && p->t[5] == TYPE_F) - return 3; - /* Always split after MBB and BBB. */ - if (p->t[1] == TYPE_B) - return 3; - /* Split after first bundle in MIB BBB combination. */ - if (p->t[2] == TYPE_B && p->t[3] == TYPE_B) - return 3; - } +static size_t dfa_state_size; - memset (type_count, 0, sizeof type_count); - for (i = begin; i < split; i++) - { - enum attr_type t0 = p->t[i]; - /* An MLX bundle reserves the same units as an MFI bundle. */ - enum attr_type t = (t0 == TYPE_L ? TYPE_F - : t0 == TYPE_X ? TYPE_I - : t0); +/* The following variable value is pointer to a DFA state used as + temporary variable. */ - /* Itanium can execute up to 3 branches, 2 floating point, 2 memory, and - 2 integer per cycle. */ - int max = (t == TYPE_B ? 3 : 2); - if (type_count[t] == max) - return i; +static state_t temp_dfa_state = NULL; - type_count[t]++; - } - return split; +/* The following variable value is DFA state after issuing the last + insn. */ + +static state_t prev_cycle_state = NULL; + +/* The following array element values are TRUE if the corresponding + insn requires to add stop bits before it. */ + +static char *stops_p; + +/* The following variable is used to set up the mentioned above array. */ + +static int stop_before_p = 0; + +/* The following variable value is length of the arrays `clocks' and + `add_cycles'. */ + +static int clocks_length; + +/* The following array element values are cycles on which the + corresponding insn will be issued. 
The array is used only for + Itanium1. */ + +static int *clocks; + +/* The following array element values are numbers of cycles should be + added to improve insn scheduling for MM_insns for Itanium1. */ + +static int *add_cycles; + +static rtx ia64_single_set (rtx); +static void ia64_emit_insn_before (rtx, rtx); + +/* Map a bundle number to its pseudo-op. */ + +const char * +get_bundle_name (int b) +{ + return bundle_name[b]; } + /* Return the maximum number of instructions a cpu can issue. */ static int -ia64_issue_rate () +ia64_issue_rate (void) { return 6; } @@ -5662,8 +5967,7 @@ ia64_issue_rate () /* Helper function - like single_set, but look inside COND_EXEC. */ static rtx -ia64_single_set (insn) - rtx insn; +ia64_single_set (rtx insn) { rtx x = PATTERN (insn), ret; if (GET_CODE (x) == COND_EXEC) @@ -5693,1273 +5997,1438 @@ ia64_single_set (insn) a dependency LINK or INSN on DEP_INSN. COST is the current cost. */ static int -ia64_adjust_cost (insn, link, dep_insn, cost) - rtx insn, link, dep_insn; - int cost; +ia64_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) { - enum attr_type dep_type; enum attr_itanium_class dep_class; enum attr_itanium_class insn_class; - rtx dep_set, set, src, addr; - - if (GET_CODE (PATTERN (insn)) == CLOBBER - || GET_CODE (PATTERN (insn)) == USE - || GET_CODE (PATTERN (dep_insn)) == CLOBBER - || GET_CODE (PATTERN (dep_insn)) == USE - /* @@@ Not accurate for indirect calls. */ - || GET_CODE (insn) == CALL_INSN - || ia64_safe_type (insn) == TYPE_S) - return 0; - if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT - || REG_NOTE_KIND (link) == REG_DEP_ANTI) - return 0; + if (REG_NOTE_KIND (link) != REG_DEP_OUTPUT) + return cost; - dep_type = ia64_safe_type (dep_insn); - dep_class = ia64_safe_itanium_class (dep_insn); insn_class = ia64_safe_itanium_class (insn); - - /* Compares that feed a conditional branch can execute in the same - cycle. */ - dep_set = ia64_single_set (dep_insn); - set = ia64_single_set (insn); - - if (dep_type != TYPE_F - && dep_set - && GET_CODE (SET_DEST (dep_set)) == REG - && PR_REG (REGNO (SET_DEST (dep_set))) - && GET_CODE (insn) == JUMP_INSN) + dep_class = ia64_safe_itanium_class (dep_insn); + if (dep_class == ITANIUM_CLASS_ST || dep_class == ITANIUM_CLASS_STF + || insn_class == ITANIUM_CLASS_ST || insn_class == ITANIUM_CLASS_STF) return 0; - if (dep_set && GET_CODE (SET_DEST (dep_set)) == MEM) - { - /* ??? Can't find any information in the documenation about whether - a sequence - st [rx] = ra - ld rb = [ry] - splits issue. Assume it doesn't. */ - return 0; - } - - src = set ? SET_SRC (set) : 0; - addr = 0; - if (set) - { - if (GET_CODE (SET_DEST (set)) == MEM) - addr = XEXP (SET_DEST (set), 0); - else if (GET_CODE (SET_DEST (set)) == SUBREG - && GET_CODE (SUBREG_REG (SET_DEST (set))) == MEM) - addr = XEXP (SUBREG_REG (SET_DEST (set)), 0); - else - { - addr = src; - if (GET_CODE (addr) == UNSPEC && XVECLEN (addr, 0) > 0) - addr = XVECEXP (addr, 0, 0); - while (GET_CODE (addr) == SUBREG || GET_CODE (addr) == ZERO_EXTEND) - addr = XEXP (addr, 0); - - /* Note that LO_SUM is used for GOT loads. */ - if (GET_CODE (addr) == MEM || GET_CODE (addr) == LO_SUM) - addr = XEXP (addr, 0); - else - addr = 0; - } - } - - if (addr && GET_CODE (addr) == POST_MODIFY) - addr = XEXP (addr, 0); - - set = ia64_single_set (dep_insn); - - if ((dep_class == ITANIUM_CLASS_IALU - || dep_class == ITANIUM_CLASS_ILOG - || dep_class == ITANIUM_CLASS_LD) - && (insn_class == ITANIUM_CLASS_LD - || insn_class == ITANIUM_CLASS_ST)) - { - if (! addr || ! 
set) - abort (); - /* This isn't completely correct - an IALU that feeds an address has - a latency of 1 cycle if it's issued in an M slot, but 2 cycles - otherwise. Unfortunately there's no good way to describe this. */ - if (reg_overlap_mentioned_p (SET_DEST (set), addr)) - return cost + 1; - } - - if ((dep_class == ITANIUM_CLASS_IALU - || dep_class == ITANIUM_CLASS_ILOG - || dep_class == ITANIUM_CLASS_LD) - && (insn_class == ITANIUM_CLASS_MMMUL - || insn_class == ITANIUM_CLASS_MMSHF - || insn_class == ITANIUM_CLASS_MMSHFI)) - return 3; - - if (dep_class == ITANIUM_CLASS_FMAC - && (insn_class == ITANIUM_CLASS_FMISC - || insn_class == ITANIUM_CLASS_FCVTFX - || insn_class == ITANIUM_CLASS_XMPY)) - return 7; - - if ((dep_class == ITANIUM_CLASS_FMAC - || dep_class == ITANIUM_CLASS_FMISC - || dep_class == ITANIUM_CLASS_FCVTFX - || dep_class == ITANIUM_CLASS_XMPY) - && insn_class == ITANIUM_CLASS_STF) - return 8; - - /* Intel docs say only LD, ST, IALU, ILOG, ISHF consumers have latency 4, - but HP engineers say any non-MM operation. */ - if ((dep_class == ITANIUM_CLASS_MMMUL - || dep_class == ITANIUM_CLASS_MMSHF - || dep_class == ITANIUM_CLASS_MMSHFI) - && insn_class != ITANIUM_CLASS_MMMUL - && insn_class != ITANIUM_CLASS_MMSHF - && insn_class != ITANIUM_CLASS_MMSHFI) - return 4; - return cost; } -/* Describe the current state of the Itanium pipeline. */ -static struct -{ - /* The first slot that is used in the current cycle. */ - int first_slot; - /* The next slot to fill. */ - int cur; - /* The packet we have selected for the current issue window. */ - const struct ia64_packet *packet; - /* The position of the split issue that occurs due to issue width - limitations (6 if there's no split issue). */ - int split; - /* Record data about the insns scheduled so far in the same issue - window. The elements up to but not including FIRST_SLOT belong - to the previous cycle, the ones starting with FIRST_SLOT belong - to the current cycle. */ - enum attr_type types[6]; - rtx insns[6]; - int stopbit[6]; - /* Nonzero if we decided to schedule a stop bit. */ - int last_was_stop; -} sched_data; - -/* Temporary arrays; they have enough elements to hold all insns that - can be ready at the same time while scheduling of the current block. - SCHED_READY can hold ready insns, SCHED_TYPES their types. */ -static rtx *sched_ready; -static enum attr_type *sched_types; - -/* Determine whether an insn INSN of type ITYPE can fit into slot SLOT - of packet P. */ - -static int -insn_matches_slot (p, itype, slot, insn) - const struct ia64_packet *p; - enum attr_type itype; - int slot; - rtx insn; -{ - enum attr_itanium_requires_unit0 u0; - enum attr_type stype = p->t[slot]; - - if (insn) - { - u0 = ia64_safe_itanium_requires_unit0 (insn); - if (u0 == ITANIUM_REQUIRES_UNIT0_YES) - { - int i; - for (i = sched_data.first_slot; i < slot; i++) - if (p->t[i] == stype - || (stype == TYPE_F && p->t[i] == TYPE_L) - || (stype == TYPE_I && p->t[i] == TYPE_X)) - return 0; - } - if (GET_CODE (insn) == CALL_INSN) - { - /* Reject calls in multiway branch packets. We want to limit - the number of multiway branches we generate (since the branch - predictor is limited), and this seems to work fairly well. - (If we didn't do this, we'd have to add another test here to - force calls into the third slot of the bundle.) 
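
The replacement ia64_adjust_cost visible above drops all of these removed special cases: it only touches output dependencies, and it makes them free whenever either end is a store-class insn. That policy in isolation, with symbolic stand-ins for the dependency kinds and itanium_class values:

#include <stdio.h>

enum dep_kind { DEP_TRUE, DEP_OUTPUT, DEP_ANTI };
enum insn_class { CLASS_ALU, CLASS_ST, CLASS_STF };

static int adjust_cost (enum dep_kind kind, enum insn_class producer,
                        enum insn_class consumer, int cost)
{
  if (kind != DEP_OUTPUT)
    return cost;                      /* leave true/anti deps alone */
  if (producer == CLASS_ST || producer == CLASS_STF
      || consumer == CLASS_ST || consumer == CLASS_STF)
    return 0;                         /* store-related output dep is free */
  return cost;
}

int main (void)
{
  printf ("%d\n", adjust_cost (DEP_OUTPUT, CLASS_ST, CLASS_ALU, 3)); /* 0 */
  return 0;
}
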
*/ - if (slot < 3) - { - if (p->t[1] == TYPE_B) - return 0; - } - else - { - if (p->t[4] == TYPE_B) - return 0; - } - } - } - - if (itype == stype) - return 1; - if (itype == TYPE_A) - return stype == TYPE_M || stype == TYPE_I; - return 0; -} - /* Like emit_insn_before, but skip cycle_display notes. ??? When cycle display notes are implemented, update this. */ static void -ia64_emit_insn_before (insn, before) - rtx insn, before; +ia64_emit_insn_before (rtx insn, rtx before) { emit_insn_before (insn, before); } -/* When rotating a bundle out of the issue window, insert a bundle selector - insn in front of it. DUMP is the scheduling dump file or NULL. START - is either 0 or 3, depending on whether we want to emit a bundle selector - for the first bundle or the second bundle in the current issue window. - - The selector insns are emitted this late because the selected packet can - be changed until parts of it get rotated out. */ +/* The following function marks insns who produce addresses for load + and store insns. Such insns will be placed into M slots because it + decrease latency time for Itanium1 (see function + `ia64_produce_address_p' and the DFA descriptions). */ static void -finish_last_head (dump, start) - FILE *dump; - int start; +ia64_dependencies_evaluation_hook (rtx head, rtx tail) { - const struct ia64_packet *p = sched_data.packet; - const struct bundle *b = start == 0 ? p->t1 : p->t2; - int bundle_type = b - bundle; - rtx insn; - int i; - - if (! ia64_final_schedule) - return; - - for (i = start; sched_data.insns[i] == 0; i++) - if (i == start + 3) - abort (); - insn = sched_data.insns[i]; - - if (dump) - fprintf (dump, "// Emitting template before %d: %s\n", - INSN_UID (insn), b->name); + rtx insn, link, next, next_tail; - ia64_emit_insn_before (gen_bundle_selector (GEN_INT (bundle_type)), insn); + next_tail = NEXT_INSN (tail); + for (insn = head; insn != next_tail; insn = NEXT_INSN (insn)) + if (INSN_P (insn)) + insn->call = 0; + for (insn = head; insn != next_tail; insn = NEXT_INSN (insn)) + if (INSN_P (insn) + && ia64_safe_itanium_class (insn) == ITANIUM_CLASS_IALU) + { + for (link = INSN_DEPEND (insn); link != 0; link = XEXP (link, 1)) + { + next = XEXP (link, 0); + if ((ia64_safe_itanium_class (next) == ITANIUM_CLASS_ST + || ia64_safe_itanium_class (next) == ITANIUM_CLASS_STF) + && ia64_st_address_bypass_p (insn, next)) + break; + else if ((ia64_safe_itanium_class (next) == ITANIUM_CLASS_LD + || ia64_safe_itanium_class (next) + == ITANIUM_CLASS_FLD) + && ia64_ld_address_bypass_p (insn, next)) + break; + } + insn->call = link != 0; + } } -/* We can't schedule more insns this cycle. Fix up the scheduling state - and advance FIRST_SLOT and CUR. - We have to distribute the insns that are currently found between - FIRST_SLOT and CUR into the slots of the packet we have selected. So - far, they are stored successively in the fields starting at FIRST_SLOT; - now they must be moved to the correct slots. - DUMP is the current scheduling dump file, or NULL. */ +/* We're beginning a new block. Initialize data structures as necessary. 
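+ After reload this amounts to resetting the insn group barrier state
+ and the last scheduled insn; with ENABLE_CHECKING it also verifies
+ that no SCHED_GROUP_P insns survived into this scheduling pass.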
*/ static void -cycle_end_fill_slots (dump) - FILE *dump; +ia64_sched_init (FILE *dump ATTRIBUTE_UNUSED, + int sched_verbose ATTRIBUTE_UNUSED, + int max_ready ATTRIBUTE_UNUSED) { - const struct ia64_packet *packet = sched_data.packet; - int slot, i; - enum attr_type tmp_types[6]; - rtx tmp_insns[6]; - - memcpy (tmp_types, sched_data.types, 6 * sizeof (enum attr_type)); - memcpy (tmp_insns, sched_data.insns, 6 * sizeof (rtx)); +#ifdef ENABLE_CHECKING + rtx insn; - for (i = slot = sched_data.first_slot; i < sched_data.cur; i++) - { - enum attr_type t = tmp_types[i]; - if (t != ia64_safe_type (tmp_insns[i])) + if (reload_completed) + for (insn = NEXT_INSN (current_sched_info->prev_head); + insn != current_sched_info->next_tail; + insn = NEXT_INSN (insn)) + if (SCHED_GROUP_P (insn)) abort (); - while (! insn_matches_slot (packet, t, slot, tmp_insns[i])) - { - if (slot > sched_data.split) - abort (); - if (dump) - fprintf (dump, "// Packet needs %s, have %s\n", - type_names[packet->t[slot]], type_names[t]); - sched_data.types[slot] = packet->t[slot]; - sched_data.insns[slot] = 0; - sched_data.stopbit[slot] = 0; - - /* ??? TYPE_L instructions always fill up two slots, but we don't - support TYPE_L nops. */ - if (packet->t[slot] == TYPE_L) - abort (); - - slot++; - } - - /* Do _not_ use T here. If T == TYPE_A, then we'd risk changing the - actual slot type later. */ - sched_data.types[slot] = packet->t[slot]; - sched_data.insns[slot] = tmp_insns[i]; - sched_data.stopbit[slot] = 0; - slot++; - - /* TYPE_L instructions always fill up two slots. */ - if (t == TYPE_L) - { - sched_data.types[slot] = packet->t[slot]; - sched_data.insns[slot] = 0; - sched_data.stopbit[slot] = 0; - slot++; - } - } - - /* This isn't right - there's no need to pad out until the forced split; - the CPU will automatically split if an insn isn't ready. */ -#if 0 - while (slot < sched_data.split) - { - sched_data.types[slot] = packet->t[slot]; - sched_data.insns[slot] = 0; - sched_data.stopbit[slot] = 0; - slot++; - } #endif - - sched_data.first_slot = sched_data.cur = slot; + last_scheduled_insn = NULL_RTX; + init_insn_group_barriers (); } -/* Bundle rotations, as described in the Itanium optimization manual. - We can rotate either one or both bundles out of the issue window. - DUMP is the current scheduling dump file, or NULL. */ - -static void -rotate_one_bundle (dump) - FILE *dump; -{ - if (dump) - fprintf (dump, "// Rotating one bundle.\n"); - - finish_last_head (dump, 0); - if (sched_data.cur > 3) - { - sched_data.cur -= 3; - sched_data.first_slot -= 3; - memmove (sched_data.types, - sched_data.types + 3, - sched_data.cur * sizeof *sched_data.types); - memmove (sched_data.stopbit, - sched_data.stopbit + 3, - sched_data.cur * sizeof *sched_data.stopbit); - memmove (sched_data.insns, - sched_data.insns + 3, - sched_data.cur * sizeof *sched_data.insns); - sched_data.packet - = &packets[(sched_data.packet->t2 - bundle) * NR_BUNDLES]; - } - else - { - sched_data.cur = 0; - sched_data.first_slot = 0; - } -} +/* We are about to being issuing insns for this clock cycle. + Override the default sort algorithm to better slot instructions. 
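+
+ The ready queue is an array whose last element is issued first.
+ As a rough sketch of the reordering done below (shown from lowest
+ to highest issue priority):
+
+   before:  [ insn1  asm1  insn2  asm2 ]
+   after:   [ asm1   asm2  insn1  insn2 ]
+
+ i.e. asm statements are kept at the low-priority end and, in the
+ final schedule, insns needing a stop bit are moved below those
+ that need none.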
*/ -static void -rotate_two_bundles (dump) - FILE *dump; +static int +ia64_dfa_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, + int *pn_ready, int clock_var ATTRIBUTE_UNUSED, + int reorder_type) { - if (dump) - fprintf (dump, "// Rotating two bundles.\n"); - - if (sched_data.cur == 0) - return; - - finish_last_head (dump, 0); - if (sched_data.cur > 3) - finish_last_head (dump, 3); - sched_data.cur = 0; - sched_data.first_slot = 0; -} - -/* We're beginning a new block. Initialize data structures as necessary. */ + int n_asms; + int n_ready = *pn_ready; + rtx *e_ready = ready + n_ready; + rtx *insnp; -static void -ia64_sched_init (dump, sched_verbose, max_ready) - FILE *dump ATTRIBUTE_UNUSED; - int sched_verbose ATTRIBUTE_UNUSED; - int max_ready; -{ - static int initialized = 0; + if (sched_verbose) + fprintf (dump, "// ia64_dfa_sched_reorder (type %d):\n", reorder_type); - if (! initialized) + if (reorder_type == 0) { - int b1, b2, i; - - initialized = 1; - - for (i = b1 = 0; b1 < NR_BUNDLES; b1++) - { - const struct bundle *t1 = bundle + b1; - for (b2 = 0; b2 < NR_BUNDLES; b2++, i++) - { - const struct bundle *t2 = bundle + b2; + /* First, move all USEs, CLOBBERs and other crud out of the way. */ + n_asms = 0; + for (insnp = ready; insnp < e_ready; insnp++) + if (insnp < e_ready) + { + rtx insn = *insnp; + enum attr_type t = ia64_safe_type (insn); + if (t == TYPE_UNKNOWN) + { + if (GET_CODE (PATTERN (insn)) == ASM_INPUT + || asm_noperands (PATTERN (insn)) >= 0) + { + rtx lowest = ready[n_asms]; + ready[n_asms] = insn; + *insnp = lowest; + n_asms++; + } + else + { + rtx highest = ready[n_ready - 1]; + ready[n_ready - 1] = insn; + *insnp = highest; + return 1; + } + } + } - packets[i].t1 = t1; - packets[i].t2 = t2; - } - } - for (i = 0; i < NR_PACKETS; i++) + if (n_asms < n_ready) { - int j; - for (j = 0; j < 3; j++) - packets[i].t[j] = packets[i].t1->t[j]; - for (j = 0; j < 3; j++) - packets[i].t[j + 3] = packets[i].t2->t[j]; - packets[i].first_split = itanium_split_issue (packets + i, 0); + /* Some normal insns to process. Skip the asms. */ + ready += n_asms; + n_ready -= n_asms; } - + else if (n_ready > 0) + return 1; } - init_insn_group_barriers (); - - memset (&sched_data, 0, sizeof sched_data); - sched_types = (enum attr_type *) xmalloc (max_ready - * sizeof (enum attr_type)); - sched_ready = (rtx *) xmalloc (max_ready * sizeof (rtx)); -} - -/* See if the packet P can match the insns we have already scheduled. Return - nonzero if so. In *PSLOT, we store the first slot that is available for - more instructions if we choose this packet. - SPLIT holds the last slot we can use, there's a split issue after it so - scheduling beyond it would cause us to use more than one cycle. */ + if (ia64_final_schedule) + { + int deleted = 0; + int nr_need_stop = 0; -static int -packet_matches_p (p, split, pslot) - const struct ia64_packet *p; - int split; - int *pslot; -{ - int filled = sched_data.cur; - int first = sched_data.first_slot; - int i, slot; - - /* First, check if the first of the two bundles must be a specific one (due - to stop bits). */ - if (first > 0 && sched_data.stopbit[0] && p->t1->possible_stop != 1) - return 0; - if (first > 1 && sched_data.stopbit[1] && p->t1->possible_stop != 2) - return 0; + for (insnp = ready; insnp < e_ready; insnp++) + if (safe_group_barrier_needed_p (*insnp)) + nr_need_stop++; - for (i = 0; i < first; i++) - if (! 
insn_matches_slot (p, sched_data.types[i], i, - sched_data.insns[i])) - return 0; - for (i = slot = first; i < filled; i++) - { - while (slot < split) - { - if (insn_matches_slot (p, sched_data.types[i], slot, - sched_data.insns[i])) - break; - slot++; - } - if (slot == split) + if (reorder_type == 1 && n_ready == nr_need_stop) return 0; - slot++; + if (reorder_type == 0) + return 1; + insnp = e_ready; + /* Move down everything that needs a stop bit, preserving + relative order. */ + while (insnp-- > ready + deleted) + while (insnp >= ready + deleted) + { + rtx insn = *insnp; + if (! safe_group_barrier_needed_p (insn)) + break; + memmove (ready + 1, ready, (insnp - ready) * sizeof (rtx)); + *ready = insn; + deleted++; + } + n_ready -= deleted; + ready += deleted; } - if (pslot) - *pslot = slot; return 1; } -/* A frontend for itanium_split_issue. For a packet P and a slot - number FIRST that describes the start of the current clock cycle, - return the slot number of the first split issue. This function - uses the cached number found in P if possible. */ +/* We are about to being issuing insns for this clock cycle. Override + the default sort algorithm to better slot instructions. */ static int -get_split (p, first) - const struct ia64_packet *p; - int first; +ia64_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready, + int clock_var) { - if (first == 0) - return p->first_split; - return itanium_split_issue (p, first); + return ia64_dfa_sched_reorder (dump, sched_verbose, ready, + pn_ready, clock_var, 0); } -/* Given N_READY insns in the array READY, whose types are found in the - corresponding array TYPES, return the insn that is best suited to be - scheduled in slot SLOT of packet P. */ +/* Like ia64_sched_reorder, but called after issuing each insn. + Override the default sort algorithm to better slot instructions. */ static int -find_best_insn (ready, types, n_ready, p, slot) - rtx *ready; - enum attr_type *types; - int n_ready; - const struct ia64_packet *p; - int slot; -{ - int best = -1; - int best_pri = 0; - while (n_ready-- > 0) - { - rtx insn = ready[n_ready]; - if (! insn) - continue; - if (best >= 0 && INSN_PRIORITY (ready[n_ready]) < best_pri) - break; - /* If we have equally good insns, one of which has a stricter - slot requirement, prefer the one with the stricter requirement. */ - if (best >= 0 && types[n_ready] == TYPE_A) - continue; - if (insn_matches_slot (p, types[n_ready], slot, insn)) - { - best = n_ready; - best_pri = INSN_PRIORITY (ready[best]); +ia64_sched_reorder2 (FILE *dump ATTRIBUTE_UNUSED, + int sched_verbose ATTRIBUTE_UNUSED, rtx *ready, + int *pn_ready, int clock_var) +{ + if (ia64_tune == PROCESSOR_ITANIUM && reload_completed && last_scheduled_insn) + clocks [INSN_UID (last_scheduled_insn)] = clock_var; + return ia64_dfa_sched_reorder (dump, sched_verbose, ready, pn_ready, + clock_var, 1); +} - /* If there's no way we could get a stricter requirement, stop - looking now. */ - if (types[n_ready] != TYPE_A - && ia64_safe_itanium_requires_unit0 (ready[n_ready])) - break; - break; - } +/* We are about to issue INSN. Return the number of insns left on the + ready queue that can be issued this cycle. 
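+ As a side effect the current DFA state is saved in prev_cycle_state
+ (so that ia64_dfa_new_cycle can replay a stop bit at a cycle
+ boundary) and, after reload, stops_p records whether a stop was
+ scheduled immediately before INSN.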
*/ + +static int +ia64_variable_issue (FILE *dump ATTRIBUTE_UNUSED, + int sched_verbose ATTRIBUTE_UNUSED, + rtx insn ATTRIBUTE_UNUSED, + int can_issue_more ATTRIBUTE_UNUSED) +{ + last_scheduled_insn = insn; + memcpy (prev_cycle_state, curr_state, dfa_state_size); + if (reload_completed) + { + if (group_barrier_needed_p (insn)) + abort (); + if (GET_CODE (insn) == CALL_INSN) + init_insn_group_barriers (); + stops_p [INSN_UID (insn)] = stop_before_p; + stop_before_p = 0; } - return best; + return 1; } -/* Select the best packet to use given the current scheduler state and the - current ready list. - READY is an array holding N_READY ready insns; TYPES is a corresponding - array that holds their types. Store the best packet in *PPACKET and the - number of insns that can be scheduled in the current cycle in *PBEST. */ +/* We are choosing insn from the ready queue. Return nonzero if INSN + can be chosen. */ -static void -find_best_packet (pbest, ppacket, ready, types, n_ready) - int *pbest; - const struct ia64_packet **ppacket; - rtx *ready; - enum attr_type *types; - int n_ready; -{ - int first = sched_data.first_slot; - int best = 0; - int lowest_end = 6; - const struct ia64_packet *best_packet = NULL; - int i; - - for (i = 0; i < NR_PACKETS; i++) - { - const struct ia64_packet *p = packets + i; - int slot; - int split = get_split (p, first); - int win = 0; - int first_slot, last_slot; - int b_nops = 0; +static int +ia64_first_cycle_multipass_dfa_lookahead_guard (rtx insn) +{ + if (insn == NULL_RTX || !INSN_P (insn)) + abort (); + return (!reload_completed + || !safe_group_barrier_needed_p (insn)); +} - if (! packet_matches_p (p, split, &first_slot)) - continue; +/* The following variable value is pseudo-insn used by the DFA insn + scheduler to change the DFA state when the simulated clock is + increased. */ - memcpy (sched_ready, ready, n_ready * sizeof (rtx)); +static rtx dfa_pre_cycle_insn; - win = 0; - last_slot = 6; - for (slot = first_slot; slot < split; slot++) - { - int insn_nr; +/* We are about to being issuing INSN. Return nonzero if we can not + issue it on given cycle CLOCK and return zero if we should not sort + the ready queue on the next clock start. */ - /* Disallow a degenerate case where the first bundle doesn't - contain anything but NOPs! */ - if (first_slot == 0 && win == 0 && slot == 3) - { - win = -1; - break; - } +static int +ia64_dfa_new_cycle (FILE *dump, int verbose, rtx insn, int last_clock, + int clock, int *sort_p) +{ + int setup_clocks_p = FALSE; - insn_nr = find_best_insn (sched_ready, types, n_ready, p, slot); - if (insn_nr >= 0) - { - sched_ready[insn_nr] = 0; - last_slot = slot; - win++; - } - else if (p->t[slot] == TYPE_B) - b_nops++; - } - /* We must disallow MBB/BBB packets if any of their B slots would be - filled with nops. */ - if (last_slot < 3) + if (insn == NULL_RTX || !INSN_P (insn)) + abort (); + if ((reload_completed && safe_group_barrier_needed_p (insn)) + || (last_scheduled_insn + && (GET_CODE (last_scheduled_insn) == CALL_INSN + || GET_CODE (PATTERN (last_scheduled_insn)) == ASM_INPUT + || asm_noperands (PATTERN (last_scheduled_insn)) >= 0))) + { + init_insn_group_barriers (); + if (verbose && dump) + fprintf (dump, "// Stop should be before %d%s\n", INSN_UID (insn), + last_clock == clock ? 
" + cycle advance" : ""); + stop_before_p = 1; + if (last_clock == clock) { - if (p->t[1] == TYPE_B && (b_nops || last_slot < 2)) - win = -1; + state_transition (curr_state, dfa_stop_insn); + if (TARGET_EARLY_STOP_BITS) + *sort_p = (last_scheduled_insn == NULL_RTX + || GET_CODE (last_scheduled_insn) != CALL_INSN); + else + *sort_p = 0; + return 1; } + else if (reload_completed) + setup_clocks_p = TRUE; + if (GET_CODE (PATTERN (last_scheduled_insn)) == ASM_INPUT + || asm_noperands (PATTERN (last_scheduled_insn)) >= 0) + state_reset (curr_state); else { - if (p->t[4] == TYPE_B && (b_nops || last_slot < 5)) - win = -1; + memcpy (curr_state, prev_cycle_state, dfa_state_size); + state_transition (curr_state, dfa_stop_insn); + state_transition (curr_state, dfa_pre_cycle_insn); + state_transition (curr_state, NULL); } + } + else if (reload_completed) + setup_clocks_p = TRUE; + if (setup_clocks_p && ia64_tune == PROCESSOR_ITANIUM + && GET_CODE (PATTERN (insn)) != ASM_INPUT + && asm_noperands (PATTERN (insn)) < 0) + { + enum attr_itanium_class c = ia64_safe_itanium_class (insn); - if (win > best - || (win == best && last_slot < lowest_end)) + if (c != ITANIUM_CLASS_MMMUL && c != ITANIUM_CLASS_MMSHF) { - best = win; - lowest_end = last_slot; - best_packet = p; + rtx link; + int d = -1; + + for (link = LOG_LINKS (insn); link; link = XEXP (link, 1)) + if (REG_NOTE_KIND (link) == 0) + { + enum attr_itanium_class dep_class; + rtx dep_insn = XEXP (link, 0); + + dep_class = ia64_safe_itanium_class (dep_insn); + if ((dep_class == ITANIUM_CLASS_MMMUL + || dep_class == ITANIUM_CLASS_MMSHF) + && last_clock - clocks [INSN_UID (dep_insn)] < 4 + && (d < 0 + || last_clock - clocks [INSN_UID (dep_insn)] < d)) + d = last_clock - clocks [INSN_UID (dep_insn)]; + } + if (d >= 0) + add_cycles [INSN_UID (insn)] = 3 - d; } } - *pbest = best; - *ppacket = best_packet; + return 0; } -/* Reorder the ready list so that the insns that can be issued in this cycle - are found in the correct order at the end of the list. - DUMP is the scheduling dump file, or NULL. READY points to the start, - E_READY to the end of the ready list. MAY_FAIL determines what should be - done if no insns can be scheduled in this cycle: if it is zero, we abort, - otherwise we return 0. - Return 1 if any insns can be scheduled in this cycle. */ + -static int -itanium_reorder (dump, ready, e_ready, may_fail) - FILE *dump; - rtx *ready; - rtx *e_ready; - int may_fail; -{ - const struct ia64_packet *best_packet; - int n_ready = e_ready - ready; - int first = sched_data.first_slot; - int i, best, best_split, filled; +/* The following page contains abstract data `bundle states' which are + used for bundling insns (inserting nops and template generation). */ + +/* The following describes state of insn bundling. */ + +struct bundle_state +{ + /* Unique bundle state number to identify them in the debugging + output */ + int unique_num; + rtx insn; /* corresponding insn, NULL for the 1st and the last state */ + /* number nops before and after the insn */ + short before_nops_num, after_nops_num; + int insn_num; /* insn number (0 - for initial state, 1 - for the 1st + insn */ + int cost; /* cost of the state in cycles */ + int accumulated_insns_num; /* number of all previous insns including + nops. 
L is considered as 2 insns */
+ int branch_deviation; /* deviation of previous branches from 3rd slots */
+ struct bundle_state *next; /* next state with the same insn_num */
+ struct bundle_state *originator; /* originator (previous insn state) */
+ /* All bundle states are in the following chain. */
+ struct bundle_state *allocated_states_chain;
+ /* The DFA State after issuing the insn and the nops. */
+ state_t dfa_state;
+};
- for (i = 0; i < n_ready; i++)
- sched_types[i] = ia64_safe_type (ready[i]);
+/* The following maps an insn number to the corresponding bundle state. */
- find_best_packet (&best, &best_packet, ready, sched_types, n_ready);
+static struct bundle_state **index_to_bundle_states;
- if (best == 0)
- {
- if (may_fail)
- return 0;
- abort ();
- }
+/* The unique number of the next bundle state. */
- if (dump)
- {
- fprintf (dump, "// Selected bundles: %s %s (%d insns)\n",
- best_packet->t1->name,
- best_packet->t2 ? best_packet->t2->name : NULL, best);
- }
+static int bundle_states_num;
- best_split = itanium_split_issue (best_packet, first);
- packet_matches_p (best_packet, best_split, &filled);
+/* All allocated bundle states are in the following chain. */
- for (i = filled; i < best_split; i++)
- {
- int insn_nr;
+static struct bundle_state *allocated_bundle_states_chain;
- insn_nr = find_best_insn (ready, sched_types, n_ready, best_packet, i);
- if (insn_nr >= 0)
- {
- rtx insn = ready[insn_nr];
- memmove (ready + insn_nr, ready + insn_nr + 1,
- (n_ready - insn_nr - 1) * sizeof (rtx));
- memmove (sched_types + insn_nr, sched_types + insn_nr + 1,
- (n_ready - insn_nr - 1) * sizeof (enum attr_type));
- ready[--n_ready] = insn;
- }
- }
+/* All allocated but not used bundle states are in the following
+ chain. */
- sched_data.packet = best_packet;
- sched_data.split = best_split;
- return 1;
-}
+static struct bundle_state *free_bundle_state_chain;
-/* Dump information about the current scheduling state to file DUMP. */
-static void
-dump_current_packet (dump)
- FILE *dump;
+/* The following function returns a free bundle state. */
+
+static struct bundle_state *
+get_free_bundle_state (void)
 {
- int i;
- fprintf (dump, "// %d slots filled:", sched_data.cur);
- for (i = 0; i < sched_data.first_slot; i++)
+ struct bundle_state *result;
+
+ if (free_bundle_state_chain != NULL)
 {
- rtx insn = sched_data.insns[i];
- fprintf (dump, " %s", type_names[sched_data.types[i]]);
- if (insn)
- fprintf (dump, "/%s", type_names[ia64_safe_type (insn)]);
- if (sched_data.stopbit[i])
- fprintf (dump, " ;;");
+ result = free_bundle_state_chain;
+ free_bundle_state_chain = result->next;
 }
- fprintf (dump, " :::");
- for (i = sched_data.first_slot; i < sched_data.cur; i++)
+ else
 {
- rtx insn = sched_data.insns[i];
- enum attr_type t = ia64_safe_type (insn);
- fprintf (dump, " (%d) %s", INSN_UID (insn), type_names[t]);
+ result = xmalloc (sizeof (struct bundle_state));
+ result->dfa_state = xmalloc (dfa_state_size);
+ result->allocated_states_chain = allocated_bundle_states_chain;
+ allocated_bundle_states_chain = result;
 }
- fprintf (dump, "\n");
+ result->unique_num = bundle_states_num++;
+ return result;
+
+}
-/* Schedule a stop bit. DUMP is the current scheduling dump file, or
- NULL. */
+/* The following function frees the given bundle state.
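+ The state is not deallocated: it is pushed onto
+ free_bundle_state_chain for reuse by get_free_bundle_state, and
+ its memory is only released by finish_bundle_states.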
*/ static void -schedule_stop (dump) - FILE *dump; +free_bundle_state (struct bundle_state *state) { - const struct ia64_packet *best = sched_data.packet; - int i; - int best_stop = 6; + state->next = free_bundle_state_chain; + free_bundle_state_chain = state; +} - if (dump) - fprintf (dump, "// Stop bit, cur = %d.\n", sched_data.cur); +/* Start work with abstract data `bundle states'. */ - if (sched_data.cur == 0) - { - if (dump) - fprintf (dump, "// At start of bundle, so nothing to do.\n"); +static void +initiate_bundle_states (void) +{ + bundle_states_num = 0; + free_bundle_state_chain = NULL; + allocated_bundle_states_chain = NULL; +} - rotate_two_bundles (NULL); - return; - } +/* Finish work with abstract data `bundle states'. */ - for (i = -1; i < NR_PACKETS; i++) +static void +finish_bundle_states (void) +{ + struct bundle_state *curr_state, *next_state; + + for (curr_state = allocated_bundle_states_chain; + curr_state != NULL; + curr_state = next_state) { - /* This is a slight hack to give the current packet the first chance. - This is done to avoid e.g. switching from MIB to MBB bundles. */ - const struct ia64_packet *p = (i >= 0 ? packets + i : sched_data.packet); - int split = get_split (p, sched_data.first_slot); - const struct bundle *compare; - int next, stoppos; + next_state = curr_state->allocated_states_chain; + free (curr_state->dfa_state); + free (curr_state); + } +} - if (! packet_matches_p (p, split, &next)) - continue; +/* Hash table of the bundle states. The key is dfa_state and insn_num + of the bundle states. */ - compare = next > 3 ? p->t2 : p->t1; +static htab_t bundle_state_table; - stoppos = 3; - if (compare->possible_stop) - stoppos = compare->possible_stop; - if (next > 3) - stoppos += 3; +/* The function returns hash of BUNDLE_STATE. */ - if (stoppos < next || stoppos >= best_stop) - { - if (compare->possible_stop == 0) - continue; - stoppos = (next > 3 ? 6 : 3); - } - if (stoppos < next || stoppos >= best_stop) - continue; +static unsigned +bundle_state_hash (const void *bundle_state) +{ + const struct bundle_state *state = (struct bundle_state *) bundle_state; + unsigned result, i; + + for (result = i = 0; i < dfa_state_size; i++) + result += (((unsigned char *) state->dfa_state) [i] + << ((i % CHAR_BIT) * 3 + CHAR_BIT)); + return result + state->insn_num; +} + +/* The function returns nonzero if the bundle state keys are equal. */ + +static int +bundle_state_eq_p (const void *bundle_state_1, const void *bundle_state_2) +{ + const struct bundle_state * state1 = (struct bundle_state *) bundle_state_1; + const struct bundle_state * state2 = (struct bundle_state *) bundle_state_2; + + return (state1->insn_num == state2->insn_num + && memcmp (state1->dfa_state, state2->dfa_state, + dfa_state_size) == 0); +} - if (dump) - fprintf (dump, "// switching from %s %s to %s %s (stop at %d)\n", - best->t1->name, best->t2->name, p->t1->name, p->t2->name, - stoppos); +/* The function inserts the BUNDLE_STATE into the hash table. The + function returns nonzero if the bundle has been inserted into the + table. The table contains the best bundle state with given key. 
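+
+ Schematically, a candidate state A replaces the tabled state B with
+ the same key exactly when A is lexicographically better; the
+ condition below is equivalent to this sketch (not literal code):
+
+   A->cost < B->cost
+   || (A->cost == B->cost
+       && (A->accumulated_insns_num < B->accumulated_insns_num
+           || (A->accumulated_insns_num == B->accumulated_insns_num
+               && A->branch_deviation < B->branch_deviation)))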
*/ - best_stop = stoppos; - best = p; +static int +insert_bundle_state (struct bundle_state *bundle_state) +{ + void **entry_ptr; + + entry_ptr = htab_find_slot (bundle_state_table, bundle_state, 1); + if (*entry_ptr == NULL) + { + bundle_state->next = index_to_bundle_states [bundle_state->insn_num]; + index_to_bundle_states [bundle_state->insn_num] = bundle_state; + *entry_ptr = (void *) bundle_state; + return TRUE; } + else if (bundle_state->cost < ((struct bundle_state *) *entry_ptr)->cost + || (bundle_state->cost == ((struct bundle_state *) *entry_ptr)->cost + && (((struct bundle_state *)*entry_ptr)->accumulated_insns_num + > bundle_state->accumulated_insns_num + || (((struct bundle_state *) + *entry_ptr)->accumulated_insns_num + == bundle_state->accumulated_insns_num + && ((struct bundle_state *) + *entry_ptr)->branch_deviation + > bundle_state->branch_deviation)))) - sched_data.packet = best; - cycle_end_fill_slots (dump); - while (sched_data.cur < best_stop) { - sched_data.types[sched_data.cur] = best->t[sched_data.cur]; - sched_data.insns[sched_data.cur] = 0; - sched_data.stopbit[sched_data.cur] = 0; - sched_data.cur++; + struct bundle_state temp; + + temp = *(struct bundle_state *) *entry_ptr; + *(struct bundle_state *) *entry_ptr = *bundle_state; + ((struct bundle_state *) *entry_ptr)->next = temp.next; + *bundle_state = temp; } - sched_data.stopbit[sched_data.cur - 1] = 1; - sched_data.first_slot = best_stop; + return FALSE; +} - if (dump) - dump_current_packet (dump); +/* Start work with the hash table. */ + +static void +initiate_bundle_state_table (void) +{ + bundle_state_table = htab_create (50, bundle_state_hash, bundle_state_eq_p, + (htab_del) 0); } -/* If necessary, perform one or two rotations on the scheduling state. - This should only be called if we are starting a new cycle. */ +/* Finish work with the hash table. */ static void -maybe_rotate (dump) - FILE *dump; +finish_bundle_state_table (void) { - cycle_end_fill_slots (dump); - if (sched_data.cur == 6) - rotate_two_bundles (dump); - else if (sched_data.cur >= 3) - rotate_one_bundle (dump); - sched_data.first_slot = sched_data.cur; + htab_delete (bundle_state_table); } -/* The clock cycle when ia64_sched_reorder was last called. */ -static int prev_cycle; + -/* The first insn scheduled in the previous cycle. This is the saved - value of sched_data.first_slot. */ -static int prev_first; +/* The following variable is a insn `nop' used to check bundle states + with different number of inserted nops. */ -/* Emit NOPs to fill the delay between PREV_CYCLE and CLOCK_VAR. Used to - pad out the delay between MM (shifts, etc.) and integer operations. */ +static rtx ia64_nop; -static void -nop_cycles_until (clock_var, dump) - int clock_var; - FILE *dump; +/* The following function tries to issue NOPS_NUM nops for the current + state without advancing processor cycle. If it failed, the + function returns FALSE and frees the current state. */ + +static int +try_issue_nops (struct bundle_state *curr_state, int nops_num) { - int prev_clock = prev_cycle; - int cycles_left = clock_var - prev_clock; - bool did_stop = false; + int i; - /* Finish the previous cycle; pad it out with NOPs. */ - if (sched_data.cur == 3) + for (i = 0; i < nops_num; i++) + if (state_transition (curr_state->dfa_state, ia64_nop) >= 0) + { + free_bundle_state (curr_state); + return FALSE; + } + return TRUE; +} + +/* The following function tries to issue INSN for the current + state without advancing processor cycle. 
If it failed, the + function returns FALSE and frees the current state. */ + +static int +try_issue_insn (struct bundle_state *curr_state, rtx insn) +{ + if (insn && state_transition (curr_state->dfa_state, insn) >= 0) { - sched_emit_insn (gen_insn_group_barrier (GEN_INT (3))); - did_stop = true; - maybe_rotate (dump); + free_bundle_state (curr_state); + return FALSE; } - else if (sched_data.cur > 0) - { - int need_stop = 0; - int split = itanium_split_issue (sched_data.packet, prev_first); - - if (sched_data.cur < 3 && split > 3) - { - split = 3; - need_stop = 1; - } + return TRUE; +} - if (split > sched_data.cur) - { - int i; - for (i = sched_data.cur; i < split; i++) - { - rtx t = sched_emit_insn (gen_nop_type (sched_data.packet->t[i])); - sched_data.types[i] = sched_data.packet->t[i]; - sched_data.insns[i] = t; - sched_data.stopbit[i] = 0; - } - sched_data.cur = split; - } +/* The following function tries to issue BEFORE_NOPS_NUM nops and INSN + starting with ORIGINATOR without advancing processor cycle. If + TRY_BUNDLE_END_P is TRUE, the function also/only (if + ONLY_BUNDLE_END_P is TRUE) tries to issue nops to fill all bundle. + If it was successful, the function creates new bundle state and + insert into the hash table and into `index_to_bundle_states'. */ - if (! need_stop && sched_data.cur > 0 && sched_data.cur < 6 - && cycles_left > 1) +static void +issue_nops_and_insn (struct bundle_state *originator, int before_nops_num, + rtx insn, int try_bundle_end_p, int only_bundle_end_p) +{ + struct bundle_state *curr_state; + + curr_state = get_free_bundle_state (); + memcpy (curr_state->dfa_state, originator->dfa_state, dfa_state_size); + curr_state->insn = insn; + curr_state->insn_num = originator->insn_num + 1; + curr_state->cost = originator->cost; + curr_state->originator = originator; + curr_state->before_nops_num = before_nops_num; + curr_state->after_nops_num = 0; + curr_state->accumulated_insns_num + = originator->accumulated_insns_num + before_nops_num; + curr_state->branch_deviation = originator->branch_deviation; + if (insn == NULL_RTX) + abort (); + else if (INSN_CODE (insn) == CODE_FOR_insn_group_barrier) + { + if (GET_MODE (insn) == TImode) + abort (); + if (!try_issue_nops (curr_state, before_nops_num)) + return; + if (!try_issue_insn (curr_state, insn)) + return; + memcpy (temp_dfa_state, curr_state->dfa_state, dfa_state_size); + if (state_transition (temp_dfa_state, dfa_pre_cycle_insn) >= 0 + && curr_state->accumulated_insns_num % 3 != 0) { - int i; - for (i = sched_data.cur; i < 6; i++) - { - rtx t = sched_emit_insn (gen_nop_type (sched_data.packet->t[i])); - sched_data.types[i] = sched_data.packet->t[i]; - sched_data.insns[i] = t; - sched_data.stopbit[i] = 0; - } - sched_data.cur = 6; - cycles_left--; - need_stop = 1; + free_bundle_state (curr_state); + return; } - - if (need_stop || sched_data.cur == 6) + } + else if (GET_MODE (insn) != TImode) + { + if (!try_issue_nops (curr_state, before_nops_num)) + return; + if (!try_issue_insn (curr_state, insn)) + return; + curr_state->accumulated_insns_num++; + if (GET_CODE (PATTERN (insn)) == ASM_INPUT + || asm_noperands (PATTERN (insn)) >= 0) + abort (); + if (ia64_safe_type (insn) == TYPE_L) + curr_state->accumulated_insns_num++; + } + else + { + state_transition (curr_state->dfa_state, dfa_pre_cycle_insn); + state_transition (curr_state->dfa_state, NULL); + curr_state->cost++; + if (!try_issue_nops (curr_state, before_nops_num)) + return; + if (!try_issue_insn (curr_state, insn)) + return; + 
curr_state->accumulated_insns_num++; + if (GET_CODE (PATTERN (insn)) == ASM_INPUT + || asm_noperands (PATTERN (insn)) >= 0) { - sched_emit_insn (gen_insn_group_barrier (GEN_INT (3))); - did_stop = true; + /* Finish bundle containing asm insn. */ + curr_state->after_nops_num + = 3 - curr_state->accumulated_insns_num % 3; + curr_state->accumulated_insns_num + += 3 - curr_state->accumulated_insns_num % 3; } - maybe_rotate (dump); + else if (ia64_safe_type (insn) == TYPE_L) + curr_state->accumulated_insns_num++; } - - cycles_left--; - while (cycles_left > 0) + if (ia64_safe_type (insn) == TYPE_B) + curr_state->branch_deviation + += 2 - (curr_state->accumulated_insns_num - 1) % 3; + if (try_bundle_end_p && curr_state->accumulated_insns_num % 3 != 0) { - sched_emit_insn (gen_bundle_selector (GEN_INT (0))); - sched_emit_insn (gen_nop_type (TYPE_M)); - sched_emit_insn (gen_nop_type (TYPE_I)); - if (cycles_left > 1) + if (!only_bundle_end_p && insert_bundle_state (curr_state)) { - sched_emit_insn (gen_insn_group_barrier (GEN_INT (2))); - cycles_left--; + state_t dfa_state; + struct bundle_state *curr_state1; + struct bundle_state *allocated_states_chain; + + curr_state1 = get_free_bundle_state (); + dfa_state = curr_state1->dfa_state; + allocated_states_chain = curr_state1->allocated_states_chain; + *curr_state1 = *curr_state; + curr_state1->dfa_state = dfa_state; + curr_state1->allocated_states_chain = allocated_states_chain; + memcpy (curr_state1->dfa_state, curr_state->dfa_state, + dfa_state_size); + curr_state = curr_state1; } - sched_emit_insn (gen_nop_type (TYPE_I)); - sched_emit_insn (gen_insn_group_barrier (GEN_INT (3))); - did_stop = true; - cycles_left--; + if (!try_issue_nops (curr_state, + 3 - curr_state->accumulated_insns_num % 3)) + return; + curr_state->after_nops_num + = 3 - curr_state->accumulated_insns_num % 3; + curr_state->accumulated_insns_num + += 3 - curr_state->accumulated_insns_num % 3; } - - if (did_stop) - init_insn_group_barriers (); + if (!insert_bundle_state (curr_state)) + free_bundle_state (curr_state); + return; } -/* We are about to being issuing insns for this clock cycle. - Override the default sort algorithm to better slot instructions. */ +/* The following function returns position in the two window bundle + for given STATE. */ static int -ia64_internal_sched_reorder (dump, sched_verbose, ready, pn_ready, - reorder_type, clock_var) - FILE *dump ATTRIBUTE_UNUSED; - int sched_verbose ATTRIBUTE_UNUSED; - rtx *ready; - int *pn_ready; - int reorder_type, clock_var; +get_max_pos (state_t state) { - int n_asms; - int n_ready = *pn_ready; - rtx *e_ready = ready + n_ready; - rtx *insnp; + if (cpu_unit_reservation_p (state, pos_6)) + return 6; + else if (cpu_unit_reservation_p (state, pos_5)) + return 5; + else if (cpu_unit_reservation_p (state, pos_4)) + return 4; + else if (cpu_unit_reservation_p (state, pos_3)) + return 3; + else if (cpu_unit_reservation_p (state, pos_2)) + return 2; + else if (cpu_unit_reservation_p (state, pos_1)) + return 1; + else + return 0; +} - if (sched_verbose) - { - fprintf (dump, "// ia64_sched_reorder (type %d):\n", reorder_type); - dump_current_packet (dump); - } +/* The function returns code of a possible template for given position + and state. The function should be called only with 2 values of + position equal to 3 or 6. */ - /* Work around the pipeline flush that will occurr if the results of - an MM instruction are accessed before the result is ready. 
Intel - documentation says this only happens with IALU, ISHF, ILOG, LD, - and ST consumers, but experimental evidence shows that *any* non-MM - type instruction will incurr the flush. */ - if (reorder_type == 0 && clock_var > 0 && ia64_final_schedule) +static int +get_template (state_t state, int pos) +{ + switch (pos) { - for (insnp = ready; insnp < e_ready; insnp++) - { - rtx insn = *insnp, link; - enum attr_itanium_class t = ia64_safe_itanium_class (insn); + case 3: + if (cpu_unit_reservation_p (state, _0mii_)) + return 0; + else if (cpu_unit_reservation_p (state, _0mmi_)) + return 1; + else if (cpu_unit_reservation_p (state, _0mfi_)) + return 2; + else if (cpu_unit_reservation_p (state, _0mmf_)) + return 3; + else if (cpu_unit_reservation_p (state, _0bbb_)) + return 4; + else if (cpu_unit_reservation_p (state, _0mbb_)) + return 5; + else if (cpu_unit_reservation_p (state, _0mib_)) + return 6; + else if (cpu_unit_reservation_p (state, _0mmb_)) + return 7; + else if (cpu_unit_reservation_p (state, _0mfb_)) + return 8; + else if (cpu_unit_reservation_p (state, _0mlx_)) + return 9; + else + abort (); + case 6: + if (cpu_unit_reservation_p (state, _1mii_)) + return 0; + else if (cpu_unit_reservation_p (state, _1mmi_)) + return 1; + else if (cpu_unit_reservation_p (state, _1mfi_)) + return 2; + else if (_1mmf_ >= 0 && cpu_unit_reservation_p (state, _1mmf_)) + return 3; + else if (cpu_unit_reservation_p (state, _1bbb_)) + return 4; + else if (cpu_unit_reservation_p (state, _1mbb_)) + return 5; + else if (cpu_unit_reservation_p (state, _1mib_)) + return 6; + else if (cpu_unit_reservation_p (state, _1mmb_)) + return 7; + else if (cpu_unit_reservation_p (state, _1mfb_)) + return 8; + else if (cpu_unit_reservation_p (state, _1mlx_)) + return 9; + else + abort (); + default: + abort (); + } +} - if (t == ITANIUM_CLASS_MMMUL - || t == ITANIUM_CLASS_MMSHF - || t == ITANIUM_CLASS_MMSHFI) - continue; +/* The following function returns an insn important for insn bundling + followed by INSN and before TAIL. */ - for (link = LOG_LINKS (insn); link; link = XEXP (link, 1)) - if (REG_NOTE_KIND (link) == 0) - { - rtx other = XEXP (link, 0); - enum attr_itanium_class t0 = ia64_safe_itanium_class (other); - if (t0 == ITANIUM_CLASS_MMSHF || t0 == ITANIUM_CLASS_MMMUL) - { - nop_cycles_until (clock_var, sched_verbose ? dump : NULL); - goto out; - } - } - } - } - out: +static rtx +get_next_important_insn (rtx insn, rtx tail) +{ + for (; insn && insn != tail; insn = NEXT_INSN (insn)) + if (INSN_P (insn) + && ia64_safe_itanium_class (insn) != ITANIUM_CLASS_IGNORE + && GET_CODE (PATTERN (insn)) != USE + && GET_CODE (PATTERN (insn)) != CLOBBER) + return insn; + return NULL_RTX; +} - prev_first = sched_data.first_slot; - prev_cycle = clock_var; +/* The following function does insn bundling. Bundling means + inserting templates and nop insns to fit insn groups into permitted + templates. Instruction scheduling uses NDFA (non-deterministic + finite automata) encoding informations about the templates and the + inserted nops. Nondeterminism of the automata permits follows + all possible insn sequences very fast. + + Unfortunately it is not possible to get information about inserting + nop insns and used templates from the automata states. The + automata only says that we can issue an insn possibly inserting + some nops before it and using some template. Therefore insn + bundling in this function is implemented by using DFA + (deterministic finite automata). 
We follow all possible insn
+ sequences by inserting 0-2 nops (that is what the NDFA describes for
+ insn scheduling) before/after each insn being bundled. We know the
+ start of a simulated processor cycle from insn scheduling (an insn
+ starting a new cycle has TImode).
+
+ A simple implementation of insn bundling would create an enormous
+ number of possible insn sequences satisfying information about new
+ cycle ticks taken from the insn scheduling. To make the algorithm
+ practical we use dynamic programming. Each decision (about
+ inserting nops and implicitly about previous decisions) is described
+ by structure bundle_state (see above). If we generate the same
+ bundle state (the key is the automaton state after issuing the insns
+ and nops for it), we reuse the already generated one. As a
+ consequence we reject some decisions which cannot improve the
+ solution and reduce memory for the algorithm.
+
+ When we reach the end of the EBB (extended basic block), we choose
+ the best insn sequence and then, moving back through the EBB, insert
+ templates for the best alternative. The templates are taken by
+ querying the automaton state for each insn in the chosen bundle
+ states.
+
+ So the algorithm makes two (forward and backward) passes through
+ the EBB. There is an additional forward pass through the EBB for
+ the Itanium1 processor. This pass inserts more nops to make the
+ dependency between a producer insn and an MMMUL/MMSHF insn at
+ least 4 cycles long. */
- if (reorder_type == 0)
- maybe_rotate (sched_verbose ? dump : NULL);
+static void
+bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
+{
+ struct bundle_state *curr_state, *next_state, *best_state;
+ rtx insn, next_insn;
+ int insn_num;
+ int i, bundle_end_p, only_bundle_end_p, asm_p;
+ int pos = 0, max_pos, template0, template1;
+ rtx b;
+ rtx nop;
+ enum attr_type type;
- /* First, move all USEs, CLOBBERs and other crud out of the way. */
- n_asms = 0;
- for (insnp = ready; insnp < e_ready; insnp++)
- if (insnp < e_ready)
+ insn_num = 0;
+ /* Count insns in the EBB. */
+ for (insn = NEXT_INSN (prev_head_insn);
+ insn && insn != tail;
+ insn = NEXT_INSN (insn))
+ if (INSN_P (insn))
+ insn_num++;
+ if (insn_num == 0)
+ return;
+ bundling_p = 1;
+ dfa_clean_insn_cache ();
+ initiate_bundle_state_table ();
+ index_to_bundle_states = xmalloc ((insn_num + 2)
+ * sizeof (struct bundle_state *));
+ /* First (forward) pass -- generation of bundle states. */
+ curr_state = get_free_bundle_state ();
+ curr_state->insn = NULL;
+ curr_state->before_nops_num = 0;
+ curr_state->after_nops_num = 0;
+ curr_state->insn_num = 0;
+ curr_state->cost = 0;
+ curr_state->accumulated_insns_num = 0;
+ curr_state->branch_deviation = 0;
+ curr_state->next = NULL;
+ curr_state->originator = NULL;
+ state_reset (curr_state->dfa_state);
+ index_to_bundle_states [0] = curr_state;
+ insn_num = 0;
+ /* Shift the cycle mark if it is put on an insn which could be ignored.
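+ An insn carrying TImode marks the start of a new simulated
+ processor cycle.  If such an insn is one the bundler skips (an
+ ITANIUM_CLASS_IGNORE insn, a USE or a CLOBBER), the mark is moved
+ forward to the next important insn so that the cycle boundary
+ stays visible during bundling.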
*/
+ for (insn = NEXT_INSN (prev_head_insn);
+ insn != tail;
+ insn = NEXT_INSN (insn))
+ if (INSN_P (insn)
+ && (ia64_safe_itanium_class (insn) == ITANIUM_CLASS_IGNORE
+ || GET_CODE (PATTERN (insn)) == USE
+ || GET_CODE (PATTERN (insn)) == CLOBBER)
+ && GET_MODE (insn) == TImode)
 {
- rtx insn = *insnp;
- enum attr_type t = ia64_safe_type (insn);
- if (t == TYPE_UNKNOWN)
+ PUT_MODE (insn, VOIDmode);
+ for (next_insn = NEXT_INSN (insn);
+ next_insn != tail;
+ next_insn = NEXT_INSN (next_insn))
+ if (INSN_P (next_insn)
+ && ia64_safe_itanium_class (next_insn) != ITANIUM_CLASS_IGNORE
+ && GET_CODE (PATTERN (next_insn)) != USE
+ && GET_CODE (PATTERN (next_insn)) != CLOBBER)
+ {
+ PUT_MODE (next_insn, TImode);
+ break;
+ }
+ }
+ /* Forward pass: generation of bundle states. */
+ for (insn = get_next_important_insn (NEXT_INSN (prev_head_insn), tail);
+ insn != NULL_RTX;
+ insn = next_insn)
+ {
+ if (!INSN_P (insn)
+ || ia64_safe_itanium_class (insn) == ITANIUM_CLASS_IGNORE
+ || GET_CODE (PATTERN (insn)) == USE
+ || GET_CODE (PATTERN (insn)) == CLOBBER)
+ abort ();
+ type = ia64_safe_type (insn);
+ next_insn = get_next_important_insn (NEXT_INSN (insn), tail);
+ insn_num++;
+ index_to_bundle_states [insn_num] = NULL;
+ for (curr_state = index_to_bundle_states [insn_num - 1];
+ curr_state != NULL;
+ curr_state = next_state)
+ {
+ pos = curr_state->accumulated_insns_num % 3;
+ next_state = curr_state->next;
+ /* We must fill up the current bundle in order to start a
+ subsequent asm insn in a new bundle. An asm insn is always
+ placed in a separate bundle. */
+ only_bundle_end_p
+ = (next_insn != NULL_RTX
+ && INSN_CODE (insn) == CODE_FOR_insn_group_barrier
+ && ia64_safe_type (next_insn) == TYPE_UNKNOWN);
+ /* We may fill up the current bundle if it is the cycle end
+ without a group barrier. */
+ bundle_end_p
+ = (only_bundle_end_p || next_insn == NULL_RTX
+ || (GET_MODE (next_insn) == TImode
+ && INSN_CODE (insn) != CODE_FOR_insn_group_barrier));
+ if (type == TYPE_F || type == TYPE_B || type == TYPE_L
+ || type == TYPE_S
+ /* We need to insert 2 nops for cases like M_MII. To
+ guarantee issuing all insns on the same cycle for
+ Itanium 1, we need to issue 2 nops after the first M
+ insn (MnnMII where n is a nop insn). */
+ || ((type == TYPE_M || type == TYPE_A)
+ && ia64_tune == PROCESSOR_ITANIUM
+ && !bundle_end_p && pos == 1))
+ issue_nops_and_insn (curr_state, 2, insn, bundle_end_p,
+ only_bundle_end_p);
+ issue_nops_and_insn (curr_state, 1, insn, bundle_end_p,
+ only_bundle_end_p);
+ issue_nops_and_insn (curr_state, 0, insn, bundle_end_p,
+ only_bundle_end_p);
+ }
+ if (index_to_bundle_states [insn_num] == NULL)
+ abort ();
+ for (curr_state = index_to_bundle_states [insn_num];
+ curr_state != NULL;
+ curr_state = curr_state->next)
+ if (verbose >= 2 && dump)
 {
- if (GET_CODE (PATTERN (insn)) == ASM_INPUT
- || asm_noperands (PATTERN (insn)) >= 0)
- {
- rtx lowest = ready[n_asms];
- ready[n_asms] = insn;
- *insnp = lowest;
- n_asms++;
- }
- else
- {
- rtx highest = ready[n_ready - 1];
- ready[n_ready - 1] = insn;
- *insnp = highest;
- if (ia64_final_schedule && group_barrier_needed_p (insn))
- {
- schedule_stop (sched_verbose ? dump : NULL);
- sched_data.last_was_stop = 1;
- maybe_rotate (sched_verbose ? dump : NULL);
- }
-
- return 1;
- }
+ /* This structure is taken from generated code of the
+ pipeline hazard recognizer (see file insn-attrtab.c).
+ Please don't forget to change the structure if a new
+ automaton is added to the .md file.
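+ Only the bundling automaton state is printed below: the oneb
+ field when tuning for Itanium1 and the twob field otherwise.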
*/ + struct DFA_chip + { + unsigned short one_automaton_state; + unsigned short oneb_automaton_state; + unsigned short two_automaton_state; + unsigned short twob_automaton_state; + }; + + fprintf + (dump, + "// Bundle state %d (orig %d, cost %d, nops %d/%d, insns %d, branch %d, state %d) for %d\n", + curr_state->unique_num, + (curr_state->originator == NULL + ? -1 : curr_state->originator->unique_num), + curr_state->cost, + curr_state->before_nops_num, curr_state->after_nops_num, + curr_state->accumulated_insns_num, curr_state->branch_deviation, + (ia64_tune == PROCESSOR_ITANIUM + ? ((struct DFA_chip *) curr_state->dfa_state)->oneb_automaton_state + : ((struct DFA_chip *) curr_state->dfa_state)->twob_automaton_state), + INSN_UID (insn)); } - } - if (n_asms < n_ready) - { - /* Some normal insns to process. Skip the asms. */ - ready += n_asms; - n_ready -= n_asms; } - else if (n_ready > 0) - { - /* Only asm insns left. */ - if (ia64_final_schedule && group_barrier_needed_p (ready[n_ready - 1])) + if (index_to_bundle_states [insn_num] == NULL) + /* We should find a solution because the 2nd insn scheduling has + found one. */ + abort (); + /* Find a state corresponding to the best insn sequence. */ + best_state = NULL; + for (curr_state = index_to_bundle_states [insn_num]; + curr_state != NULL; + curr_state = curr_state->next) + /* We are just looking at the states with fully filled up last + bundle. The first we prefer insn sequences with minimal cost + then with minimal inserted nops and finally with branch insns + placed in the 3rd slots. */ + if (curr_state->accumulated_insns_num % 3 == 0 + && (best_state == NULL || best_state->cost > curr_state->cost + || (best_state->cost == curr_state->cost + && (curr_state->accumulated_insns_num + < best_state->accumulated_insns_num + || (curr_state->accumulated_insns_num + == best_state->accumulated_insns_num + && curr_state->branch_deviation + < best_state->branch_deviation))))) + best_state = curr_state; + /* Second (backward) pass: adding nops and templates. */ + insn_num = best_state->before_nops_num; + template0 = template1 = -1; + for (curr_state = best_state; + curr_state->originator != NULL; + curr_state = curr_state->originator) + { + insn = curr_state->insn; + asm_p = (GET_CODE (PATTERN (insn)) == ASM_INPUT + || asm_noperands (PATTERN (insn)) >= 0); + insn_num++; + if (verbose >= 2 && dump) { - schedule_stop (sched_verbose ? dump : NULL); - sched_data.last_was_stop = 1; - maybe_rotate (sched_verbose ? dump : NULL); + struct DFA_chip + { + unsigned short one_automaton_state; + unsigned short oneb_automaton_state; + unsigned short two_automaton_state; + unsigned short twob_automaton_state; + }; + + fprintf + (dump, + "// Best %d (orig %d, cost %d, nops %d/%d, insns %d, branch %d, state %d) for %d\n", + curr_state->unique_num, + (curr_state->originator == NULL + ? -1 : curr_state->originator->unique_num), + curr_state->cost, + curr_state->before_nops_num, curr_state->after_nops_num, + curr_state->accumulated_insns_num, curr_state->branch_deviation, + (ia64_tune == PROCESSOR_ITANIUM + ? ((struct DFA_chip *) curr_state->dfa_state)->oneb_automaton_state + : ((struct DFA_chip *) curr_state->dfa_state)->twob_automaton_state), + INSN_UID (insn)); } - cycle_end_fill_slots (sched_verbose ? 
dump : NULL); - return 1; - } - - if (ia64_final_schedule) - { - int nr_need_stop = 0; - - for (insnp = ready; insnp < e_ready; insnp++) - if (safe_group_barrier_needed_p (*insnp)) - nr_need_stop++; - - /* Schedule a stop bit if - - all insns require a stop bit, or - - we are starting a new cycle and _any_ insns require a stop bit. - The reason for the latter is that if our schedule is accurate, then - the additional stop won't decrease performance at this point (since - there's a split issue at this point anyway), but it gives us more - freedom when scheduling the currently ready insns. */ - if ((reorder_type == 0 && nr_need_stop) - || (reorder_type == 1 && n_ready == nr_need_stop)) + /* Find the position in the current bundle window. The window can + contain at most two bundles. Two bundle window means that + the processor will make two bundle rotation. */ + max_pos = get_max_pos (curr_state->dfa_state); + if (max_pos == 6 + /* The following (negative template number) means that the + processor did one bundle rotation. */ + || (max_pos == 3 && template0 < 0)) { - schedule_stop (sched_verbose ? dump : NULL); - sched_data.last_was_stop = 1; - maybe_rotate (sched_verbose ? dump : NULL); - if (reorder_type == 1) - return 0; + /* We are at the end of the window -- find template(s) for + its bundle(s). */ + pos = max_pos; + if (max_pos == 3) + template0 = get_template (curr_state->dfa_state, 3); + else + { + template1 = get_template (curr_state->dfa_state, 3); + template0 = get_template (curr_state->dfa_state, 6); + } } - else + if (max_pos > 3 && template1 < 0) + /* It may happen when we have the stop inside a bundle. */ { - int deleted = 0; - insnp = e_ready; - /* Move down everything that needs a stop bit, preserving relative - order. */ - while (insnp-- > ready + deleted) - while (insnp >= ready + deleted) + if (pos > 3) + abort (); + template1 = get_template (curr_state->dfa_state, 3); + pos += 3; + } + if (!asm_p) + /* Emit nops after the current insn. */ + for (i = 0; i < curr_state->after_nops_num; i++) + { + nop = gen_nop (); + emit_insn_after (nop, insn); + pos--; + if (pos < 0) + abort (); + if (pos % 3 == 0) { - rtx insn = *insnp; - if (! safe_group_barrier_needed_p (insn)) - break; - memmove (ready + 1, ready, (insnp - ready) * sizeof (rtx)); - *ready = insn; - deleted++; + /* We are at the start of a bundle: emit the template + (it should be defined). */ + if (template0 < 0) + abort (); + b = gen_bundle_selector (GEN_INT (template0)); + ia64_emit_insn_before (b, nop); + /* If we have two bundle window, we make one bundle + rotation. Otherwise template0 will be undefined + (negative value). */ + template0 = template1; + template1 = -1; } - n_ready -= deleted; - ready += deleted; - if (deleted != nr_need_stop) + } + /* Move the position backward in the window. Group barrier has + no slot. Asm insn takes all bundle. */ + if (INSN_CODE (insn) != CODE_FOR_insn_group_barrier + && GET_CODE (PATTERN (insn)) != ASM_INPUT + && asm_noperands (PATTERN (insn)) < 0) + pos--; + /* Long insn takes 2 slots. */ + if (ia64_safe_type (insn) == TYPE_L) + pos--; + if (pos < 0) + abort (); + if (pos % 3 == 0 + && INSN_CODE (insn) != CODE_FOR_insn_group_barrier + && GET_CODE (PATTERN (insn)) != ASM_INPUT + && asm_noperands (PATTERN (insn)) < 0) + { + /* The current insn is at the bundle start: emit the + template. 
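+ pos counts slots downward through the (at most two bundle) issue
+ window, so pos % 3 == 0 means the insn sits at a bundle start;
+ template1 then shifts into template0, exactly as for the nops
+ emitted after the insn above.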
*/ + if (template0 < 0) abort (); + b = gen_bundle_selector (GEN_INT (template0)); + ia64_emit_insn_before (b, insn); + b = PREV_INSN (insn); + insn = b; + /* See comment above in analogous place for emiting nops + after the insn. */ + template0 = template1; + template1 = -1; + } + /* Emit nops after the current insn. */ + for (i = 0; i < curr_state->before_nops_num; i++) + { + nop = gen_nop (); + ia64_emit_insn_before (nop, insn); + nop = PREV_INSN (insn); + insn = nop; + pos--; + if (pos < 0) + abort (); + if (pos % 3 == 0) + { + /* See comment above in analogous place for emiting nops + after the insn. */ + if (template0 < 0) + abort (); + b = gen_bundle_selector (GEN_INT (template0)); + ia64_emit_insn_before (b, insn); + b = PREV_INSN (insn); + insn = b; + template0 = template1; + template1 = -1; + } } } - - return itanium_reorder (sched_verbose ? dump : NULL, - ready, e_ready, reorder_type == 1); + if (ia64_tune == PROCESSOR_ITANIUM) + /* Insert additional cycles for MM-insns (MMMUL and MMSHF). + Itanium1 has a strange design, if the distance between an insn + and dependent MM-insn is less 4 then we have a 6 additional + cycles stall. So we make the distance equal to 4 cycles if it + is less. */ + for (insn = get_next_important_insn (NEXT_INSN (prev_head_insn), tail); + insn != NULL_RTX; + insn = next_insn) + { + if (!INSN_P (insn) + || ia64_safe_itanium_class (insn) == ITANIUM_CLASS_IGNORE + || GET_CODE (PATTERN (insn)) == USE + || GET_CODE (PATTERN (insn)) == CLOBBER) + abort (); + next_insn = get_next_important_insn (NEXT_INSN (insn), tail); + if (INSN_UID (insn) < clocks_length && add_cycles [INSN_UID (insn)]) + /* We found a MM-insn which needs additional cycles. */ + { + rtx last; + int i, j, n; + int pred_stop_p; + + /* Now we are searching for a template of the bundle in + which the MM-insn is placed and the position of the + insn in the bundle (0, 1, 2). Also we are searching + for that there is a stop before the insn. */ + last = prev_active_insn (insn); + pred_stop_p = recog_memoized (last) == CODE_FOR_insn_group_barrier; + if (pred_stop_p) + last = prev_active_insn (last); + n = 0; + for (;; last = prev_active_insn (last)) + if (recog_memoized (last) == CODE_FOR_bundle_selector) + { + template0 = XINT (XVECEXP (PATTERN (last), 0, 0), 0); + if (template0 == 9) + /* The insn is in MLX bundle. Change the template + onto MFI because we will add nops before the + insn. It simplifies subsequent code a lot. */ + PATTERN (last) + = gen_bundle_selector (GEN_INT (2)); /* -> MFI */ + break; + } + else if (recog_memoized (last) != CODE_FOR_insn_group_barrier + && (ia64_safe_itanium_class (last) + != ITANIUM_CLASS_IGNORE)) + n++; + /* Some check of correctness: the stop is not at the + bundle start, there are no more 3 insns in the bundle, + and the MM-insn is not at the start of bundle with + template MLX. */ + if ((pred_stop_p && n == 0) || n > 2 + || (template0 == 9 && n != 0)) + abort (); + /* Put nops after the insn in the bundle. */ + for (j = 3 - n; j > 0; j --) + ia64_emit_insn_before (gen_nop (), insn); + /* It takes into account that we will add more N nops + before the insn lately -- please see code below. */ + add_cycles [INSN_UID (insn)]--; + if (!pred_stop_p || add_cycles [INSN_UID (insn)]) + ia64_emit_insn_before (gen_insn_group_barrier (GEN_INT (3)), + insn); + if (pred_stop_p) + add_cycles [INSN_UID (insn)]--; + for (i = add_cycles [INSN_UID (insn)]; i > 0; i--) + { + /* Insert "MII;" template. 
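+ Each loop iteration accounts for one idle cycle with an all-nop
+ MII bundle; when at least two cycles remain, the stop bit emitted
+ after the second slot turns the bundle into "MI;I;" so that a
+ single bundle covers two cycles.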
*/ + ia64_emit_insn_before (gen_bundle_selector (GEN_INT (0)), + insn); + ia64_emit_insn_before (gen_nop (), insn); + ia64_emit_insn_before (gen_nop (), insn); + if (i > 1) + { + /* To decrease code size, we use "MI;I;" + template. */ + ia64_emit_insn_before + (gen_insn_group_barrier (GEN_INT (3)), insn); + i--; + } + ia64_emit_insn_before (gen_nop (), insn); + ia64_emit_insn_before (gen_insn_group_barrier (GEN_INT (3)), + insn); + } + /* Put the MM-insn in the same slot of a bundle with the + same template as the original one. */ + ia64_emit_insn_before (gen_bundle_selector (GEN_INT (template0)), + insn); + /* To put the insn in the same slot, add necessary number + of nops. */ + for (j = n; j > 0; j --) + ia64_emit_insn_before (gen_nop (), insn); + /* Put the stop if the original bundle had it. */ + if (pred_stop_p) + ia64_emit_insn_before (gen_insn_group_barrier (GEN_INT (3)), + insn); + } + } + free (index_to_bundle_states); + finish_bundle_state_table (); + bundling_p = 0; + dfa_clean_insn_cache (); } -static int -ia64_sched_reorder (dump, sched_verbose, ready, pn_ready, clock_var) - FILE *dump; - int sched_verbose; - rtx *ready; - int *pn_ready; - int clock_var; +/* The following function is called at the end of scheduling BB or + EBB. After reload, it inserts stop bits and does insn bundling. */ + +static void +ia64_sched_finish (FILE *dump, int sched_verbose) { - return ia64_internal_sched_reorder (dump, sched_verbose, ready, - pn_ready, 0, clock_var); + if (sched_verbose) + fprintf (dump, "// Finishing schedule.\n"); + if (!reload_completed) + return; + if (reload_completed) + { + final_emit_insn_group_barriers (dump); + bundling (dump, sched_verbose, current_sched_info->prev_head, + current_sched_info->next_tail); + if (sched_verbose && dump) + fprintf (dump, "// finishing %d-%d\n", + INSN_UID (NEXT_INSN (current_sched_info->prev_head)), + INSN_UID (PREV_INSN (current_sched_info->next_tail))); + + return; + } } -/* Like ia64_sched_reorder, but called after issuing each insn. - Override the default sort algorithm to better slot instructions. */ +/* The following function inserts stop bits in scheduled BB or EBB. */ -static int -ia64_sched_reorder2 (dump, sched_verbose, ready, pn_ready, clock_var) - FILE *dump ATTRIBUTE_UNUSED; - int sched_verbose ATTRIBUTE_UNUSED; - rtx *ready; - int *pn_ready; - int clock_var; -{ - if (sched_data.last_was_stop) - return 0; +static void +final_emit_insn_group_barriers (FILE *dump ATTRIBUTE_UNUSED) +{ + rtx insn; + int need_barrier_p = 0; + rtx prev_insn = NULL_RTX; - /* Detect one special case and try to optimize it. - If we have 1.M;;MI 2.MIx, and slots 2.1 (M) and 2.2 (I) are both NOPs, - then we can get better code by transforming this to 1.MFB;; 2.MIx. */ - if (sched_data.first_slot == 1 - && sched_data.stopbit[0] - && ((sched_data.cur == 4 - && (sched_data.types[1] == TYPE_M || sched_data.types[1] == TYPE_A) - && (sched_data.types[2] == TYPE_I || sched_data.types[2] == TYPE_A) - && (sched_data.types[3] != TYPE_M && sched_data.types[3] != TYPE_A)) - || (sched_data.cur == 3 - && (sched_data.types[1] == TYPE_M - || sched_data.types[1] == TYPE_A) - && (sched_data.types[2] != TYPE_M - && sched_data.types[2] != TYPE_I - && sched_data.types[2] != TYPE_A)))) - - { - int i, best; - rtx stop = sched_data.insns[1]; + init_insn_group_barriers (); - /* Search backward for the stop bit that must be there. 
*/ - while (1) + for (insn = NEXT_INSN (current_sched_info->prev_head); + insn != current_sched_info->next_tail; + insn = NEXT_INSN (insn)) + { + if (GET_CODE (insn) == BARRIER) { - int insn_code; - - stop = PREV_INSN (stop); - if (GET_CODE (stop) != INSN) - abort (); - insn_code = recog_memoized (stop); - - /* Ignore .pred.rel.mutex. + rtx last = prev_active_insn (insn); - ??? Update this to ignore cycle display notes too - ??? once those are implemented */ - if (insn_code == CODE_FOR_pred_rel_mutex - || insn_code == CODE_FOR_prologue_use) + if (! last) continue; + if (GET_CODE (last) == JUMP_INSN + && GET_CODE (PATTERN (last)) == ADDR_DIFF_VEC) + last = prev_active_insn (last); + if (recog_memoized (last) != CODE_FOR_insn_group_barrier) + emit_insn_after (gen_insn_group_barrier (GEN_INT (3)), last); - if (insn_code == CODE_FOR_insn_group_barrier) - break; - abort (); + init_insn_group_barriers (); + need_barrier_p = 0; + prev_insn = NULL_RTX; } - - /* Adjust the stop bit's slot selector. */ - if (INTVAL (XVECEXP (PATTERN (stop), 0, 0)) != 1) - abort (); - XVECEXP (PATTERN (stop), 0, 0) = GEN_INT (3); - - sched_data.stopbit[0] = 0; - sched_data.stopbit[2] = 1; - - sched_data.types[5] = sched_data.types[3]; - sched_data.types[4] = sched_data.types[2]; - sched_data.types[3] = sched_data.types[1]; - sched_data.insns[5] = sched_data.insns[3]; - sched_data.insns[4] = sched_data.insns[2]; - sched_data.insns[3] = sched_data.insns[1]; - sched_data.stopbit[5] = sched_data.stopbit[4] = sched_data.stopbit[3] = 0; - sched_data.cur += 2; - sched_data.first_slot = 3; - for (i = 0; i < NR_PACKETS; i++) + else if (INSN_P (insn)) { - const struct ia64_packet *p = packets + i; - if (p->t[0] == TYPE_M && p->t[1] == TYPE_F && p->t[2] == TYPE_B) + if (recog_memoized (insn) == CODE_FOR_insn_group_barrier) { - sched_data.packet = p; - break; + init_insn_group_barriers (); + need_barrier_p = 0; + prev_insn = NULL_RTX; } - } - rotate_one_bundle (sched_verbose ? dump : NULL); - - best = 6; - for (i = 0; i < NR_PACKETS; i++) - { - const struct ia64_packet *p = packets + i; - int split = get_split (p, sched_data.first_slot); - int next; - - /* Disallow multiway branches here. 
*/
- if (p->t[1] == TYPE_B)
- continue;
-
- if (packet_matches_p (p, split, &next) && next < best)
+ else if (need_barrier_p || group_barrier_needed_p (insn))
 {
- best = next;
- sched_data.packet = p;
- sched_data.split = split;
+ if (TARGET_EARLY_STOP_BITS)
+ {
+ rtx last;
+
+ for (last = insn;
+ last != current_sched_info->prev_head;
+ last = PREV_INSN (last))
+ if (INSN_P (last) && GET_MODE (last) == TImode
+ && stops_p [INSN_UID (last)])
+ break;
+ if (last == current_sched_info->prev_head)
+ last = insn;
+ last = prev_active_insn (last);
+ if (last
+ && recog_memoized (last) != CODE_FOR_insn_group_barrier)
+ emit_insn_after (gen_insn_group_barrier (GEN_INT (3)),
+ last);
+ init_insn_group_barriers ();
+ for (last = NEXT_INSN (last);
+ last != insn;
+ last = NEXT_INSN (last))
+ if (INSN_P (last))
+ group_barrier_needed_p (last);
+ }
+ else
+ {
+ emit_insn_before (gen_insn_group_barrier (GEN_INT (3)),
+ insn);
+ init_insn_group_barriers ();
+ }
+ group_barrier_needed_p (insn);
+ prev_insn = NULL_RTX;
 }
+ else if (recog_memoized (insn) >= 0)
+ prev_insn = insn;
+ need_barrier_p = (GET_CODE (insn) == CALL_INSN
+ || GET_CODE (PATTERN (insn)) == ASM_INPUT
+ || asm_noperands (PATTERN (insn)) >= 0);
 }
- if (best == 6)
- abort ();
 }
+}
- if (*pn_ready > 0)
- {
- int more = ia64_internal_sched_reorder (dump, sched_verbose,
- ready, pn_ready, 1,
- clock_var);
- if (more)
- return more;
- /* Did we schedule a stop? If so, finish this cycle. */
- if (sched_data.cur == sched_data.first_slot)
- return 0;
- }
+
- if (sched_verbose)
- fprintf (dump, "// Can't issue more this cycle; updating type array.\n");
+/* If the following function returns TRUE, we will use the DFA
+ insn scheduler. */
- cycle_end_fill_slots (sched_verbose ? dump : NULL);
- if (sched_verbose)
- dump_current_packet (dump);
- return 0;
+static int
+ia64_use_dfa_pipeline_interface (void)
+{
+ return 1;
 }
-/* We are about to issue INSN. Return the number of insns left on the
- ready queue that can be issued this cycle. */
+/* The following function returns the depth of the first-cycle
+ multipass lookahead used by the DFA insn scheduler. */
 static int
-ia64_variable_issue (dump, sched_verbose, insn, can_issue_more)
- FILE *dump;
- int sched_verbose;
- rtx insn;
- int can_issue_more ATTRIBUTE_UNUSED;
+ia64_first_cycle_multipass_dfa_lookahead (void)
 {
- enum attr_type t = ia64_safe_type (insn);
+ return (reload_completed ? 6 : 4);
+}
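ia64_first_cycle_multipass_dfa_lookahead above only reports a depth (6 once bundling is at stake after reload, 4 before); the generic haifa scheduler performs the actual search. A toy model of what that lookahead buys, with invented names (pick_insn, required_unit) and a simple free/busy table standing in for the real DFA:

#include <stdio.h>

#define NUM_UNITS 4  /* toy issue ports: 0 = M, 1 = I, 2 = F, 3 = B */

/* Return the index of the first ready insn, scanning at most
   LOOKAHEAD candidates, whose required unit is still free this
   cycle; -1 if none fits.  A greedy scheduler is LOOKAHEAD == 1.  */
static int
pick_insn (const int *required_unit, int n, int lookahead,
           const int *busy)
{
  int i;

  for (i = 0; i < n && i < lookahead; i++)
    if (!busy[required_unit[i]])
      return i;
  return -1;
}

int
main (void)
{
  int required_unit[] = { 1, 1, 0, 2 };  /* I, I, M, F */
  int busy[NUM_UNITS] = { 0, 1, 0, 0 };  /* the I port is taken */

  /* With depth 1 we stall (-1); with depth 4 the M-unit insn at
     index 2 can still be issued this cycle.  */
  printf ("lookahead 1: %d\n", pick_insn (required_unit, 4, 1, busy));
  printf ("lookahead 4: %d\n", pick_insn (required_unit, 4, 4, busy));
  return 0;
}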
- if (sched_data.last_was_stop)
- {
- int t = sched_data.first_slot;
- if (t == 0)
- t = 3;
- ia64_emit_insn_before (gen_insn_group_barrier (GEN_INT (t)), insn);
- init_insn_group_barriers ();
- sched_data.last_was_stop = 0;
- }
+
+/* The following function initializes the variable `dfa_pre_cycle_insn'. */
- if (t == TYPE_UNKNOWN)
+static void
+ia64_init_dfa_pre_cycle_insn (void)
+{
+ if (temp_dfa_state == NULL)
 {
- if (sched_verbose)
- fprintf (dump, "// Ignoring type %s\n", type_names[t]);
- if (GET_CODE (PATTERN (insn)) == ASM_INPUT
- || asm_noperands (PATTERN (insn)) >= 0)
- {
- /* This must be some kind of asm. Clear the scheduling state. */
- rotate_two_bundles (sched_verbose ? dump : NULL);
- if (ia64_final_schedule)
- group_barrier_needed_p (insn);
- }
- return 1;
+ dfa_state_size = state_size ();
+ temp_dfa_state = xmalloc (dfa_state_size);
+ prev_cycle_state = xmalloc (dfa_state_size);
 }
+ dfa_pre_cycle_insn = make_insn_raw (gen_pre_cycle ());
+ PREV_INSN (dfa_pre_cycle_insn) = NEXT_INSN (dfa_pre_cycle_insn) = NULL_RTX;
+ recog_memoized (dfa_pre_cycle_insn);
+ dfa_stop_insn = make_insn_raw (gen_insn_group_barrier (GEN_INT (3)));
+ PREV_INSN (dfa_stop_insn) = NEXT_INSN (dfa_stop_insn) = NULL_RTX;
+ recog_memoized (dfa_stop_insn);
+}
+
+/* The following function returns the pseudo insn DFA_PRE_CYCLE_INSN
+ used by the DFA insn scheduler. */
+
+static rtx
+ia64_dfa_pre_cycle_insn (void)
+{
+ return dfa_pre_cycle_insn;
+}
+
+/* The following function returns TRUE if PRODUCER (of type ilog or
+ ld) produces an address for CONSUMER (of type st or stf). */
- /* This is _not_ just a sanity check. group_barrier_needed_p will update
- important state info. Don't delete this test. */
- if (ia64_final_schedule
- && group_barrier_needed_p (insn))
+int
+ia64_st_address_bypass_p (rtx producer, rtx consumer)
+{
+ rtx dest, reg, mem;
+
+ if (producer == NULL_RTX || consumer == NULL_RTX)
+ abort ();
+ dest = ia64_single_set (producer);
+ if (dest == NULL_RTX || (reg = SET_DEST (dest)) == NULL_RTX
+ || (GET_CODE (reg) != REG && GET_CODE (reg) != SUBREG))
 abort ();
+ if (GET_CODE (reg) == SUBREG)
+ reg = SUBREG_REG (reg);
+ dest = ia64_single_set (consumer);
+ if (dest == NULL_RTX || (mem = SET_DEST (dest)) == NULL_RTX
+ || GET_CODE (mem) != MEM)
+ abort ();
+ return reg_mentioned_p (reg, mem);
+}
- sched_data.stopbit[sched_data.cur] = 0;
- sched_data.insns[sched_data.cur] = insn;
- sched_data.types[sched_data.cur] = t;
+/* The following function returns TRUE if PRODUCER (of type ilog or
+ ld) produces an address for CONSUMER (of type ld or fld). */
- sched_data.cur++;
- if (sched_verbose)
- fprintf (dump, "// Scheduling insn %d of type %s\n",
- INSN_UID (insn), type_names[t]);
+int
+ia64_ld_address_bypass_p (rtx producer, rtx consumer)
+{
+ rtx dest, src, reg, mem;
- if (GET_CODE (insn) == CALL_INSN && ia64_final_schedule)
- {
- schedule_stop (sched_verbose ? dump : NULL);
- sched_data.last_was_stop = 1;
- }
+ if (producer == NULL_RTX || consumer == NULL_RTX)
+ abort ();
+ dest = ia64_single_set (producer);
+ if (dest == NULL_RTX || (reg = SET_DEST (dest)) == NULL_RTX
+ || (GET_CODE (reg) != REG && GET_CODE (reg) != SUBREG))
+ abort ();
+ if (GET_CODE (reg) == SUBREG)
+ reg = SUBREG_REG (reg);
+ src = ia64_single_set (consumer);
+ if (src == NULL_RTX || (mem = SET_SRC (src)) == NULL_RTX)
+ abort ();
+ if (GET_CODE (mem) == UNSPEC && XVECLEN (mem, 0) > 0)
+ mem = XVECEXP (mem, 0, 0);
+ while (GET_CODE (mem) == SUBREG || GET_CODE (mem) == ZERO_EXTEND)
+ mem = XEXP (mem, 0);
- return 1;
+ /* Note that LO_SUM is used for GOT loads. */
+ if (GET_CODE (mem) != LO_SUM && GET_CODE (mem) != MEM)
+ abort ();
+
+ return reg_mentioned_p (reg, mem);
 }
-/* Free data allocated by ia64_sched_init. */
+/* The following function returns TRUE if INSN produces an address
+ for a load/store insn. We will place such insns into the M slot
+ because that decreases their latency time. 
*/ -static void -ia64_sched_finish (dump, sched_verbose) - FILE *dump; - int sched_verbose; +int +ia64_produce_address_p (rtx insn) { - if (sched_verbose) - fprintf (dump, "// Finishing schedule.\n"); - rotate_two_bundles (NULL); - free (sched_types); - free (sched_ready); + return insn->call; } + /* Emit pseudo-ops for the assembler to describe predicate relations. At present this assumes that we only consider predicate pairs to @@ -6967,14 +7436,14 @@ ia64_sched_finish (dump, sched_verbose) straight-line code. */ static void -emit_predicate_relation_info () +emit_predicate_relation_info (void) { basic_block bb; FOR_EACH_BB_REVERSE (bb) { int r; - rtx head = bb->head; + rtx head = BB_HEAD (bb); /* We only need such notes at code labels. */ if (GET_CODE (head) != CODE_LABEL) @@ -6988,8 +7457,8 @@ emit_predicate_relation_info () { rtx p = gen_rtx_REG (BImode, r); rtx n = emit_insn_after (gen_pred_rel_mutex (p), head); - if (head == bb->end) - bb->end = n; + if (head == BB_END (bb)) + BB_END (bb) = n; head = n; } } @@ -7000,8 +7469,8 @@ emit_predicate_relation_info () the call. */ FOR_EACH_BB_REVERSE (bb) { - rtx insn = bb->head; - + rtx insn = BB_HEAD (bb); + while (1) { if (GET_CODE (insn) == CALL_INSN @@ -7010,130 +7479,23 @@ emit_predicate_relation_info () { rtx b = emit_insn_before (gen_safe_across_calls_all (), insn); rtx a = emit_insn_after (gen_safe_across_calls_normal (), insn); - if (bb->head == insn) - bb->head = b; - if (bb->end == insn) - bb->end = a; + if (BB_HEAD (bb) == insn) + BB_HEAD (bb) = b; + if (BB_END (bb) == insn) + BB_END (bb) = a; } - - if (insn == bb->end) + + if (insn == BB_END (bb)) break; insn = NEXT_INSN (insn); } } } -/* Generate a NOP instruction of type T. We will never generate L type - nops. */ - -static rtx -gen_nop_type (t) - enum attr_type t; -{ - switch (t) - { - case TYPE_M: - return gen_nop_m (); - case TYPE_I: - return gen_nop_i (); - case TYPE_B: - return gen_nop_b (); - case TYPE_F: - return gen_nop_f (); - case TYPE_X: - return gen_nop_x (); - default: - abort (); - } -} - -/* After the last scheduling pass, fill in NOPs. It's easier to do this - here than while scheduling. */ - -static void -ia64_emit_nops () -{ - rtx insn; - const struct bundle *b = 0; - int bundle_pos = 0; - - for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) - { - rtx pat; - enum attr_type t; - pat = INSN_P (insn) ? 
PATTERN (insn) : const0_rtx; - if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER) - continue; - if ((GET_CODE (pat) == UNSPEC && XINT (pat, 1) == UNSPEC_BUNDLE_SELECTOR) - || GET_CODE (insn) == CODE_LABEL) - { - if (b) - while (bundle_pos < 3) - { - emit_insn_before (gen_nop_type (b->t[bundle_pos]), insn); - bundle_pos++; - } - if (GET_CODE (insn) != CODE_LABEL) - b = bundle + INTVAL (XVECEXP (pat, 0, 0)); - else - b = 0; - bundle_pos = 0; - continue; - } - else if (GET_CODE (pat) == UNSPEC_VOLATILE - && XINT (pat, 1) == UNSPECV_INSN_GROUP_BARRIER) - { - int t = INTVAL (XVECEXP (pat, 0, 0)); - if (b) - while (bundle_pos < t) - { - emit_insn_before (gen_nop_type (b->t[bundle_pos]), insn); - bundle_pos++; - } - continue; - } - - if (bundle_pos == 3) - b = 0; - - if (b && INSN_P (insn)) - { - t = ia64_safe_type (insn); - if (asm_noperands (PATTERN (insn)) >= 0 - || GET_CODE (PATTERN (insn)) == ASM_INPUT) - { - while (bundle_pos < 3) - { - if (b->t[bundle_pos] != TYPE_L) - emit_insn_before (gen_nop_type (b->t[bundle_pos]), insn); - bundle_pos++; - } - continue; - } - - if (t == TYPE_UNKNOWN) - continue; - while (bundle_pos < 3) - { - if (t == b->t[bundle_pos] - || (t == TYPE_A && (b->t[bundle_pos] == TYPE_M - || b->t[bundle_pos] == TYPE_I))) - break; - - emit_insn_before (gen_nop_type (b->t[bundle_pos]), insn); - bundle_pos++; - } - if (bundle_pos < 3) - bundle_pos++; - } - } -} - /* Perform machine dependent operations on the rtl chain INSNS. */ -void -ia64_reorg (insns) - rtx insns; +static void +ia64_reorg (void) { /* We are freeing block_for_insn in the toplev to keep compatibility with old MDEP_REORGS that are not CFG based. Recompute it now. */ @@ -7151,17 +7513,91 @@ ia64_reorg (insns) { timevar_push (TV_SCHED2); ia64_final_schedule = 1; + + initiate_bundle_states (); + ia64_nop = make_insn_raw (gen_nop ()); + PREV_INSN (ia64_nop) = NEXT_INSN (ia64_nop) = NULL_RTX; + recog_memoized (ia64_nop); + clocks_length = get_max_uid () + 1; + stops_p = xcalloc (1, clocks_length); + if (ia64_tune == PROCESSOR_ITANIUM) + { + clocks = xcalloc (clocks_length, sizeof (int)); + add_cycles = xcalloc (clocks_length, sizeof (int)); + } + if (ia64_tune == PROCESSOR_ITANIUM2) + { + pos_1 = get_cpu_unit_code ("2_1"); + pos_2 = get_cpu_unit_code ("2_2"); + pos_3 = get_cpu_unit_code ("2_3"); + pos_4 = get_cpu_unit_code ("2_4"); + pos_5 = get_cpu_unit_code ("2_5"); + pos_6 = get_cpu_unit_code ("2_6"); + _0mii_ = get_cpu_unit_code ("2b_0mii."); + _0mmi_ = get_cpu_unit_code ("2b_0mmi."); + _0mfi_ = get_cpu_unit_code ("2b_0mfi."); + _0mmf_ = get_cpu_unit_code ("2b_0mmf."); + _0bbb_ = get_cpu_unit_code ("2b_0bbb."); + _0mbb_ = get_cpu_unit_code ("2b_0mbb."); + _0mib_ = get_cpu_unit_code ("2b_0mib."); + _0mmb_ = get_cpu_unit_code ("2b_0mmb."); + _0mfb_ = get_cpu_unit_code ("2b_0mfb."); + _0mlx_ = get_cpu_unit_code ("2b_0mlx."); + _1mii_ = get_cpu_unit_code ("2b_1mii."); + _1mmi_ = get_cpu_unit_code ("2b_1mmi."); + _1mfi_ = get_cpu_unit_code ("2b_1mfi."); + _1mmf_ = get_cpu_unit_code ("2b_1mmf."); + _1bbb_ = get_cpu_unit_code ("2b_1bbb."); + _1mbb_ = get_cpu_unit_code ("2b_1mbb."); + _1mib_ = get_cpu_unit_code ("2b_1mib."); + _1mmb_ = get_cpu_unit_code ("2b_1mmb."); + _1mfb_ = get_cpu_unit_code ("2b_1mfb."); + _1mlx_ = get_cpu_unit_code ("2b_1mlx."); + } + else + { + pos_1 = get_cpu_unit_code ("1_1"); + pos_2 = get_cpu_unit_code ("1_2"); + pos_3 = get_cpu_unit_code ("1_3"); + pos_4 = get_cpu_unit_code ("1_4"); + pos_5 = get_cpu_unit_code ("1_5"); + pos_6 = get_cpu_unit_code ("1_6"); + _0mii_ = get_cpu_unit_code 
("1b_0mii."); + _0mmi_ = get_cpu_unit_code ("1b_0mmi."); + _0mfi_ = get_cpu_unit_code ("1b_0mfi."); + _0mmf_ = get_cpu_unit_code ("1b_0mmf."); + _0bbb_ = get_cpu_unit_code ("1b_0bbb."); + _0mbb_ = get_cpu_unit_code ("1b_0mbb."); + _0mib_ = get_cpu_unit_code ("1b_0mib."); + _0mmb_ = get_cpu_unit_code ("1b_0mmb."); + _0mfb_ = get_cpu_unit_code ("1b_0mfb."); + _0mlx_ = get_cpu_unit_code ("1b_0mlx."); + _1mii_ = get_cpu_unit_code ("1b_1mii."); + _1mmi_ = get_cpu_unit_code ("1b_1mmi."); + _1mfi_ = get_cpu_unit_code ("1b_1mfi."); + _1mmf_ = get_cpu_unit_code ("1b_1mmf."); + _1bbb_ = get_cpu_unit_code ("1b_1bbb."); + _1mbb_ = get_cpu_unit_code ("1b_1mbb."); + _1mib_ = get_cpu_unit_code ("1b_1mib."); + _1mmb_ = get_cpu_unit_code ("1b_1mmb."); + _1mfb_ = get_cpu_unit_code ("1b_1mfb."); + _1mlx_ = get_cpu_unit_code ("1b_1mlx."); + } schedule_ebbs (rtl_dump_file); + finish_bundle_states (); + if (ia64_tune == PROCESSOR_ITANIUM) + { + free (add_cycles); + free (clocks); + } + free (stops_p); + emit_insn_group_barriers (rtl_dump_file); + ia64_final_schedule = 0; timevar_pop (TV_SCHED2); - - /* This relies on the NOTE_INSN_BASIC_BLOCK notes to be in the same - place as they were during scheduling. */ - emit_insn_group_barriers (rtl_dump_file, insns); - ia64_emit_nops (); } else - emit_all_insn_group_barriers (rtl_dump_file, insns); + emit_all_insn_group_barriers (rtl_dump_file); /* A call must not be the last instruction in a function, so that the return address is still within the function, so that unwinding works @@ -7174,11 +7610,12 @@ ia64_reorg (insns) insn = get_last_insn (); if (! INSN_P (insn)) insn = prev_active_insn (insn); - if (GET_CODE (insn) == INSN - && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE - && XINT (PATTERN (insn), 1) == UNSPECV_INSN_GROUP_BARRIER) - { - saw_stop = 1; + /* Skip over insns that expand to nothing. */ + while (GET_CODE (insn) == INSN && get_attr_empty (insn) == EMPTY_YES) + { + if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE + && XINT (PATTERN (insn), 1) == UNSPECV_INSN_GROUP_BARRIER) + saw_stop = 1; insn = prev_active_insn (insn); } if (GET_CODE (insn) == CALL_INSN) @@ -7197,8 +7634,7 @@ ia64_reorg (insns) /* Return true if REGNO is used by the epilogue. */ int -ia64_epilogue_uses (regno) - int regno; +ia64_epilogue_uses (int regno) { switch (regno) { @@ -7237,8 +7673,7 @@ ia64_epilogue_uses (regno) /* Return true if REGNO is used by the frame unwinder. */ int -ia64_eh_uses (regno) - int regno; +ia64_eh_uses (int regno) { if (! reload_completed) return 0; @@ -7262,18 +7697,7 @@ ia64_eh_uses (regno) return 0; } -/* For ia64, SYMBOL_REF_FLAG set means that it is a function. - - We add @ to the name if this goes in small data/bss. We can only put - a variable in small data/bss if it is defined in this module or a module - that we are statically linked with. We can't check the second condition, - but TREE_STATIC gives us the first one. */ - -/* ??? If we had IPA, we could check the second condition. We could support - programmer added section attributes if the variable is not defined in this - module. */ - -/* ??? See the v850 port for a cleaner way to do this. */ +/* Return true if this goes in small data/bss. */ /* ??? We could also support own long data here. Generating movl/add/ld8 instead of addl,ld8/ld8. This makes the code bigger, but should make the @@ -7281,12 +7705,19 @@ ia64_eh_uses (regno) types which can't go in sdata/sbss. 
*/ static bool -ia64_in_small_data_p (exp) - tree exp; +ia64_in_small_data_p (tree exp) { if (TARGET_NO_SDATA) return false; + /* We want to merge strings, so we never consider them small data. */ + if (TREE_CODE (exp) == STRING_CST) + return false; + + /* Functions are never small data. */ + if (TREE_CODE (exp) == FUNCTION_DECL) + return false; + if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp)) { const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp)); @@ -7306,93 +7737,6 @@ ia64_in_small_data_p (exp) return false; } - -static void -ia64_encode_section_info (decl, first) - tree decl; - int first ATTRIBUTE_UNUSED; -{ - const char *symbol_str; - bool is_local; - rtx symbol; - char encoding = 0; - - if (TREE_CODE (decl) == FUNCTION_DECL) - { - SYMBOL_REF_FLAG (XEXP (DECL_RTL (decl), 0)) = 1; - return; - } - - /* Careful not to prod global register variables. */ - if (TREE_CODE (decl) != VAR_DECL - || GET_CODE (DECL_RTL (decl)) != MEM - || GET_CODE (XEXP (DECL_RTL (decl), 0)) != SYMBOL_REF) - return; - - symbol = XEXP (DECL_RTL (decl), 0); - symbol_str = XSTR (symbol, 0); - - is_local = (*targetm.binds_local_p) (decl); - - if (TREE_CODE (decl) == VAR_DECL && DECL_THREAD_LOCAL (decl)) - encoding = " GLil"[decl_tls_model (decl)]; - /* Determine if DECL will wind up in .sdata/.sbss. */ - else if (is_local && ia64_in_small_data_p (decl)) - encoding = 's'; - - /* Finally, encode this into the symbol string. */ - if (encoding) - { - char *newstr; - size_t len; - - if (symbol_str[0] == ENCODE_SECTION_INFO_CHAR) - { - if (encoding == symbol_str[1]) - return; - /* ??? Sdata became thread or thread becaome not thread. Lose. */ - abort (); - } - - len = strlen (symbol_str); - newstr = alloca (len + 3); - newstr[0] = ENCODE_SECTION_INFO_CHAR; - newstr[1] = encoding; - memcpy (newstr + 2, symbol_str, len + 1); - - XSTR (symbol, 0) = ggc_alloc_string (newstr, len + 2); - } - - /* This decl is marked as being in small data/bss but it shouldn't be; - one likely explanation for this is that the decl has been moved into - a different section from the one it was in when encode_section_info - was first called. Remove the encoding. */ - else if (symbol_str[0] == ENCODE_SECTION_INFO_CHAR) - XSTR (symbol, 0) = ggc_strdup (symbol_str + 2); -} - -static const char * -ia64_strip_name_encoding (str) - const char *str; -{ - if (str[0] == ENCODE_SECTION_INFO_CHAR) - str += 2; - if (str[0] == '*') - str++; - return str; -} - -/* True if it is OK to do sibling call optimization for the specified - call expression EXP. DECL will be the called function, or NULL if - this is an indirect call. */ -bool -ia64_function_ok_for_sibcall (decl) - tree decl; -{ - /* We must always return with our current GP. This means we can - only sibcall to functions defined in the current module. */ - return decl && (*targetm.binds_local_p) (decl); -} /* Output assembly directives for prologue regions. */ @@ -7407,7 +7751,7 @@ static bool need_copy_state; /* The function emits unwind directives for the start of an epilogue. */ static void -process_epilogue () +process_epilogue (void) { /* If this isn't the last block of the function, then we need to label the current state, and copy it back in at the start of the next block. */ @@ -7425,9 +7769,7 @@ process_epilogue () which result in emitting an assembly directive required for unwinding. 
*/ static int -process_set (asm_out_file, pat) - FILE *asm_out_file; - rtx pat; +process_set (FILE *asm_out_file, rtx pat) { rtx src = SET_SRC (pat); rtx dest = SET_DEST (pat); @@ -7460,12 +7802,8 @@ process_set (asm_out_file, pat) if (op0 == dest && GET_CODE (op1) == CONST_INT) { if (INTVAL (op1) < 0) - { - fputs ("\t.fframe ", asm_out_file); - fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, - -INTVAL (op1)); - fputc ('\n', asm_out_file); - } + fprintf (asm_out_file, "\t.fframe "HOST_WIDE_INT_PRINT_DEC"\n", + -INTVAL (op1)); else process_epilogue (); } @@ -7641,9 +7979,7 @@ process_set (asm_out_file, pat) /* This function looks at a single insn and emits any directives required to unwind this insn. */ void -process_for_unwind_directive (asm_out_file, insn) - FILE *asm_out_file; - rtx insn; +process_for_unwind_directive (FILE *asm_out_file, rtx insn) { if (flag_unwind_tables || (flag_exceptions && !USING_SJLJ_EXCEPTIONS)) @@ -7700,7 +8036,7 @@ process_for_unwind_directive (asm_out_file, insn) void -ia64_init_builtins () +ia64_init_builtins (void) { tree psi_type_node = build_pointer_type (integer_type_node); tree pdi_type_node = build_pointer_type (long_integer_type_node); @@ -7744,6 +8080,36 @@ ia64_init_builtins () tree void_ftype_pdi = build_function_type_list (void_type_node, pdi_type_node, NULL_TREE); + tree fpreg_type; + tree float80_type; + + /* The __fpreg type. */ + fpreg_type = make_node (REAL_TYPE); + /* ??? The back end should know to load/save __fpreg variables using + the ldf.fill and stf.spill instructions. */ + TYPE_PRECISION (fpreg_type) = 96; + layout_type (fpreg_type); + (*lang_hooks.types.register_builtin_type) (fpreg_type, "__fpreg"); + + /* The __float80 type. */ + float80_type = make_node (REAL_TYPE); + TYPE_PRECISION (float80_type) = 96; + layout_type (float80_type); + (*lang_hooks.types.register_builtin_type) (float80_type, "__float80"); + + /* The __float128 type. */ + if (!TARGET_HPUX) + { + tree float128_type = make_node (REAL_TYPE); + TYPE_PRECISION (float128_type) = 128; + layout_type (float128_type); + (*lang_hooks.types.register_builtin_type) (float128_type, "__float128"); + } + else + /* Under HPUX, this is a synonym for "long double". */ + (*lang_hooks.types.register_builtin_type) (long_double_type_node, + "__float128"); + #define def_builtin(name, type, code) \ builtin_function ((name), (type), (code), BUILT_IN_MD, NULL, NULL_TREE) @@ -7772,8 +8138,8 @@ ia64_init_builtins () build_function_type (ptr_type_node, void_list_node), IA64_BUILTIN_BSP); - def_builtin ("__builtin_ia64_flushrs", - build_function_type (void_type_node, void_list_node), + def_builtin ("__builtin_ia64_flushrs", + build_function_type (void_type_node, void_list_node), IA64_BUILTIN_FLUSHRS); def_builtin ("__sync_fetch_and_add_si", si_ftype_psi_si, @@ -7844,11 +8210,8 @@ ia64_init_builtins () */ static rtx -ia64_expand_fetch_and_op (binoptab, mode, arglist, target) - optab binoptab; - enum machine_mode mode; - tree arglist; - rtx target; +ia64_expand_fetch_and_op (optab binoptab, enum machine_mode mode, + tree arglist, rtx target) { rtx ret, label, tmp, ccv, insn, mem, value; tree arg0, arg1; @@ -7884,13 +8247,14 @@ ia64_expand_fetch_and_op (binoptab, mode, arglist, target) } tmp = gen_reg_rtx (mode); - ccv = gen_rtx_REG (mode, AR_CCV_REGNUM); + /* ar.ccv must always be loaded with a zero-extended DImode value. 
*/ + ccv = gen_rtx_REG (DImode, AR_CCV_REGNUM); emit_move_insn (tmp, mem); label = gen_label_rtx (); emit_label (label); emit_move_insn (ret, tmp); - emit_move_insn (ccv, tmp); + convert_move (ccv, tmp, /*unsignedp=*/1); /* Perform the specific operation. Special case NAND by noticing one_cmpl_optab instead. */ @@ -7925,11 +8289,8 @@ ia64_expand_fetch_and_op (binoptab, mode, arglist, target) */ static rtx -ia64_expand_op_and_fetch (binoptab, mode, arglist, target) - optab binoptab; - enum machine_mode mode; - tree arglist; - rtx target; +ia64_expand_op_and_fetch (optab binoptab, enum machine_mode mode, + tree arglist, rtx target) { rtx old, label, tmp, ret, ccv, insn, mem, value; tree arg0, arg1; @@ -7953,14 +8314,15 @@ ia64_expand_op_and_fetch (binoptab, mode, arglist, target) emit_insn (gen_mf ()); tmp = gen_reg_rtx (mode); old = gen_reg_rtx (mode); - ccv = gen_rtx_REG (mode, AR_CCV_REGNUM); + /* ar.ccv must always be loaded with a zero-extended DImode value. */ + ccv = gen_rtx_REG (DImode, AR_CCV_REGNUM); emit_move_insn (tmp, mem); label = gen_label_rtx (); emit_label (label); emit_move_insn (old, tmp); - emit_move_insn (ccv, tmp); + convert_move (ccv, tmp, /*unsignedp=*/1); /* Perform the specific operation. Special case NAND by noticing one_cmpl_optab instead. */ @@ -7993,12 +8355,8 @@ ia64_expand_op_and_fetch (binoptab, mode, arglist, target) */ static rtx -ia64_expand_compare_and_swap (rmode, mode, boolp, arglist, target) - enum machine_mode rmode; - enum machine_mode mode; - int boolp; - tree arglist; - rtx target; +ia64_expand_compare_and_swap (enum machine_mode rmode, enum machine_mode mode, + int boolp, tree arglist, rtx target) { tree arg0, arg1, arg2; rtx mem, old, new, ccv, tmp, insn; @@ -8013,6 +8371,11 @@ ia64_expand_compare_and_swap (rmode, mode, boolp, arglist, target) mem = gen_rtx_MEM (mode, force_reg (ptr_mode, mem)); MEM_VOLATILE_P (mem) = 1; + if (GET_MODE (old) != mode) + old = convert_to_mode (mode, old, /*unsignedp=*/1); + if (GET_MODE (new) != mode) + new = convert_to_mode (mode, new, /*unsignedp=*/1); + if (! register_operand (old, mode)) old = copy_to_mode_reg (mode, old); if (! register_operand (new, mode)) @@ -8024,14 +8387,7 @@ ia64_expand_compare_and_swap (rmode, mode, boolp, arglist, target) tmp = gen_reg_rtx (mode); ccv = gen_rtx_REG (DImode, AR_CCV_REGNUM); - if (mode == DImode) - emit_move_insn (ccv, old); - else - { - rtx ccvtmp = gen_reg_rtx (DImode); - emit_insn (gen_zero_extendsidi2 (ccvtmp, old)); - emit_move_insn (ccv, ccvtmp); - } + convert_move (ccv, old, /*unsignedp=*/1); emit_insn (gen_mf ()); if (mode == SImode) insn = gen_cmpxchg_acq_si (tmp, mem, new, ccv); @@ -8052,10 +8408,8 @@ ia64_expand_compare_and_swap (rmode, mode, boolp, arglist, target) /* Expand lock_test_and_set. I.e. `xchgsz ret = [ptr], new'. */ static rtx -ia64_expand_lock_test_and_set (mode, arglist, target) - enum machine_mode mode; - tree arglist; - rtx target; +ia64_expand_lock_test_and_set (enum machine_mode mode, tree arglist, + rtx target) { tree arg0, arg1; rtx mem, new, ret, insn; @@ -8087,10 +8441,8 @@ ia64_expand_lock_test_and_set (mode, arglist, target) /* Expand lock_release. I.e. `stsz.rel [ptr] = r0'. 
*/ static rtx -ia64_expand_lock_release (mode, arglist, target) - enum machine_mode mode; - tree arglist; - rtx target ATTRIBUTE_UNUSED; +ia64_expand_lock_release (enum machine_mode mode, tree arglist, + rtx target ATTRIBUTE_UNUSED) { tree arg0; rtx mem; @@ -8107,12 +8459,9 @@ ia64_expand_lock_release (mode, arglist, target) } rtx -ia64_expand_builtin (exp, target, subtarget, mode, ignore) - tree exp; - rtx target; - rtx subtarget ATTRIBUTE_UNUSED; - enum machine_mode mode ATTRIBUTE_UNUSED; - int ignore ATTRIBUTE_UNUSED; +ia64_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, + enum machine_mode mode ATTRIBUTE_UNUSED, + int ignore ATTRIBUTE_UNUSED) { tree fndecl = TREE_OPERAND (TREE_OPERAND (exp, 0), 0); unsigned int fcode = DECL_FUNCTION_CODE (fndecl); @@ -8203,6 +8552,9 @@ ia64_expand_builtin (exp, target, subtarget, mode, ignore) if (! target || ! register_operand (target, DImode)) target = gen_reg_rtx (DImode); emit_insn (gen_bsp_value (target)); +#ifdef POINTERS_EXTEND_UNSIGNED + target = convert_memory_address (ptr_mode, target); +#endif return target; case IA64_BUILTIN_FLUSHRS: @@ -8268,9 +8620,7 @@ ia64_expand_builtin (exp, target, subtarget, mode, ignore) most significant bits of the stack slot. */ enum direction -ia64_hpux_function_arg_padding (mode, type) - enum machine_mode mode; - tree type; +ia64_hpux_function_arg_padding (enum machine_mode mode, tree type) { /* Exception to normal case for structures/unions/etc. */ @@ -8278,78 +8628,124 @@ ia64_hpux_function_arg_padding (mode, type) && int_size_in_bytes (type) < UNITS_PER_WORD) return upward; - /* This is the standard FUNCTION_ARG_PADDING with !BYTES_BIG_ENDIAN - hardwired to be true. */ - - return((mode == BLKmode - ? (type && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST - && int_size_in_bytes (type) < (PARM_BOUNDARY / BITS_PER_UNIT)) - : GET_MODE_BITSIZE (mode) < PARM_BOUNDARY) - ? downward : upward); + /* Fall back to the default. */ + return DEFAULT_FUNCTION_ARG_PADDING (mode, type); } /* Linked list of all external functions that are to be emitted by GCC. We output the name if and only if TREE_SYMBOL_REFERENCED is set in order to avoid putting out names that are never really used. */ -struct extern_func_list +struct extern_func_list GTY(()) { - struct extern_func_list *next; /* next external */ - char *name; /* name of the external */ -} *extern_func_head = 0; + struct extern_func_list *next; + tree decl; +}; + +static GTY(()) struct extern_func_list *extern_func_head; static void -ia64_hpux_add_extern_decl (name) - const char *name; +ia64_hpux_add_extern_decl (tree decl) { - struct extern_func_list *p; + struct extern_func_list *p = ggc_alloc (sizeof (struct extern_func_list)); - p = (struct extern_func_list *) xmalloc (sizeof (struct extern_func_list)); - p->name = xmalloc (strlen (name) + 1); - strcpy(p->name, name); + p->decl = decl; p->next = extern_func_head; extern_func_head = p; } /* Print out the list of used global functions. */ -void -ia64_hpux_asm_file_end (file) - FILE *file; +static void +ia64_hpux_file_end (void) { - while (extern_func_head) + struct extern_func_list *p; + + for (p = extern_func_head; p; p = p->next) { - const char *real_name; - tree decl; + tree decl = p->decl; + tree id = DECL_ASSEMBLER_NAME (decl); - real_name = (* targetm.strip_name_encoding) (extern_func_head->name); - decl = maybe_get_identifier (real_name); + if (!id) + abort (); - if (!decl - || (! 
TREE_ASM_WRITTEN (decl) && TREE_SYMBOL_REFERENCED (decl))) + if (!TREE_ASM_WRITTEN (decl) && TREE_SYMBOL_REFERENCED (id)) { - if (decl) - TREE_ASM_WRITTEN (decl) = 1; - (*targetm.asm_out.globalize_label) (file, extern_func_head->name); - fprintf (file, "%s", TYPE_ASM_OP); - assemble_name (file, extern_func_head->name); - putc (',', file); - fprintf (file, TYPE_OPERAND_FMT, "function"); - putc ('\n', file); + const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0); + + TREE_ASM_WRITTEN (decl) = 1; + (*targetm.asm_out.globalize_label) (asm_out_file, name); + fputs (TYPE_ASM_OP, asm_out_file); + assemble_name (asm_out_file, name); + fprintf (asm_out_file, "," TYPE_OPERAND_FMT "\n", "function"); } - extern_func_head = extern_func_head->next; } + + extern_func_head = 0; } +/* Rename all the TFmode libfuncs using the HPUX conventions. */ + +static void +ia64_hpux_init_libfuncs (void) +{ + set_optab_libfunc (add_optab, TFmode, "_U_Qfadd"); + set_optab_libfunc (sub_optab, TFmode, "_U_Qfsub"); + set_optab_libfunc (smul_optab, TFmode, "_U_Qfmpy"); + set_optab_libfunc (sdiv_optab, TFmode, "_U_Qfdiv"); + set_optab_libfunc (smin_optab, TFmode, "_U_Qfmin"); + set_optab_libfunc (smax_optab, TFmode, "_U_Qfmax"); + set_optab_libfunc (abs_optab, TFmode, "_U_Qfabs"); + set_optab_libfunc (neg_optab, TFmode, "_U_Qfneg"); + + /* ia64_expand_compare uses this. */ + cmptf_libfunc = init_one_libfunc ("_U_Qfcmp"); + + /* These should never be used. */ + set_optab_libfunc (eq_optab, TFmode, 0); + set_optab_libfunc (ne_optab, TFmode, 0); + set_optab_libfunc (gt_optab, TFmode, 0); + set_optab_libfunc (ge_optab, TFmode, 0); + set_optab_libfunc (lt_optab, TFmode, 0); + set_optab_libfunc (le_optab, TFmode, 0); + + set_conv_libfunc (sext_optab, TFmode, SFmode, "_U_Qfcnvff_sgl_to_quad"); + set_conv_libfunc (sext_optab, TFmode, DFmode, "_U_Qfcnvff_dbl_to_quad"); + set_conv_libfunc (sext_optab, TFmode, XFmode, "_U_Qfcnvff_f80_to_quad"); + set_conv_libfunc (trunc_optab, SFmode, TFmode, "_U_Qfcnvff_quad_to_sgl"); + set_conv_libfunc (trunc_optab, DFmode, TFmode, "_U_Qfcnvff_quad_to_dbl"); + set_conv_libfunc (trunc_optab, XFmode, TFmode, "_U_Qfcnvff_quad_to_f80"); + + set_conv_libfunc (sfix_optab, SImode, TFmode, "_U_Qfcnvfxt_quad_to_sgl"); + set_conv_libfunc (sfix_optab, DImode, TFmode, "_U_Qfcnvfxt_quad_to_dbl"); + set_conv_libfunc (ufix_optab, SImode, TFmode, "_U_Qfcnvfxut_quad_to_sgl"); + set_conv_libfunc (ufix_optab, DImode, TFmode, "_U_Qfcnvfxut_quad_to_dbl"); + + set_conv_libfunc (sfloat_optab, TFmode, SImode, "_U_Qfcnvxf_sgl_to_quad"); + set_conv_libfunc (sfloat_optab, TFmode, DImode, "_U_Qfcnvxf_dbl_to_quad"); +} + +/* Rename the division and modulus functions in VMS. */ + +static void +ia64_vms_init_libfuncs (void) +{ + set_optab_libfunc (sdiv_optab, SImode, "OTS$DIV_I"); + set_optab_libfunc (sdiv_optab, DImode, "OTS$DIV_L"); + set_optab_libfunc (udiv_optab, SImode, "OTS$DIV_UI"); + set_optab_libfunc (udiv_optab, DImode, "OTS$DIV_UL"); + set_optab_libfunc (smod_optab, SImode, "OTS$REM_I"); + set_optab_libfunc (smod_optab, DImode, "OTS$REM_L"); + set_optab_libfunc (umod_optab, SImode, "OTS$REM_UI"); + set_optab_libfunc (umod_optab, DImode, "OTS$REM_UL"); +} /* Switch to the section to which we should output X. The only thing special we do here is to honor small data. 
*/ static void -ia64_select_rtx_section (mode, x, align) - enum machine_mode mode; - rtx x; - unsigned HOST_WIDE_INT align; +ia64_select_rtx_section (enum machine_mode mode, rtx x, + unsigned HOST_WIDE_INT align) { if (GET_MODE_SIZE (mode) > 0 && GET_MODE_SIZE (mode) <= ia64_section_threshold) @@ -8362,27 +8758,20 @@ ia64_select_rtx_section (mode, x, align) Pretend flag_pic is always set. */ static void -ia64_rwreloc_select_section (exp, reloc, align) - tree exp; - int reloc; - unsigned HOST_WIDE_INT align; +ia64_rwreloc_select_section (tree exp, int reloc, unsigned HOST_WIDE_INT align) { default_elf_select_section_1 (exp, reloc, align, true); } static void -ia64_rwreloc_unique_section (decl, reloc) - tree decl; - int reloc; +ia64_rwreloc_unique_section (tree decl, int reloc) { default_unique_section_1 (decl, reloc, true); } static void -ia64_rwreloc_select_rtx_section (mode, x, align) - enum machine_mode mode; - rtx x; - unsigned HOST_WIDE_INT align; +ia64_rwreloc_select_rtx_section (enum machine_mode mode, rtx x, + unsigned HOST_WIDE_INT align) { int save_pic = flag_pic; flag_pic = 1; @@ -8391,32 +8780,50 @@ ia64_rwreloc_select_rtx_section (mode, x, align) } static unsigned int -ia64_rwreloc_section_type_flags (decl, name, reloc) - tree decl; - const char *name; - int reloc; +ia64_rwreloc_section_type_flags (tree decl, const char *name, int reloc) { return default_section_type_flags_1 (decl, name, reloc, true); } +/* Returns true if FNTYPE (a FUNCTION_TYPE or a METHOD_TYPE) returns a + structure type and that the address of that type should be passed + in out0, rather than in r8. */ + +static bool +ia64_struct_retval_addr_is_first_parm_p (tree fntype) +{ + tree ret_type = TREE_TYPE (fntype); + + /* The Itanium C++ ABI requires that out0, rather than r8, be used + as the structure return address parameter, if the return value + type has a non-trivial copy constructor or destructor. It is not + clear if this same convention should be used for other + programming languages. Until G++ 3.4, we incorrectly used r8 for + these return values. */ + return (abi_version_at_least (2) + && ret_type + && TYPE_MODE (ret_type) == BLKmode + && TREE_ADDRESSABLE (ret_type) + && strcmp (lang_hooks.name, "GNU C++") == 0); +} /* Output the assembler code for a thunk function. THUNK_DECL is the declaration for the thunk function itself, FUNCTION is the decl for the target function. DELTA is an immediate constant offset to be - added to THIS. If VCALL_OFFSET is non-zero, the word at + added to THIS. If VCALL_OFFSET is nonzero, the word at *(*this + vcall_offset) should be added to THIS. */ static void -ia64_output_mi_thunk (file, thunk, delta, vcall_offset, function) - FILE *file; - tree thunk ATTRIBUTE_UNUSED; - HOST_WIDE_INT delta; - HOST_WIDE_INT vcall_offset; - tree function; +ia64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED, + HOST_WIDE_INT delta, HOST_WIDE_INT vcall_offset, + tree function) { rtx this, insn, funexp; + unsigned int this_parmno; + unsigned int this_regno; reload_completed = 1; + epilogue_completed = 1; no_new_pseudos = 1; /* Set things up as ia64_expand_prologue might. */ @@ -8427,16 +8834,32 @@ ia64_output_mi_thunk (file, thunk, delta, vcall_offset, function) current_frame_info.n_input_regs = 1; current_frame_info.need_regstk = (TARGET_REG_NAMES != 0); - if (!TARGET_REG_NAMES) - reg_names[IN_REG (0)] = ia64_reg_numbers[0]; - /* Mark the end of the (empty) prologue. 
*/
- emit_note (NULL, NOTE_INSN_PROLOGUE_END);
+ emit_note (NOTE_INSN_PROLOGUE_END);
+
+ /* Figure out whether "this" will be the first parameter (the
+ typical case) or the second parameter (as happens when the
+ virtual function returns certain class objects). */
+ this_parmno
+ = (ia64_struct_retval_addr_is_first_parm_p (TREE_TYPE (thunk))
+ ? 1 : 0);
+ this_regno = IN_REG (this_parmno);
+ if (!TARGET_REG_NAMES)
+ reg_names[this_regno] = ia64_reg_numbers[this_parmno];
- this = gen_rtx_REG (Pmode, IN_REG (0));
+ this = gen_rtx_REG (Pmode, this_regno);
 if (TARGET_ILP32)
- emit_insn (gen_ptr_extend (this,
- gen_rtx_REG (ptr_mode, IN_REG (0))));
+ {
+ rtx tmp = gen_rtx_REG (ptr_mode, this_regno);
+ REG_POINTER (tmp) = 1;
+ if (delta && CONST_OK_FOR_I (delta))
+ {
+ emit_insn (gen_ptr_extend_plus_imm (this, tmp, GEN_INT (delta)));
+ delta = 0;
+ }
+ else
+ emit_insn (gen_ptr_extend (this, tmp));
+ }
 /* Apply the constant offset, if required. */
 if (delta)
@@ -8461,19 +8884,30 @@ ia64_output_mi_thunk (file, thunk, delta, vcall_offset, function)
 if (TARGET_ILP32)
 {
 rtx t = gen_rtx_REG (ptr_mode, 2);
+ REG_POINTER (t) = 1;
 emit_move_insn (t, gen_rtx_MEM (ptr_mode, this));
- emit_insn (gen_ptr_extend (tmp, t));
+ if (CONST_OK_FOR_I (vcall_offset))
+ {
+ emit_insn (gen_ptr_extend_plus_imm (tmp, t,
+ vcall_offset_rtx));
+ vcall_offset = 0;
+ }
+ else
+ emit_insn (gen_ptr_extend (tmp, t));
 }
 else
 emit_move_insn (tmp, gen_rtx_MEM (Pmode, this));
- if (!CONST_OK_FOR_J (vcall_offset))
+ if (vcall_offset)
 {
- rtx tmp2 = gen_rtx_REG (Pmode, next_scratch_gr_reg ());
- emit_move_insn (tmp2, vcall_offset_rtx);
- vcall_offset_rtx = tmp2;
+ if (!CONST_OK_FOR_J (vcall_offset))
+ {
+ rtx tmp2 = gen_rtx_REG (Pmode, next_scratch_gr_reg ());
+ emit_move_insn (tmp2, vcall_offset_rtx);
+ vcall_offset_rtx = tmp2;
+ }
+ emit_insn (gen_adddi3 (tmp, tmp, vcall_offset_rtx));
 }
- emit_insn (gen_adddi3 (tmp, tmp, vcall_offset_rtx));
 if (TARGET_ILP32)
 emit_move_insn (gen_rtx_REG (ptr_mode, 2),
@@ -8498,6 +8932,7 @@ ia64_output_mi_thunk (file, thunk, delta, vcall_offset, function)
 /* Code generation for calls relies on splitting. */
 reload_completed = 1;
+ epilogue_completed = 1;
 try_split (PATTERN (insn), insn, 0);
 emit_barrier ();
@@ -8507,15 +8942,28 @@ ia64_output_mi_thunk (file, thunk, delta, vcall_offset, function)
 instruction scheduling worth while. Note that use_thunk calls
 assemble_start_function and assemble_end_function. */
+ insn_locators_initialize ();
+ emit_all_insn_group_barriers (NULL);
 insn = get_insns ();
- emit_all_insn_group_barriers (NULL, insn);
 shorten_branches (insn);
 final_start_function (insn, file, 1);
 final (insn, file, 1, 0);
 final_end_function ();
 reload_completed = 0;
+ epilogue_completed = 0;
 no_new_pseudos = 0;
 }
+/* Worker function for TARGET_STRUCT_VALUE_RTX. */
+
+static rtx
+ia64_struct_value_rtx (tree fntype,
+ int incoming ATTRIBUTE_UNUSED)
+{
+ if (fntype && ia64_struct_retval_addr_is_first_parm_p (fntype))
+ return NULL_RTX;
+ return gen_rtx_REG (Pmode, GR_REG (8));
+}
+
 #include "gt-ia64.h"
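The thunk code above boils down to pointer arithmetic on the incoming `this' argument: add DELTA, and, when VCALL_OFFSET is nonzero, also add the adjustment stored at *(*this + vcall_offset). A self-contained C model of that adjustment; adjust_this and the fake vtable layout are invented for illustration, not part of the patch:

#include <stdio.h>
#include <stddef.h>

/* Model of the thunk adjustment: THIS += DELTA, then optionally
   THIS += *(*(char **) THIS + VCALL_OFFSET).  */
static char *
adjust_this (char *this_ptr, ptrdiff_t delta, ptrdiff_t vcall_offset)
{
  this_ptr += delta;
  if (vcall_offset != 0)
    {
      char *vtable = *(char **) this_ptr;  /* first word: vtable pointer */
      this_ptr += *(ptrdiff_t *) (vtable + vcall_offset);
    }
  return this_ptr;
}

int
main (void)
{
  static ptrdiff_t vtbl[2] = { 0, -8 };  /* slot 1 holds an extra adjustment */
  struct { char *vptr; char payload[24]; } obj = { (char *) vtbl, "" };
  char *p = adjust_this ((char *) &obj, 0, sizeof (ptrdiff_t));

  printf ("this adjusted by %td bytes\n", p - (char *) &obj);
  return 0;
}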