Diffstat (limited to 'contrib/gcc/config/ia64/ia64.c')
-rw-r--r-- | contrib/gcc/config/ia64/ia64.c | 5224
1 file changed, 2836 insertions, 2388 deletions
diff --git a/contrib/gcc/config/ia64/ia64.c b/contrib/gcc/config/ia64/ia64.c index 12f3204..19c5e92 100644 --- a/contrib/gcc/config/ia64/ia64.c +++ b/contrib/gcc/config/ia64/ia64.c @@ -1,27 +1,30 @@ /* Definitions of target machine for GNU compiler. - Copyright (C) 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc. + Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004 + Free Software Foundation, Inc. Contributed by James E. Wilson <wilson@cygnus.com> and - David Mosberger <davidm@hpl.hp.com>. + David Mosberger <davidm@hpl.hp.com>. -This file is part of GNU CC. +This file is part of GCC. -GNU CC is free software; you can redistribute it and/or modify +GCC is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. -GNU CC is distributed in the hope that it will be useful, +GCC is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with GNU CC; see the file COPYING. If not, write to +along with GCC; see the file COPYING. If not, write to the Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "config.h" #include "system.h" +#include "coretypes.h" +#include "tm.h" #include "rtl.h" #include "tree.h" #include "regs.h" @@ -45,7 +48,9 @@ Boston, MA 02111-1307, USA. */ #include "target.h" #include "target-def.h" #include "tm_p.h" +#include "hashtab.h" #include "langhooks.h" +#include "cfglayout.h" /* This is used for communication between ASM_OUTPUT_LABEL and ASM_OUTPUT_LABELREF. */ @@ -102,6 +107,12 @@ int ia64_tls_size = 22; /* String used with the -mtls-size= option. */ const char *ia64_tls_size_string; +/* Which cpu are we scheduling for. */ +enum processor_type ia64_tune; + +/* String used with the -tune= option. */ +const char *ia64_tune_string; + /* Determines whether we run our final scheduling pass or not. We always avoid the normal second scheduling pass. */ static int ia64_flag_schedule_insns2; @@ -111,6 +122,10 @@ static int ia64_flag_schedule_insns2; unsigned int ia64_section_threshold; +/* The following variable is used by the DFA insn scheduler. The value is + TRUE if we do insn bundling instead of insn scheduling. */ +int bundling_p = 0; + /* Structure to be filled in by ia64_compute_frame_size with register save masks and offsets for the current function. */ @@ -122,7 +137,7 @@ struct ia64_frame_info HOST_WIDE_INT spill_size; /* size of the gr/br/fr spill area. */ HOST_WIDE_INT extra_spill_size; /* size of spill area for others. */ HARD_REG_SET mask; /* mask of saved registers. */ - unsigned int gr_used_mask; /* mask of registers in use as gr spill + unsigned int gr_used_mask; /* mask of registers in use as gr spill registers or long-term scratches. */ int n_spilled; /* number of spilled registers. */ int reg_fp; /* register for fp. */ @@ -144,84 +159,118 @@ struct ia64_frame_info /* Current frame information calculated by ia64_compute_frame_size. 
*/ static struct ia64_frame_info current_frame_info; -static rtx gen_tls_get_addr PARAMS ((void)); -static rtx gen_thread_pointer PARAMS ((void)); -static int find_gr_spill PARAMS ((int)); -static int next_scratch_gr_reg PARAMS ((void)); -static void mark_reg_gr_used_mask PARAMS ((rtx, void *)); -static void ia64_compute_frame_size PARAMS ((HOST_WIDE_INT)); -static void setup_spill_pointers PARAMS ((int, rtx, HOST_WIDE_INT)); -static void finish_spill_pointers PARAMS ((void)); -static rtx spill_restore_mem PARAMS ((rtx, HOST_WIDE_INT)); -static void do_spill PARAMS ((rtx (*)(rtx, rtx, rtx), rtx, HOST_WIDE_INT, rtx)); -static void do_restore PARAMS ((rtx (*)(rtx, rtx, rtx), rtx, HOST_WIDE_INT)); -static rtx gen_movdi_x PARAMS ((rtx, rtx, rtx)); -static rtx gen_fr_spill_x PARAMS ((rtx, rtx, rtx)); -static rtx gen_fr_restore_x PARAMS ((rtx, rtx, rtx)); - -static enum machine_mode hfa_element_mode PARAMS ((tree, int)); -static void fix_range PARAMS ((const char *)); -static struct machine_function * ia64_init_machine_status PARAMS ((void)); -static void emit_insn_group_barriers PARAMS ((FILE *, rtx)); -static void emit_all_insn_group_barriers PARAMS ((FILE *, rtx)); -static void emit_predicate_relation_info PARAMS ((void)); -static bool ia64_in_small_data_p PARAMS ((tree)); -static void ia64_encode_section_info PARAMS ((tree, int)); -static const char *ia64_strip_name_encoding PARAMS ((const char *)); -static void process_epilogue PARAMS ((void)); -static int process_set PARAMS ((FILE *, rtx)); - -static rtx ia64_expand_fetch_and_op PARAMS ((optab, enum machine_mode, - tree, rtx)); -static rtx ia64_expand_op_and_fetch PARAMS ((optab, enum machine_mode, - tree, rtx)); -static rtx ia64_expand_compare_and_swap PARAMS ((enum machine_mode, - enum machine_mode, - int, tree, rtx)); -static rtx ia64_expand_lock_test_and_set PARAMS ((enum machine_mode, - tree, rtx)); -static rtx ia64_expand_lock_release PARAMS ((enum machine_mode, tree, rtx)); -static bool ia64_assemble_integer PARAMS ((rtx, unsigned int, int)); -static void ia64_output_function_prologue PARAMS ((FILE *, HOST_WIDE_INT)); -static void ia64_output_function_epilogue PARAMS ((FILE *, HOST_WIDE_INT)); -static void ia64_output_function_end_prologue PARAMS ((FILE *)); - -static int ia64_issue_rate PARAMS ((void)); -static int ia64_adjust_cost PARAMS ((rtx, rtx, rtx, int)); -static void ia64_sched_init PARAMS ((FILE *, int, int)); -static void ia64_sched_finish PARAMS ((FILE *, int)); -static int ia64_internal_sched_reorder PARAMS ((FILE *, int, rtx *, - int *, int, int)); -static int ia64_sched_reorder PARAMS ((FILE *, int, rtx *, int *, int)); -static int ia64_sched_reorder2 PARAMS ((FILE *, int, rtx *, int *, int)); -static int ia64_variable_issue PARAMS ((FILE *, int, rtx, int)); - -static void ia64_output_mi_thunk PARAMS ((FILE *, tree, HOST_WIDE_INT, - HOST_WIDE_INT, tree)); - -static void ia64_select_rtx_section PARAMS ((enum machine_mode, rtx, - unsigned HOST_WIDE_INT)); -static void ia64_rwreloc_select_section PARAMS ((tree, int, - unsigned HOST_WIDE_INT)) +static int ia64_use_dfa_pipeline_interface (void); +static int ia64_first_cycle_multipass_dfa_lookahead (void); +static void ia64_dependencies_evaluation_hook (rtx, rtx); +static void ia64_init_dfa_pre_cycle_insn (void); +static rtx ia64_dfa_pre_cycle_insn (void); +static int ia64_first_cycle_multipass_dfa_lookahead_guard (rtx); +static int ia64_dfa_new_cycle (FILE *, int, rtx, int, int, int *); +static rtx gen_tls_get_addr (void); +static rtx gen_thread_pointer (void); +static rtx 
ia64_expand_tls_address (enum tls_model, rtx, rtx); +static int find_gr_spill (int); +static int next_scratch_gr_reg (void); +static void mark_reg_gr_used_mask (rtx, void *); +static void ia64_compute_frame_size (HOST_WIDE_INT); +static void setup_spill_pointers (int, rtx, HOST_WIDE_INT); +static void finish_spill_pointers (void); +static rtx spill_restore_mem (rtx, HOST_WIDE_INT); +static void do_spill (rtx (*)(rtx, rtx, rtx), rtx, HOST_WIDE_INT, rtx); +static void do_restore (rtx (*)(rtx, rtx, rtx), rtx, HOST_WIDE_INT); +static rtx gen_movdi_x (rtx, rtx, rtx); +static rtx gen_fr_spill_x (rtx, rtx, rtx); +static rtx gen_fr_restore_x (rtx, rtx, rtx); + +static enum machine_mode hfa_element_mode (tree, int); +static bool ia64_function_ok_for_sibcall (tree, tree); +static bool ia64_rtx_costs (rtx, int, int, int *); +static void fix_range (const char *); +static struct machine_function * ia64_init_machine_status (void); +static void emit_insn_group_barriers (FILE *); +static void emit_all_insn_group_barriers (FILE *); +static void final_emit_insn_group_barriers (FILE *); +static void emit_predicate_relation_info (void); +static void ia64_reorg (void); +static bool ia64_in_small_data_p (tree); +static void process_epilogue (void); +static int process_set (FILE *, rtx); + +static rtx ia64_expand_fetch_and_op (optab, enum machine_mode, tree, rtx); +static rtx ia64_expand_op_and_fetch (optab, enum machine_mode, tree, rtx); +static rtx ia64_expand_compare_and_swap (enum machine_mode, enum machine_mode, + int, tree, rtx); +static rtx ia64_expand_lock_test_and_set (enum machine_mode, tree, rtx); +static rtx ia64_expand_lock_release (enum machine_mode, tree, rtx); +static bool ia64_assemble_integer (rtx, unsigned int, int); +static void ia64_output_function_prologue (FILE *, HOST_WIDE_INT); +static void ia64_output_function_epilogue (FILE *, HOST_WIDE_INT); +static void ia64_output_function_end_prologue (FILE *); + +static int ia64_issue_rate (void); +static int ia64_adjust_cost (rtx, rtx, rtx, int); +static void ia64_sched_init (FILE *, int, int); +static void ia64_sched_finish (FILE *, int); +static int ia64_dfa_sched_reorder (FILE *, int, rtx *, int *, int, int); +static int ia64_sched_reorder (FILE *, int, rtx *, int *, int); +static int ia64_sched_reorder2 (FILE *, int, rtx *, int *, int); +static int ia64_variable_issue (FILE *, int, rtx, int); + +static struct bundle_state *get_free_bundle_state (void); +static void free_bundle_state (struct bundle_state *); +static void initiate_bundle_states (void); +static void finish_bundle_states (void); +static unsigned bundle_state_hash (const void *); +static int bundle_state_eq_p (const void *, const void *); +static int insert_bundle_state (struct bundle_state *); +static void initiate_bundle_state_table (void); +static void finish_bundle_state_table (void); +static int try_issue_nops (struct bundle_state *, int); +static int try_issue_insn (struct bundle_state *, rtx); +static void issue_nops_and_insn (struct bundle_state *, int, rtx, int, int); +static int get_max_pos (state_t); +static int get_template (state_t, int); + +static rtx get_next_important_insn (rtx, rtx); +static void bundling (FILE *, int, rtx, rtx); + +static void ia64_output_mi_thunk (FILE *, tree, HOST_WIDE_INT, + HOST_WIDE_INT, tree); +static void ia64_file_start (void); + +static void ia64_select_rtx_section (enum machine_mode, rtx, + unsigned HOST_WIDE_INT); +static void ia64_rwreloc_select_section (tree, int, unsigned HOST_WIDE_INT) ATTRIBUTE_UNUSED; -static void 
ia64_rwreloc_unique_section PARAMS ((tree, int)) +static void ia64_rwreloc_unique_section (tree, int) ATTRIBUTE_UNUSED; -static void ia64_rwreloc_select_rtx_section PARAMS ((enum machine_mode, rtx, - unsigned HOST_WIDE_INT)) +static void ia64_rwreloc_select_rtx_section (enum machine_mode, rtx, + unsigned HOST_WIDE_INT) ATTRIBUTE_UNUSED; -static unsigned int ia64_rwreloc_section_type_flags - PARAMS ((tree, const char *, int)) +static unsigned int ia64_rwreloc_section_type_flags (tree, const char *, int) ATTRIBUTE_UNUSED; -static void ia64_hpux_add_extern_decl PARAMS ((const char *name)) +static void ia64_hpux_add_extern_decl (tree decl) + ATTRIBUTE_UNUSED; +static void ia64_hpux_file_end (void) + ATTRIBUTE_UNUSED; +static void ia64_hpux_init_libfuncs (void) ATTRIBUTE_UNUSED; +static void ia64_vms_init_libfuncs (void) + ATTRIBUTE_UNUSED; + +static tree ia64_handle_model_attribute (tree *, tree, tree, int, bool *); +static void ia64_encode_section_info (tree, rtx, int); +static rtx ia64_struct_value_rtx (tree, int); + /* Table of valid machine attributes. */ static const struct attribute_spec ia64_attribute_table[] = { /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler } */ { "syscall_linkage", 0, 0, false, true, true, NULL }, - { NULL, 0, 0, false, false, false, NULL } + { "model", 1, 1, true, false, false, ia64_handle_model_attribute }, + { NULL, 0, 0, false, false, false, NULL } }; /* Initialize the GCC target structure. */ @@ -260,10 +309,6 @@ static const struct attribute_spec ia64_attribute_table[] = #undef TARGET_IN_SMALL_DATA_P #define TARGET_IN_SMALL_DATA_P ia64_in_small_data_p -#undef TARGET_ENCODE_SECTION_INFO -#define TARGET_ENCODE_SECTION_INFO ia64_encode_section_info -#undef TARGET_STRIP_NAME_ENCODING -#define TARGET_STRIP_NAME_ENCODING ia64_strip_name_encoding #undef TARGET_SCHED_ADJUST_COST #define TARGET_SCHED_ADJUST_COST ia64_adjust_cost @@ -280,24 +325,58 @@ static const struct attribute_spec ia64_attribute_table[] = #undef TARGET_SCHED_REORDER2 #define TARGET_SCHED_REORDER2 ia64_sched_reorder2 -#ifdef HAVE_AS_TLS -#undef TARGET_HAVE_TLS -#define TARGET_HAVE_TLS true -#endif +#undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK +#define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK ia64_dependencies_evaluation_hook + +#undef TARGET_SCHED_USE_DFA_PIPELINE_INTERFACE +#define TARGET_SCHED_USE_DFA_PIPELINE_INTERFACE ia64_use_dfa_pipeline_interface + +#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD +#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD ia64_first_cycle_multipass_dfa_lookahead + +#undef TARGET_SCHED_INIT_DFA_PRE_CYCLE_INSN +#define TARGET_SCHED_INIT_DFA_PRE_CYCLE_INSN ia64_init_dfa_pre_cycle_insn +#undef TARGET_SCHED_DFA_PRE_CYCLE_INSN +#define TARGET_SCHED_DFA_PRE_CYCLE_INSN ia64_dfa_pre_cycle_insn + +#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD +#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD\ + ia64_first_cycle_multipass_dfa_lookahead_guard + +#undef TARGET_SCHED_DFA_NEW_CYCLE +#define TARGET_SCHED_DFA_NEW_CYCLE ia64_dfa_new_cycle + +#undef TARGET_FUNCTION_OK_FOR_SIBCALL +#define TARGET_FUNCTION_OK_FOR_SIBCALL ia64_function_ok_for_sibcall #undef TARGET_ASM_OUTPUT_MI_THUNK #define TARGET_ASM_OUTPUT_MI_THUNK ia64_output_mi_thunk #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK #define TARGET_ASM_CAN_OUTPUT_MI_THUNK hook_bool_tree_hwi_hwi_tree_true +#undef TARGET_ASM_FILE_START +#define TARGET_ASM_FILE_START ia64_file_start + +#undef TARGET_RTX_COSTS +#define TARGET_RTX_COSTS ia64_rtx_costs +#undef TARGET_ADDRESS_COST 
+#define TARGET_ADDRESS_COST hook_int_rtx_0 + +#undef TARGET_MACHINE_DEPENDENT_REORG +#define TARGET_MACHINE_DEPENDENT_REORG ia64_reorg + +#undef TARGET_ENCODE_SECTION_INFO +#define TARGET_ENCODE_SECTION_INFO ia64_encode_section_info + +#undef TARGET_STRUCT_VALUE_RTX +#define TARGET_STRUCT_VALUE_RTX ia64_struct_value_rtx + struct gcc_target targetm = TARGET_INITIALIZER; /* Return 1 if OP is a valid operand for the MEM of a CALL insn. */ int -call_operand (op, mode) - rtx op; - enum machine_mode mode; +call_operand (rtx op, enum machine_mode mode) { if (mode != GET_MODE (op) && mode != VOIDmode) return 0; @@ -309,9 +388,7 @@ call_operand (op, mode) /* Return 1 if OP refers to a symbol in the sdata section. */ int -sdata_symbolic_operand (op, mode) - rtx op; - enum machine_mode mode ATTRIBUTE_UNUSED; +sdata_symbolic_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) { switch (GET_CODE (op)) { @@ -326,10 +403,7 @@ sdata_symbolic_operand (op, mode) if (CONSTANT_POOL_ADDRESS_P (op)) return GET_MODE_SIZE (get_pool_mode (op)) <= ia64_section_threshold; else - { - const char *str = XSTR (op, 0); - return (str[0] == ENCODE_SECTION_INFO_CHAR && str[1] == 's'); - } + return SYMBOL_REF_LOCAL_P (op) && SYMBOL_REF_SMALL_P (op); default: break; @@ -338,12 +412,16 @@ sdata_symbolic_operand (op, mode) return 0; } +int +small_addr_symbolic_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) +{ + return SYMBOL_REF_SMALL_ADDR_P (op); +} + /* Return 1 if OP refers to a symbol, and is appropriate for a GOT load. */ int -got_symbolic_operand (op, mode) - rtx op; - enum machine_mode mode ATTRIBUTE_UNUSED; +got_symbolic_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) { switch (GET_CODE (op)) { @@ -374,6 +452,8 @@ got_symbolic_operand (op, mode) return (INTVAL (op) & 0x3fff) == 0; case SYMBOL_REF: + if (SYMBOL_REF_SMALL_ADDR_P (op)) + return 0; case LABEL_REF: return 1; @@ -386,9 +466,7 @@ got_symbolic_operand (op, mode) /* Return 1 if OP refers to a symbol. */ int -symbolic_operand (op, mode) - rtx op; - enum machine_mode mode ATTRIBUTE_UNUSED; +symbolic_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) { switch (GET_CODE (op)) { @@ -406,40 +484,20 @@ symbolic_operand (op, mode) /* Return tls_model if OP refers to a TLS symbol. */ int -tls_symbolic_operand (op, mode) - rtx op; - enum machine_mode mode ATTRIBUTE_UNUSED; +tls_symbolic_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) { - const char *str; - if (GET_CODE (op) != SYMBOL_REF) return 0; - str = XSTR (op, 0); - if (str[0] != ENCODE_SECTION_INFO_CHAR) - return 0; - switch (str[1]) - { - case 'G': - return TLS_MODEL_GLOBAL_DYNAMIC; - case 'L': - return TLS_MODEL_LOCAL_DYNAMIC; - case 'i': - return TLS_MODEL_INITIAL_EXEC; - case 'l': - return TLS_MODEL_LOCAL_EXEC; - } - return 0; + return SYMBOL_REF_TLS_MODEL (op); } /* Return 1 if OP refers to a function. */ int -function_operand (op, mode) - rtx op; - enum machine_mode mode ATTRIBUTE_UNUSED; +function_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) { - if (GET_CODE (op) == SYMBOL_REF && SYMBOL_REF_FLAG (op)) + if (GET_CODE (op) == SYMBOL_REF && SYMBOL_REF_FUNCTION_P (op)) return 1; else return 0; @@ -450,9 +508,7 @@ function_operand (op, mode) /* ??? This is an unsatisfying solution. Should rethink. 
*/ int -setjmp_operand (op, mode) - rtx op; - enum machine_mode mode ATTRIBUTE_UNUSED; +setjmp_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) { const char *name; int retval = 0; @@ -495,29 +551,18 @@ setjmp_operand (op, mode) return retval; } -/* Return 1 if OP is a general operand, but when pic exclude symbolic - operands. */ - -/* ??? If we drop no-pic support, can delete SYMBOL_REF, CONST, and LABEL_REF - from PREDICATE_CODES. */ +/* Return 1 if OP is a general operand, excluding tls symbolic operands. */ int -move_operand (op, mode) - rtx op; - enum machine_mode mode; +move_operand (rtx op, enum machine_mode mode) { - if (! TARGET_NO_PIC && symbolic_operand (op, mode)) - return 0; - - return general_operand (op, mode); + return general_operand (op, mode) && !tls_symbolic_operand (op, mode); } /* Return 1 if OP is a register operand that is (or could be) a GR reg. */ int -gr_register_operand (op, mode) - rtx op; - enum machine_mode mode; +gr_register_operand (rtx op, enum machine_mode mode) { if (! register_operand (op, mode)) return 0; @@ -535,9 +580,7 @@ gr_register_operand (op, mode) /* Return 1 if OP is a register operand that is (or could be) an FR reg. */ int -fr_register_operand (op, mode) - rtx op; - enum machine_mode mode; +fr_register_operand (rtx op, enum machine_mode mode) { if (! register_operand (op, mode)) return 0; @@ -555,9 +598,7 @@ fr_register_operand (op, mode) /* Return 1 if OP is a register operand that is (or could be) a GR/FR reg. */ int -grfr_register_operand (op, mode) - rtx op; - enum machine_mode mode; +grfr_register_operand (rtx op, enum machine_mode mode) { if (! register_operand (op, mode)) return 0; @@ -575,9 +616,7 @@ grfr_register_operand (op, mode) /* Return 1 if OP is a nonimmediate operand that is (or could be) a GR reg. */ int -gr_nonimmediate_operand (op, mode) - rtx op; - enum machine_mode mode; +gr_nonimmediate_operand (rtx op, enum machine_mode mode) { if (! nonimmediate_operand (op, mode)) return 0; @@ -595,9 +634,7 @@ gr_nonimmediate_operand (op, mode) /* Return 1 if OP is a nonimmediate operand that is (or could be) a FR reg. */ int -fr_nonimmediate_operand (op, mode) - rtx op; - enum machine_mode mode; +fr_nonimmediate_operand (rtx op, enum machine_mode mode) { if (! nonimmediate_operand (op, mode)) return 0; @@ -615,9 +652,7 @@ fr_nonimmediate_operand (op, mode) /* Return 1 if OP is a nonimmediate operand that is a GR/FR reg. */ int -grfr_nonimmediate_operand (op, mode) - rtx op; - enum machine_mode mode; +grfr_nonimmediate_operand (rtx op, enum machine_mode mode) { if (! nonimmediate_operand (op, mode)) return 0; @@ -635,9 +670,7 @@ grfr_nonimmediate_operand (op, mode) /* Return 1 if OP is a GR register operand, or zero. */ int -gr_reg_or_0_operand (op, mode) - rtx op; - enum machine_mode mode; +gr_reg_or_0_operand (rtx op, enum machine_mode mode) { return (op == const0_rtx || gr_register_operand (op, mode)); } @@ -645,9 +678,7 @@ gr_reg_or_0_operand (op, mode) /* Return 1 if OP is a GR register operand, or a 5 bit immediate operand. */ int -gr_reg_or_5bit_operand (op, mode) - rtx op; - enum machine_mode mode; +gr_reg_or_5bit_operand (rtx op, enum machine_mode mode) { return ((GET_CODE (op) == CONST_INT && INTVAL (op) >= 0 && INTVAL (op) < 32) || GET_CODE (op) == CONSTANT_P_RTX @@ -657,9 +688,7 @@ gr_reg_or_5bit_operand (op, mode) /* Return 1 if OP is a GR register operand, or a 6 bit immediate operand. 
*/ int -gr_reg_or_6bit_operand (op, mode) - rtx op; - enum machine_mode mode; +gr_reg_or_6bit_operand (rtx op, enum machine_mode mode) { return ((GET_CODE (op) == CONST_INT && CONST_OK_FOR_M (INTVAL (op))) || GET_CODE (op) == CONSTANT_P_RTX @@ -669,9 +698,7 @@ gr_reg_or_6bit_operand (op, mode) /* Return 1 if OP is a GR register operand, or an 8 bit immediate operand. */ int -gr_reg_or_8bit_operand (op, mode) - rtx op; - enum machine_mode mode; +gr_reg_or_8bit_operand (rtx op, enum machine_mode mode) { return ((GET_CODE (op) == CONST_INT && CONST_OK_FOR_K (INTVAL (op))) || GET_CODE (op) == CONSTANT_P_RTX @@ -681,9 +708,7 @@ gr_reg_or_8bit_operand (op, mode) /* Return 1 if OP is a GR/FR register operand, or an 8 bit immediate. */ int -grfr_reg_or_8bit_operand (op, mode) - rtx op; - enum machine_mode mode; +grfr_reg_or_8bit_operand (rtx op, enum machine_mode mode) { return ((GET_CODE (op) == CONST_INT && CONST_OK_FOR_K (INTVAL (op))) || GET_CODE (op) == CONSTANT_P_RTX @@ -694,9 +719,7 @@ grfr_reg_or_8bit_operand (op, mode) operand. */ int -gr_reg_or_8bit_adjusted_operand (op, mode) - rtx op; - enum machine_mode mode; +gr_reg_or_8bit_adjusted_operand (rtx op, enum machine_mode mode) { return ((GET_CODE (op) == CONST_INT && CONST_OK_FOR_L (INTVAL (op))) || GET_CODE (op) == CONSTANT_P_RTX @@ -709,9 +732,7 @@ gr_reg_or_8bit_adjusted_operand (op, mode) so we need the union of the immediates accepted by GT and LT. */ int -gr_reg_or_8bit_and_adjusted_operand (op, mode) - rtx op; - enum machine_mode mode; +gr_reg_or_8bit_and_adjusted_operand (rtx op, enum machine_mode mode) { return ((GET_CODE (op) == CONST_INT && CONST_OK_FOR_K (INTVAL (op)) && CONST_OK_FOR_L (INTVAL (op))) @@ -722,9 +743,7 @@ gr_reg_or_8bit_and_adjusted_operand (op, mode) /* Return 1 if OP is a register operand, or a 14 bit immediate operand. */ int -gr_reg_or_14bit_operand (op, mode) - rtx op; - enum machine_mode mode; +gr_reg_or_14bit_operand (rtx op, enum machine_mode mode) { return ((GET_CODE (op) == CONST_INT && CONST_OK_FOR_I (INTVAL (op))) || GET_CODE (op) == CONSTANT_P_RTX @@ -734,9 +753,7 @@ gr_reg_or_14bit_operand (op, mode) /* Return 1 if OP is a register operand, or a 22 bit immediate operand. */ int -gr_reg_or_22bit_operand (op, mode) - rtx op; - enum machine_mode mode; +gr_reg_or_22bit_operand (rtx op, enum machine_mode mode) { return ((GET_CODE (op) == CONST_INT && CONST_OK_FOR_J (INTVAL (op))) || GET_CODE (op) == CONSTANT_P_RTX @@ -746,9 +763,7 @@ gr_reg_or_22bit_operand (op, mode) /* Return 1 if OP is a 6 bit immediate operand. */ int -shift_count_operand (op, mode) - rtx op; - enum machine_mode mode ATTRIBUTE_UNUSED; +shift_count_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) { return ((GET_CODE (op) == CONST_INT && CONST_OK_FOR_M (INTVAL (op))) || GET_CODE (op) == CONSTANT_P_RTX); @@ -757,9 +772,7 @@ shift_count_operand (op, mode) /* Return 1 if OP is a 5 bit immediate operand. */ int -shift_32bit_count_operand (op, mode) - rtx op; - enum machine_mode mode ATTRIBUTE_UNUSED; +shift_32bit_count_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) { return ((GET_CODE (op) == CONST_INT && (INTVAL (op) >= 0 && INTVAL (op) < 32)) @@ -769,9 +782,7 @@ shift_32bit_count_operand (op, mode) /* Return 1 if OP is a 2, 4, 8, or 16 immediate operand. 
*/ int -shladd_operand (op, mode) - rtx op; - enum machine_mode mode ATTRIBUTE_UNUSED; +shladd_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) { return (GET_CODE (op) == CONST_INT && (INTVAL (op) == 2 || INTVAL (op) == 4 @@ -781,9 +792,7 @@ shladd_operand (op, mode) /* Return 1 if OP is a -16, -8, -4, -1, 1, 4, 8, or 16 immediate operand. */ int -fetchadd_operand (op, mode) - rtx op; - enum machine_mode mode ATTRIBUTE_UNUSED; +fetchadd_operand (rtx op, enum machine_mode mode ATTRIBUTE_UNUSED) { return (GET_CODE (op) == CONST_INT && (INTVAL (op) == -16 || INTVAL (op) == -8 || @@ -795,9 +804,7 @@ fetchadd_operand (op, mode) /* Return 1 if OP is a floating-point constant zero, one, or a register. */ int -fr_reg_or_fp01_operand (op, mode) - rtx op; - enum machine_mode mode; +fr_reg_or_fp01_operand (rtx op, enum machine_mode mode) { return ((GET_CODE (op) == CONST_DOUBLE && CONST_DOUBLE_OK_FOR_G (op)) || fr_register_operand (op, mode)); @@ -807,9 +814,7 @@ fr_reg_or_fp01_operand (op, mode) POST_MODIFY with a REG as displacement. */ int -destination_operand (op, mode) - rtx op; - enum machine_mode mode; +destination_operand (rtx op, enum machine_mode mode) { if (! nonimmediate_operand (op, mode)) return 0; @@ -823,21 +828,17 @@ destination_operand (op, mode) /* Like memory_operand, but don't allow post-increments. */ int -not_postinc_memory_operand (op, mode) - rtx op; - enum machine_mode mode; +not_postinc_memory_operand (rtx op, enum machine_mode mode) { return (memory_operand (op, mode) && GET_RTX_CLASS (GET_CODE (XEXP (op, 0))) != 'a'); } -/* Return 1 if this is a comparison operator, which accepts an normal 8-bit +/* Return 1 if this is a comparison operator, which accepts a normal 8-bit signed immediate operand. */ int -normal_comparison_operator (op, mode) - register rtx op; - enum machine_mode mode; +normal_comparison_operator (register rtx op, enum machine_mode mode) { enum rtx_code code = GET_CODE (op); return ((mode == VOIDmode || GET_MODE (op) == mode) @@ -849,9 +850,7 @@ normal_comparison_operator (op, mode) signed immediate operand. */ int -adjusted_comparison_operator (op, mode) - register rtx op; - enum machine_mode mode; +adjusted_comparison_operator (register rtx op, enum machine_mode mode) { enum rtx_code code = GET_CODE (op); return ((mode == VOIDmode || GET_MODE (op) == mode) @@ -861,9 +860,7 @@ adjusted_comparison_operator (op, mode) /* Return 1 if this is a signed inequality operator. */ int -signed_inequality_operator (op, mode) - register rtx op; - enum machine_mode mode; +signed_inequality_operator (register rtx op, enum machine_mode mode) { enum rtx_code code = GET_CODE (op); return ((mode == VOIDmode || GET_MODE (op) == mode) @@ -874,9 +871,7 @@ signed_inequality_operator (op, mode) /* Return 1 if this operator is valid for predication. */ int -predicate_operator (op, mode) - register rtx op; - enum machine_mode mode; +predicate_operator (register rtx op, enum machine_mode mode) { enum rtx_code code = GET_CODE (op); return ((GET_MODE (op) == mode || mode == VOIDmode) @@ -886,9 +881,7 @@ predicate_operator (op, mode) /* Return 1 if this operator can be used in a conditional operation. */ int -condop_operator (op, mode) - register rtx op; - enum machine_mode mode; +condop_operator (register rtx op, enum machine_mode mode) { enum rtx_code code = GET_CODE (op); return ((GET_MODE (op) == mode || mode == VOIDmode) @@ -899,9 +892,7 @@ condop_operator (op, mode) /* Return 1 if this is the ar.lc register. 
*/ int -ar_lc_reg_operand (op, mode) - register rtx op; - enum machine_mode mode; +ar_lc_reg_operand (register rtx op, enum machine_mode mode) { return (GET_MODE (op) == DImode && (mode == DImode || mode == VOIDmode) @@ -912,9 +903,7 @@ ar_lc_reg_operand (op, mode) /* Return 1 if this is the ar.ccv register. */ int -ar_ccv_reg_operand (op, mode) - register rtx op; - enum machine_mode mode; +ar_ccv_reg_operand (register rtx op, enum machine_mode mode) { return ((GET_MODE (op) == mode || mode == VOIDmode) && GET_CODE (op) == REG @@ -924,9 +913,7 @@ ar_ccv_reg_operand (op, mode) /* Return 1 if this is the ar.pfs register. */ int -ar_pfs_reg_operand (op, mode) - register rtx op; - enum machine_mode mode; +ar_pfs_reg_operand (register rtx op, enum machine_mode mode) { return ((GET_MODE (op) == mode || mode == VOIDmode) && GET_CODE (op) == REG @@ -936,9 +923,7 @@ ar_pfs_reg_operand (op, mode) /* Like general_operand, but don't allow (mem (addressof)). */ int -general_tfmode_operand (op, mode) - rtx op; - enum machine_mode mode; +general_xfmode_operand (rtx op, enum machine_mode mode) { if (! general_operand (op, mode)) return 0; @@ -950,9 +935,7 @@ general_tfmode_operand (op, mode) /* Similarly. */ int -destination_tfmode_operand (op, mode) - rtx op; - enum machine_mode mode; +destination_xfmode_operand (rtx op, enum machine_mode mode) { if (! destination_operand (op, mode)) return 0; @@ -964,9 +947,7 @@ destination_tfmode_operand (op, mode) /* Similarly. */ int -tfreg_or_fp01_operand (op, mode) - rtx op; - enum machine_mode mode; +xfreg_or_fp01_operand (rtx op, enum machine_mode mode) { if (GET_CODE (op) == SUBREG) return 0; @@ -976,9 +957,7 @@ tfreg_or_fp01_operand (op, mode) /* Return 1 if OP is valid as a base register in a reg + offset address. */ int -basereg_operand (op, mode) - rtx op; - enum machine_mode mode; +basereg_operand (rtx op, enum machine_mode mode) { /* ??? Should I copy the flag_omit_frame_pointer and cse_not_expected checks from pa.c basereg_operand as well? Seems to be OK without them @@ -988,11 +967,133 @@ basereg_operand (op, mode) REG_POINTER ((GET_CODE (op) == SUBREG) ? SUBREG_REG (op) : op)); } +typedef enum + { + ADDR_AREA_NORMAL, /* normal address area */ + ADDR_AREA_SMALL /* addressable by "addl" (-2MB < addr < 2MB) */ + } +ia64_addr_area; + +static GTY(()) tree small_ident1; +static GTY(()) tree small_ident2; + +static void +init_idents (void) +{ + if (small_ident1 == 0) + { + small_ident1 = get_identifier ("small"); + small_ident2 = get_identifier ("__small__"); + } +} + +/* Retrieve the address area that has been chosen for the given decl. 
*/ + +static ia64_addr_area +ia64_get_addr_area (tree decl) +{ + tree model_attr; + + model_attr = lookup_attribute ("model", DECL_ATTRIBUTES (decl)); + if (model_attr) + { + tree id; + + init_idents (); + id = TREE_VALUE (TREE_VALUE (model_attr)); + if (id == small_ident1 || id == small_ident2) + return ADDR_AREA_SMALL; + } + return ADDR_AREA_NORMAL; +} + +static tree +ia64_handle_model_attribute (tree *node, tree name, tree args, int flags ATTRIBUTE_UNUSED, bool *no_add_attrs) +{ + ia64_addr_area addr_area = ADDR_AREA_NORMAL; + ia64_addr_area area; + tree arg, decl = *node; + + init_idents (); + arg = TREE_VALUE (args); + if (arg == small_ident1 || arg == small_ident2) + { + addr_area = ADDR_AREA_SMALL; + } + else + { + warning ("invalid argument of `%s' attribute", + IDENTIFIER_POINTER (name)); + *no_add_attrs = true; + } + + switch (TREE_CODE (decl)) + { + case VAR_DECL: + if ((DECL_CONTEXT (decl) && TREE_CODE (DECL_CONTEXT (decl)) + == FUNCTION_DECL) + && !TREE_STATIC (decl)) + { + error ("%Jan address area attribute cannot be specified for " + "local variables", decl, decl); + *no_add_attrs = true; + } + area = ia64_get_addr_area (decl); + if (area != ADDR_AREA_NORMAL && addr_area != area) + { + error ("%Jaddress area of '%s' conflicts with previous " + "declaration", decl, decl); + *no_add_attrs = true; + } + break; + + case FUNCTION_DECL: + error ("%Jaddress area attribute cannot be specified for functions", + decl, decl); + *no_add_attrs = true; + break; + + default: + warning ("`%s' attribute ignored", IDENTIFIER_POINTER (name)); + *no_add_attrs = true; + break; + } + + return NULL_TREE; +} + +static void +ia64_encode_addr_area (tree decl, rtx symbol) +{ + int flags; + + flags = SYMBOL_REF_FLAGS (symbol); + switch (ia64_get_addr_area (decl)) + { + case ADDR_AREA_NORMAL: break; + case ADDR_AREA_SMALL: flags |= SYMBOL_FLAG_SMALL_ADDR; break; + default: abort (); + } + SYMBOL_REF_FLAGS (symbol) = flags; +} + +static void +ia64_encode_section_info (tree decl, rtx rtl, int first) +{ + default_encode_section_info (decl, rtl, first); + + /* Careful not to prod global register variables. */ + if (TREE_CODE (decl) == VAR_DECL + && GET_CODE (DECL_RTL (decl)) == MEM + && GET_CODE (XEXP (DECL_RTL (decl), 0)) == SYMBOL_REF + && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))) + ia64_encode_addr_area (decl, XEXP (rtl, 0)); +} + /* Return 1 if the operands of a move are ok. */ int -ia64_move_ok (dst, src) - rtx dst, src; +ia64_move_ok (rtx dst, rtx src) { /* If we're under init_recog_no_volatile, we'll not be able to use memory_operand. So check the code directly and don't worry about @@ -1012,27 +1113,18 @@ ia64_move_ok (dst, src) return GET_CODE (src) == CONST_DOUBLE && CONST_DOUBLE_OK_FOR_G (src); } -/* Return 0 if we are doing C++ code. This optimization fails with - C++ because of GNAT c++/6685. */ - int -addp4_optimize_ok (op1, op2) - rtx op1, op2; +addp4_optimize_ok (rtx op1, rtx op2) { - - if (!strcmp (lang_hooks.name, "GNU C++")) - return 0; - return (basereg_operand (op1, GET_MODE(op1)) != basereg_operand (op2, GET_MODE(op2))); } -/* Check if OP is a mask suitible for use with SHIFT in a dep.z instruction. +/* Check if OP is a mask suitable for use with SHIFT in a dep.z instruction. Return the length of the field, or <= 0 on failure. 
*/ int -ia64_depz_field_mask (rop, rshift) - rtx rop, rshift; +ia64_depz_field_mask (rtx rop, rtx rshift) { unsigned HOST_WIDE_INT op = INTVAL (rop); unsigned HOST_WIDE_INT shift = INTVAL (rshift); @@ -1045,40 +1137,48 @@ ia64_depz_field_mask (rop, rshift) } /* Expand a symbolic constant load. */ -/* ??? Should generalize this, so that we can also support 32 bit pointers. */ void -ia64_expand_load_address (dest, src, scratch) - rtx dest, src, scratch; -{ - rtx temp; - - /* The destination could be a MEM during initial rtl generation, - which isn't a valid destination for the PIC load address patterns. */ - if (! register_operand (dest, DImode)) - if (! scratch || ! register_operand (scratch, DImode)) - temp = gen_reg_rtx (DImode); - else - temp = scratch; - else - temp = dest; - - if (tls_symbolic_operand (src, Pmode)) +ia64_expand_load_address (rtx dest, rtx src) +{ + if (tls_symbolic_operand (src, VOIDmode)) + abort (); + if (GET_CODE (dest) != REG) abort (); - if (TARGET_AUTO_PIC) - emit_insn (gen_load_gprel64 (temp, src)); - else if (GET_CODE (src) == SYMBOL_REF && SYMBOL_REF_FLAG (src)) - emit_insn (gen_load_fptr (temp, src)); - else if ((GET_MODE (src) == Pmode || GET_MODE (src) == ptr_mode) - && sdata_symbolic_operand (src, VOIDmode)) - emit_insn (gen_load_gprel (temp, src)); - else if (GET_CODE (src) == CONST - && GET_CODE (XEXP (src, 0)) == PLUS - && GET_CODE (XEXP (XEXP (src, 0), 1)) == CONST_INT - && (INTVAL (XEXP (XEXP (src, 0), 1)) & 0x1fff) != 0) - { - rtx subtarget = no_new_pseudos ? temp : gen_reg_rtx (DImode); + /* ILP32 mode still loads 64-bits of data from the GOT. This avoids + having to pointer-extend the value afterward. Other forms of address + computation below are also more natural to compute as 64-bit quantities. + If we've been given an SImode destination register, change it. */ + if (GET_MODE (dest) != Pmode) + dest = gen_rtx_REG (Pmode, REGNO (dest)); + + if (GET_CODE (src) == SYMBOL_REF && SYMBOL_REF_SMALL_ADDR_P (src)) + { + emit_insn (gen_rtx_SET (VOIDmode, dest, src)); + return; + } + else if (TARGET_AUTO_PIC) + { + emit_insn (gen_load_gprel64 (dest, src)); + return; + } + else if (GET_CODE (src) == SYMBOL_REF && SYMBOL_REF_FUNCTION_P (src)) + { + emit_insn (gen_load_fptr (dest, src)); + return; + } + else if (sdata_symbolic_operand (src, VOIDmode)) + { + emit_insn (gen_load_gprel (dest, src)); + return; + } + + if (GET_CODE (src) == CONST + && GET_CODE (XEXP (src, 0)) == PLUS + && GET_CODE (XEXP (XEXP (src, 0), 1)) == CONST_INT + && (INTVAL (XEXP (XEXP (src, 0), 1)) & 0x1fff) != 0) + { rtx sym = XEXP (XEXP (src, 0), 0); HOST_WIDE_INT ofs, hi, lo; @@ -1088,49 +1188,34 @@ ia64_expand_load_address (dest, src, scratch) lo = ((ofs & 0x3fff) ^ 0x2000) - 0x2000; hi = ofs - lo; - if (! scratch) - scratch = no_new_pseudos ? subtarget : gen_reg_rtx (DImode); - - emit_insn (gen_load_symptr (subtarget, plus_constant (sym, hi), - scratch)); - emit_insn (gen_adddi3 (temp, subtarget, GEN_INT (lo))); + ia64_expand_load_address (dest, plus_constant (sym, hi)); + emit_insn (gen_adddi3 (dest, dest, GEN_INT (lo))); } else { - rtx insn; - if (! scratch) - scratch = no_new_pseudos ? 
temp : gen_reg_rtx (DImode); + rtx tmp; - insn = emit_insn (gen_load_symptr (temp, src, scratch)); -#ifdef POINTERS_EXTEND_UNSIGNED - if (GET_MODE (temp) != GET_MODE (src)) - src = convert_memory_address (GET_MODE (temp), src); -#endif - REG_NOTES (insn) = gen_rtx_EXPR_LIST (REG_EQUAL, src, REG_NOTES (insn)); - } + tmp = gen_rtx_HIGH (Pmode, src); + tmp = gen_rtx_PLUS (Pmode, tmp, pic_offset_table_rtx); + emit_insn (gen_rtx_SET (VOIDmode, dest, tmp)); - if (temp != dest) - { - if (GET_MODE (dest) != GET_MODE (temp)) - temp = convert_to_mode (GET_MODE (dest), temp, 0); - emit_move_insn (dest, temp); + tmp = gen_rtx_LO_SUM (GET_MODE (dest), dest, src); + emit_insn (gen_rtx_SET (VOIDmode, dest, tmp)); } } static GTY(()) rtx gen_tls_tga; static rtx -gen_tls_get_addr () +gen_tls_get_addr (void) { if (!gen_tls_tga) - { - gen_tls_tga = init_one_libfunc ("__tls_get_addr"); - } + gen_tls_tga = init_one_libfunc ("__tls_get_addr"); return gen_tls_tga; } static GTY(()) rtx thread_pointer_rtx; static rtx -gen_thread_pointer () +gen_thread_pointer (void) { if (!thread_pointer_rtx) { @@ -1140,153 +1225,128 @@ gen_thread_pointer () return thread_pointer_rtx; } -rtx -ia64_expand_move (op0, op1) - rtx op0, op1; +static rtx +ia64_expand_tls_address (enum tls_model tls_kind, rtx op0, rtx op1) { - enum machine_mode mode = GET_MODE (op0); + rtx tga_op1, tga_op2, tga_ret, tga_eqv, tmp, insns; + rtx orig_op0 = op0; - if (!reload_in_progress && !reload_completed && !ia64_move_ok (op0, op1)) - op1 = force_reg (mode, op1); - - if (mode == Pmode || mode == ptr_mode) + switch (tls_kind) { - enum tls_model tls_kind; - if ((tls_kind = tls_symbolic_operand (op1, Pmode))) - { - rtx tga_op1, tga_op2, tga_ret, tga_eqv, tmp, insns; - rtx orig_op0 = op0; + case TLS_MODEL_GLOBAL_DYNAMIC: + start_sequence (); - switch (tls_kind) - { - case TLS_MODEL_GLOBAL_DYNAMIC: - start_sequence (); - - tga_op1 = gen_reg_rtx (Pmode); - emit_insn (gen_load_ltoff_dtpmod (tga_op1, op1)); - tga_op1 = gen_rtx_MEM (Pmode, tga_op1); - RTX_UNCHANGING_P (tga_op1) = 1; - - tga_op2 = gen_reg_rtx (Pmode); - emit_insn (gen_load_ltoff_dtprel (tga_op2, op1)); - tga_op2 = gen_rtx_MEM (Pmode, tga_op2); - RTX_UNCHANGING_P (tga_op2) = 1; - - tga_ret = emit_library_call_value (gen_tls_get_addr (), NULL_RTX, - LCT_CONST, Pmode, 2, tga_op1, - Pmode, tga_op2, Pmode); - - insns = get_insns (); - end_sequence (); - - if (GET_MODE (op0) != Pmode) - op0 = tga_ret; - emit_libcall_block (insns, op0, tga_ret, op1); - break; + tga_op1 = gen_reg_rtx (Pmode); + emit_insn (gen_load_ltoff_dtpmod (tga_op1, op1)); + tga_op1 = gen_rtx_MEM (Pmode, tga_op1); + RTX_UNCHANGING_P (tga_op1) = 1; - case TLS_MODEL_LOCAL_DYNAMIC: - /* ??? This isn't the completely proper way to do local-dynamic - If the call to __tls_get_addr is used only by a single symbol, - then we should (somehow) move the dtprel to the second arg - to avoid the extra add. 
*/ - start_sequence (); + tga_op2 = gen_reg_rtx (Pmode); + emit_insn (gen_load_ltoff_dtprel (tga_op2, op1)); + tga_op2 = gen_rtx_MEM (Pmode, tga_op2); + RTX_UNCHANGING_P (tga_op2) = 1; - tga_op1 = gen_reg_rtx (Pmode); - emit_insn (gen_load_ltoff_dtpmod (tga_op1, op1)); - tga_op1 = gen_rtx_MEM (Pmode, tga_op1); - RTX_UNCHANGING_P (tga_op1) = 1; + tga_ret = emit_library_call_value (gen_tls_get_addr (), NULL_RTX, + LCT_CONST, Pmode, 2, tga_op1, + Pmode, tga_op2, Pmode); - tga_op2 = const0_rtx; + insns = get_insns (); + end_sequence (); - tga_ret = emit_library_call_value (gen_tls_get_addr (), NULL_RTX, - LCT_CONST, Pmode, 2, tga_op1, - Pmode, tga_op2, Pmode); + if (GET_MODE (op0) != Pmode) + op0 = tga_ret; + emit_libcall_block (insns, op0, tga_ret, op1); + break; - insns = get_insns (); - end_sequence (); + case TLS_MODEL_LOCAL_DYNAMIC: + /* ??? This isn't the completely proper way to do local-dynamic + If the call to __tls_get_addr is used only by a single symbol, + then we should (somehow) move the dtprel to the second arg + to avoid the extra add. */ + start_sequence (); - tga_eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), - UNSPEC_LD_BASE); - tmp = gen_reg_rtx (Pmode); - emit_libcall_block (insns, tmp, tga_ret, tga_eqv); + tga_op1 = gen_reg_rtx (Pmode); + emit_insn (gen_load_ltoff_dtpmod (tga_op1, op1)); + tga_op1 = gen_rtx_MEM (Pmode, tga_op1); + RTX_UNCHANGING_P (tga_op1) = 1; - if (!register_operand (op0, Pmode)) - op0 = gen_reg_rtx (Pmode); - if (TARGET_TLS64) - { - emit_insn (gen_load_dtprel (op0, op1)); - emit_insn (gen_adddi3 (op0, tmp, op0)); - } - else - emit_insn (gen_add_dtprel (op0, tmp, op1)); - break; + tga_op2 = const0_rtx; - case TLS_MODEL_INITIAL_EXEC: - tmp = gen_reg_rtx (Pmode); - emit_insn (gen_load_ltoff_tprel (tmp, op1)); - tmp = gen_rtx_MEM (Pmode, tmp); - RTX_UNCHANGING_P (tmp) = 1; - tmp = force_reg (Pmode, tmp); + tga_ret = emit_library_call_value (gen_tls_get_addr (), NULL_RTX, + LCT_CONST, Pmode, 2, tga_op1, + Pmode, tga_op2, Pmode); - if (!register_operand (op0, Pmode)) - op0 = gen_reg_rtx (Pmode); - emit_insn (gen_adddi3 (op0, tmp, gen_thread_pointer ())); - break; + insns = get_insns (); + end_sequence (); - case TLS_MODEL_LOCAL_EXEC: - if (!register_operand (op0, Pmode)) - op0 = gen_reg_rtx (Pmode); - if (TARGET_TLS64) - { - emit_insn (gen_load_tprel (op0, op1)); - emit_insn (gen_adddi3 (op0, gen_thread_pointer (), op0)); - } - else - emit_insn (gen_add_tprel (op0, gen_thread_pointer (), op1)); - break; + tga_eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), + UNSPEC_LD_BASE); + tmp = gen_reg_rtx (Pmode); + emit_libcall_block (insns, tmp, tga_ret, tga_eqv); - default: - abort (); - } + if (!register_operand (op0, Pmode)) + op0 = gen_reg_rtx (Pmode); + if (TARGET_TLS64) + { + emit_insn (gen_load_dtprel (op0, op1)); + emit_insn (gen_adddi3 (op0, tmp, op0)); + } + else + emit_insn (gen_add_dtprel (op0, tmp, op1)); + break; - if (orig_op0 == op0) - return NULL_RTX; - if (GET_MODE (orig_op0) == Pmode) - return op0; - return gen_lowpart (GET_MODE (orig_op0), op0); + case TLS_MODEL_INITIAL_EXEC: + tmp = gen_reg_rtx (Pmode); + emit_insn (gen_load_ltoff_tprel (tmp, op1)); + tmp = gen_rtx_MEM (Pmode, tmp); + RTX_UNCHANGING_P (tmp) = 1; + tmp = force_reg (Pmode, tmp); + + if (!register_operand (op0, Pmode)) + op0 = gen_reg_rtx (Pmode); + emit_insn (gen_adddi3 (op0, tmp, gen_thread_pointer ())); + break; + + case TLS_MODEL_LOCAL_EXEC: + if (!register_operand (op0, Pmode)) + op0 = gen_reg_rtx (Pmode); + if (TARGET_TLS64) + { + emit_insn (gen_load_tprel 
(op0, op1)); + emit_insn (gen_adddi3 (op0, gen_thread_pointer (), op0)); } - else if (!TARGET_NO_PIC && - (symbolic_operand (op1, Pmode) || - symbolic_operand (op1, ptr_mode))) + else + emit_insn (gen_add_tprel (op0, gen_thread_pointer (), op1)); + break; + + default: + abort (); + } + + if (orig_op0 == op0) + return NULL_RTX; + if (GET_MODE (orig_op0) == Pmode) + return op0; + return gen_lowpart (GET_MODE (orig_op0), op0); +} + +rtx +ia64_expand_move (rtx op0, rtx op1) +{ + enum machine_mode mode = GET_MODE (op0); + + if (!reload_in_progress && !reload_completed && !ia64_move_ok (op0, op1)) + op1 = force_reg (mode, op1); + + if ((mode == Pmode || mode == ptr_mode) && symbolic_operand (op1, VOIDmode)) + { + enum tls_model tls_kind; + if ((tls_kind = tls_symbolic_operand (op1, VOIDmode))) + return ia64_expand_tls_address (tls_kind, op0, op1); + + if (!TARGET_NO_PIC && reload_completed) { - /* Before optimization starts, delay committing to any particular - type of PIC address load. If this function gets deferred, we - may acquire information that changes the value of the - sdata_symbolic_operand predicate. - - But don't delay for function pointers. Loading a function address - actually loads the address of the descriptor not the function. - If we represent these as SYMBOL_REFs, then they get cse'd with - calls, and we end up with calls to the descriptor address instead - of calls to the function address. Functions are not candidates - for sdata anyways. - - Don't delay for LABEL_REF because the splitter loses REG_LABEL - notes. Don't delay for pool addresses on general principals; - they'll never become non-local behind our back. */ - - if (rtx_equal_function_value_matters - && GET_CODE (op1) != LABEL_REF - && ! (GET_CODE (op1) == SYMBOL_REF - && (SYMBOL_REF_FLAG (op1) - || CONSTANT_POOL_ADDRESS_P (op1) - || STRING_POOL_ADDRESS_P (op1)))) - if (GET_MODE (op1) == DImode) - emit_insn (gen_movdi_symbolic (op0, op1)); - else - emit_insn (gen_movsi_symbolic (op0, op1)); - else - ia64_expand_load_address (op0, op1, NULL_RTX); + ia64_expand_load_address (op0, op1); return NULL_RTX; } } @@ -1294,102 +1354,272 @@ ia64_expand_move (op0, op1) return op1; } -/* Split a post-reload TImode reference into two DImode components. */ +/* Split a move from OP1 to OP0 conditional on COND. */ -rtx -ia64_split_timode (out, in, scratch) - rtx out[2]; - rtx in, scratch; +void +ia64_emit_cond_move (rtx op0, rtx op1, rtx cond) +{ + rtx insn, first = get_last_insn (); + + emit_move_insn (op0, op1); + + for (insn = get_last_insn (); insn != first; insn = PREV_INSN (insn)) + if (INSN_P (insn)) + PATTERN (insn) = gen_rtx_COND_EXEC (VOIDmode, copy_rtx (cond), + PATTERN (insn)); +} + +/* Split a post-reload TImode or TFmode reference into two DImode + components. This is made extra difficult by the fact that we do + not get any scratch registers to work with, because reload cannot + be prevented from giving us a scratch that overlaps the register + pair involved. So instead, when addressing memory, we tweak the + pointer register up and back down with POST_INCs. Or up and not + back down when we can get away with it. + + REVERSED is true when the loads must be done in reversed order + (high word first) for correctness. DEAD is true when the pointer + dies with the second insn we generate and therefore the second + address must not carry a postmodify. + + May return an insn which is to be emitted after the moves. 
*/ + +static rtx +ia64_split_tmode (rtx out[2], rtx in, bool reversed, bool dead) { + rtx fixup = 0; + switch (GET_CODE (in)) { case REG: - out[0] = gen_rtx_REG (DImode, REGNO (in)); - out[1] = gen_rtx_REG (DImode, REGNO (in) + 1); - return NULL_RTX; + out[reversed] = gen_rtx_REG (DImode, REGNO (in)); + out[!reversed] = gen_rtx_REG (DImode, REGNO (in) + 1); + break; + + case CONST_INT: + case CONST_DOUBLE: + /* Cannot occur reversed. */ + if (reversed) abort (); + + if (GET_MODE (in) != TFmode) + split_double (in, &out[0], &out[1]); + else + /* split_double does not understand how to split a TFmode + quantity into a pair of DImode constants. */ + { + REAL_VALUE_TYPE r; + unsigned HOST_WIDE_INT p[2]; + long l[4]; /* TFmode is 128 bits */ + + REAL_VALUE_FROM_CONST_DOUBLE (r, in); + real_to_target (l, &r, TFmode); + + if (FLOAT_WORDS_BIG_ENDIAN) + { + p[0] = (((unsigned HOST_WIDE_INT) l[0]) << 32) + l[1]; + p[1] = (((unsigned HOST_WIDE_INT) l[2]) << 32) + l[3]; + } + else + { + p[0] = (((unsigned HOST_WIDE_INT) l[3]) << 32) + l[2]; + p[1] = (((unsigned HOST_WIDE_INT) l[1]) << 32) + l[0]; + } + out[0] = GEN_INT (p[0]); + out[1] = GEN_INT (p[1]); + } + break; case MEM: { rtx base = XEXP (in, 0); + rtx offset; switch (GET_CODE (base)) { case REG: - out[0] = adjust_address (in, DImode, 0); - break; - case POST_MODIFY: - base = XEXP (base, 0); - out[0] = adjust_address (in, DImode, 0); + if (!reversed) + { + out[0] = adjust_automodify_address + (in, DImode, gen_rtx_POST_INC (Pmode, base), 0); + out[1] = adjust_automodify_address + (in, DImode, dead ? 0 : gen_rtx_POST_DEC (Pmode, base), 8); + } + else + { + /* Reversal requires a pre-increment, which can only + be done as a separate insn. */ + emit_insn (gen_adddi3 (base, base, GEN_INT (8))); + out[0] = adjust_automodify_address + (in, DImode, gen_rtx_POST_DEC (Pmode, base), 8); + out[1] = adjust_address (in, DImode, 0); + } break; - /* Since we're changing the mode, we need to change to POST_MODIFY - as well to preserve the size of the increment. Either that or - do the update in two steps, but we've already got this scratch - register handy so let's use it. */ case POST_INC: - base = XEXP (base, 0); - out[0] - = change_address (in, DImode, - gen_rtx_POST_MODIFY - (Pmode, base, plus_constant (base, 16))); + if (reversed || dead) abort (); + /* Just do the increment in two steps. */ + out[0] = adjust_automodify_address (in, DImode, 0, 0); + out[1] = adjust_automodify_address (in, DImode, 0, 8); break; + case POST_DEC: + if (reversed || dead) abort (); + /* Add 8, subtract 24. */ base = XEXP (base, 0); - out[0] - = change_address (in, DImode, - gen_rtx_POST_MODIFY - (Pmode, base, plus_constant (base, -16))); + out[0] = adjust_automodify_address + (in, DImode, gen_rtx_POST_INC (Pmode, base), 0); + out[1] = adjust_automodify_address + (in, DImode, + gen_rtx_POST_MODIFY (Pmode, base, plus_constant (base, -24)), + 8); + break; + + case POST_MODIFY: + if (reversed || dead) abort (); + /* Extract and adjust the modification. This case is + trickier than the others, because we might have an + index register, or we might have a combined offset that + doesn't fit a signed 9-bit displacement field. We can + assume the incoming expression is already legitimate. */ + offset = XEXP (base, 1); + base = XEXP (base, 0); + + out[0] = adjust_automodify_address + (in, DImode, gen_rtx_POST_INC (Pmode, base), 0); + + if (GET_CODE (XEXP (offset, 1)) == REG) + { + /* Can't adjust the postmodify to match. Emit the + original, then a separate addition insn. 
*/ + out[1] = adjust_automodify_address (in, DImode, 0, 8); + fixup = gen_adddi3 (base, base, GEN_INT (-8)); + } + else if (GET_CODE (XEXP (offset, 1)) != CONST_INT) + abort (); + else if (INTVAL (XEXP (offset, 1)) < -256 + 8) + { + /* Again the postmodify cannot be made to match, but + in this case it's more efficient to get rid of the + postmodify entirely and fix up with an add insn. */ + out[1] = adjust_automodify_address (in, DImode, base, 8); + fixup = gen_adddi3 (base, base, + GEN_INT (INTVAL (XEXP (offset, 1)) - 8)); + } + else + { + /* Combined offset still fits in the displacement field. + (We cannot overflow it at the high end.) */ + out[1] = adjust_automodify_address + (in, DImode, + gen_rtx_POST_MODIFY (Pmode, base, + gen_rtx_PLUS (Pmode, base, + GEN_INT (INTVAL (XEXP (offset, 1)) - 8))), + 8); + } break; + default: abort (); } - - if (scratch == NULL_RTX) - abort (); - out[1] = change_address (in, DImode, scratch); - return gen_adddi3 (scratch, base, GEN_INT (8)); + break; } - case CONST_INT: - case CONST_DOUBLE: - split_double (in, &out[0], &out[1]); - return NULL_RTX; - default: abort (); } + + return fixup; } -/* ??? Fixing GR->FR TFmode moves during reload is hard. You need to go +/* Split a TImode or TFmode move instruction after reload. + This is used by *movtf_internal and *movti_internal. */ +void +ia64_split_tmode_move (rtx operands[]) +{ + rtx in[2], out[2], insn; + rtx fixup[2]; + bool dead = false; + bool reversed = false; + + /* It is possible for reload to decide to overwrite a pointer with + the value it points to. In that case we have to do the loads in + the appropriate order so that the pointer is not destroyed too + early. Also we must not generate a postmodify for that second + load, or rws_access_regno will abort. */ + if (GET_CODE (operands[1]) == MEM + && reg_overlap_mentioned_p (operands[0], operands[1])) + { + rtx base = XEXP (operands[1], 0); + while (GET_CODE (base) != REG) + base = XEXP (base, 0); + + if (REGNO (base) == REGNO (operands[0])) + reversed = true; + dead = true; + } + /* Another reason to do the moves in reversed order is if the first + element of the target register pair is also the second element of + the source register pair. */ + if (GET_CODE (operands[0]) == REG && GET_CODE (operands[1]) == REG + && REGNO (operands[0]) == REGNO (operands[1]) + 1) + reversed = true; + + fixup[0] = ia64_split_tmode (in, operands[1], reversed, dead); + fixup[1] = ia64_split_tmode (out, operands[0], reversed, dead); + +#define MAYBE_ADD_REG_INC_NOTE(INSN, EXP) \ + if (GET_CODE (EXP) == MEM \ + && (GET_CODE (XEXP (EXP, 0)) == POST_MODIFY \ + || GET_CODE (XEXP (EXP, 0)) == POST_INC \ + || GET_CODE (XEXP (EXP, 0)) == POST_DEC)) \ + REG_NOTES (INSN) = gen_rtx_EXPR_LIST (REG_INC, \ + XEXP (XEXP (EXP, 0), 0), \ + REG_NOTES (INSN)) + + insn = emit_insn (gen_rtx_SET (VOIDmode, out[0], in[0])); + MAYBE_ADD_REG_INC_NOTE (insn, in[0]); + MAYBE_ADD_REG_INC_NOTE (insn, out[0]); + + insn = emit_insn (gen_rtx_SET (VOIDmode, out[1], in[1])); + MAYBE_ADD_REG_INC_NOTE (insn, in[1]); + MAYBE_ADD_REG_INC_NOTE (insn, out[1]); + + if (fixup[0]) + emit_insn (fixup[0]); + if (fixup[1]) + emit_insn (fixup[1]); + +#undef MAYBE_ADD_REG_INC_NOTE +} + +/* ??? Fixing GR->FR XFmode moves during reload is hard. You need to go through memory plus an extra GR scratch register. Except that you can either get the first from SECONDARY_MEMORY_NEEDED or the second from SECONDARY_RELOAD_CLASS, but not both. 
We got into problems in the first place by allowing a construct like - (subreg:TF (reg:TI)), which we got from a union containing a long double. + (subreg:XF (reg:TI)), which we got from a union containing a long double. This solution attempts to prevent this situation from occurring. When we see something like the above, we spill the inner register to memory. */ rtx -spill_tfmode_operand (in, force) - rtx in; - int force; +spill_xfmode_operand (rtx in, int force) { if (GET_CODE (in) == SUBREG && GET_MODE (SUBREG_REG (in)) == TImode && GET_CODE (SUBREG_REG (in)) == REG) { - rtx mem = gen_mem_addressof (SUBREG_REG (in), NULL_TREE, true); - return gen_rtx_MEM (TFmode, copy_to_reg (XEXP (mem, 0))); + rtx mem = gen_mem_addressof (SUBREG_REG (in), NULL_TREE, /*rescan=*/true); + return gen_rtx_MEM (XFmode, copy_to_reg (XEXP (mem, 0))); } else if (force && GET_CODE (in) == REG) { - rtx mem = gen_mem_addressof (in, NULL_TREE, true); - return gen_rtx_MEM (TFmode, copy_to_reg (XEXP (mem, 0))); + rtx mem = gen_mem_addressof (in, NULL_TREE, /*rescan=*/true); + return gen_rtx_MEM (XFmode, copy_to_reg (XEXP (mem, 0))); } else if (GET_CODE (in) == MEM && GET_CODE (XEXP (in, 0)) == ADDRESSOF) - return change_address (in, TFmode, copy_to_reg (XEXP (in, 0))); + return change_address (in, XFmode, copy_to_reg (XEXP (in, 0))); else return in; } @@ -1397,10 +1627,10 @@ spill_tfmode_operand (in, force) /* Emit comparison instruction if necessary, returning the expression that holds the compare result in the proper mode. */ +static GTY(()) rtx cmptf_libfunc; + rtx -ia64_expand_compare (code, mode) - enum rtx_code code; - enum machine_mode mode; +ia64_expand_compare (enum rtx_code code, enum machine_mode mode) { rtx op0 = ia64_compare_op0, op1 = ia64_compare_op1; rtx cmp; @@ -1414,6 +1644,59 @@ ia64_expand_compare (code, mode) else abort (); } + /* HPUX TFmode compare requires a library call to _U_Qfcmp, which takes a + magic number as its third argument, that indicates what to do. + The return value is an integer to be compared against zero. */ + else if (TARGET_HPUX && GET_MODE (op0) == TFmode) + { + enum qfcmp_magic { + QCMP_INV = 1, /* Raise FP_INVALID on SNaN as a side effect. */ + QCMP_UNORD = 2, + QCMP_EQ = 4, + QCMP_LT = 8, + QCMP_GT = 16 + } magic; + enum rtx_code ncode; + rtx ret, insns; + if (GET_MODE (op1) != TFmode) + abort (); + switch (code) + { + /* 1 = equal, 0 = not equal. Equality operators do + not raise FP_INVALID when given an SNaN operand. */ + case EQ: magic = QCMP_EQ; ncode = NE; break; + case NE: magic = QCMP_EQ; ncode = EQ; break; + /* isunordered() from C99. */ + case UNORDERED: magic = QCMP_UNORD; ncode = NE; break; + /* Relational operators raise FP_INVALID when given + an SNaN operand. */ + case LT: magic = QCMP_LT |QCMP_INV; ncode = NE; break; + case LE: magic = QCMP_LT|QCMP_EQ|QCMP_INV; ncode = NE; break; + case GT: magic = QCMP_GT |QCMP_INV; ncode = NE; break; + case GE: magic = QCMP_GT|QCMP_EQ|QCMP_INV; ncode = NE; break; + /* FUTURE: Implement UNEQ, UNLT, UNLE, UNGT, UNGE, LTGT. + Expanders for buneq etc. weuld have to be added to ia64.md + for this to be useful. 
*/ + default: abort (); + } + + start_sequence (); + + ret = emit_library_call_value (cmptf_libfunc, 0, LCT_CONST, DImode, 3, + op0, TFmode, op1, TFmode, + GEN_INT (magic), DImode); + cmp = gen_reg_rtx (BImode); + emit_insn (gen_rtx_SET (VOIDmode, cmp, + gen_rtx_fmt_ee (ncode, BImode, + ret, const0_rtx))); + + insns = get_insns (); + end_sequence (); + + emit_libcall_block (insns, cmp, cmp, + gen_rtx_fmt_ee (code, BImode, op0, op1)); + code = NE; + } else { cmp = gen_reg_rtx (BImode); @@ -1426,16 +1709,15 @@ ia64_expand_compare (code, mode) } /* Emit the appropriate sequence for a call. */ + void -ia64_expand_call (retval, addr, nextarg, sibcall_p) - rtx retval; - rtx addr; - rtx nextarg ATTRIBUTE_UNUSED; - int sibcall_p; +ia64_expand_call (rtx retval, rtx addr, rtx nextarg ATTRIBUTE_UNUSED, + int sibcall_p) { rtx insn, b0; addr = XEXP (addr, 0); + addr = convert_memory_address (DImode, addr); b0 = gen_rtx_REG (DImode, R_BR (0)); /* ??? Should do this for functions known to bind local too. */ @@ -1465,8 +1747,9 @@ ia64_expand_call (retval, addr, nextarg, sibcall_p) if (sibcall_p) use_reg (&CALL_INSN_FUNCTION_USAGE (insn), b0); } + void -ia64_reload_gp () +ia64_reload_gp (void) { rtx tmp; @@ -1506,10 +1789,8 @@ ia64_reload_gp () } void -ia64_split_call (retval, addr, retaddr, scratch_r, scratch_b, - noreturn_p, sibcall_p) - rtx retval, addr, retaddr, scratch_r, scratch_b; - int noreturn_p, sibcall_p; +ia64_split_call (rtx retval, rtx addr, rtx retaddr, rtx scratch_r, + rtx scratch_b, int noreturn_p, int sibcall_p) { rtx insn; bool is_desc = false; @@ -1522,8 +1803,8 @@ ia64_split_call (retval, addr, retaddr, scratch_r, scratch_b, bool addr_dead_p; /* ??? We are currently constrained to *not* use peep2, because - we can legitimiately change the global lifetime of the GP - (in the form of killing where previously live). This is + we can legitimately change the global lifetime of the GP + (in the form of killing where previously live). This is because a call through a descriptor doesn't use the previous value of the GP, while a direct call does, and we do not commit to either form until the split here. @@ -1571,9 +1852,15 @@ ia64_split_call (retval, addr, retaddr, scratch_r, scratch_b, /* Begin the assembly file. */ +static void +ia64_file_start (void) +{ + default_file_start (); + emit_safe_across_calls (); +} + void -emit_safe_across_calls (f) - FILE *f; +emit_safe_across_calls (void) { unsigned int rs, re; int out_state; @@ -1590,19 +1877,19 @@ emit_safe_across_calls (f) continue; if (out_state == 0) { - fputs ("\t.pred.safe_across_calls ", f); + fputs ("\t.pred.safe_across_calls ", asm_out_file); out_state = 1; } else - fputc (',', f); + fputc (',', asm_out_file); if (re == rs + 1) - fprintf (f, "p%u", rs); + fprintf (asm_out_file, "p%u", rs); else - fprintf (f, "p%u-p%u", rs, re - 1); + fprintf (asm_out_file, "p%u-p%u", rs, re - 1); rs = re + 1; } if (out_state) - fputc ('\n', f); + fputc ('\n', asm_out_file); } /* Helper function for ia64_compute_frame_size: find an appropriate general @@ -1611,8 +1898,7 @@ emit_safe_across_calls (f) TRY_LOCALS is true if we should attempt to locate a local regnum. */ static int -find_gr_spill (try_locals) - int try_locals; +find_gr_spill (int try_locals) { int regno; @@ -1661,7 +1947,7 @@ find_gr_spill (try_locals) static int last_scratch_gr_reg; static int -next_scratch_gr_reg () +next_scratch_gr_reg (void) { int i, regno; @@ -1686,9 +1972,7 @@ next_scratch_gr_reg () diddle_return_value. Mark REG in current_frame_info.gr_used_mask. 
*/ static void -mark_reg_gr_used_mask (reg, data) - rtx reg; - void *data ATTRIBUTE_UNUSED; +mark_reg_gr_used_mask (rtx reg, void *data ATTRIBUTE_UNUSED) { unsigned int regno = REGNO (reg); if (regno < 32) @@ -1704,8 +1988,7 @@ mark_reg_gr_used_mask (reg, data) needed for local variables. */ static void -ia64_compute_frame_size (size) - HOST_WIDE_INT size; +ia64_compute_frame_size (HOST_WIDE_INT size) { HOST_WIDE_INT total_size; HOST_WIDE_INT spill_size = 0; @@ -1767,7 +2050,7 @@ ia64_compute_frame_size (size) i = regno - OUT_REG (0) + 1; /* When -p profiling, we need one output register for the mcount argument. - Likwise for -a profiling for the bb_init_func argument. For -ax + Likewise for -a profiling for the bb_init_func argument. For -ax profiling, we need two output registers for the two bb_init_trace_func arguments. */ if (current_function_profile) @@ -1778,7 +2061,7 @@ ia64_compute_frame_size (size) current_frame_info.n_rotate_regs = 0; /* Discover which registers need spilling, and how much room that - will take. Begin with floating point and general registers, + will take. Begin with floating point and general registers, which will always wind up on the stack. */ for (regno = FR_REG (2); regno <= FR_REG (127); regno++) @@ -1809,7 +2092,7 @@ ia64_compute_frame_size (size) /* Now come all special registers that might get saved in other general registers. */ - + if (frame_pointer_needed) { current_frame_info.reg_fp = find_gr_spill (1); @@ -1971,8 +2254,7 @@ ia64_compute_frame_size (size) /* Compute the initial difference between the specified pair of registers. */ HOST_WIDE_INT -ia64_initial_elimination_offset (from, to) - int from, to; +ia64_initial_elimination_offset (int from, int to) { HOST_WIDE_INT offset; @@ -2044,10 +2326,7 @@ struct spill_fill_data static struct spill_fill_data spill_fill_data; static void -setup_spill_pointers (n_spills, init_reg, cfa_off) - int n_spills; - rtx init_reg; - HOST_WIDE_INT cfa_off; +setup_spill_pointers (int n_spills, rtx init_reg, HOST_WIDE_INT cfa_off) { int i; @@ -2073,15 +2352,13 @@ setup_spill_pointers (n_spills, init_reg, cfa_off) } static void -finish_spill_pointers () +finish_spill_pointers (void) { current_frame_info.gr_used_mask = spill_fill_data.save_gr_used_mask; } static rtx -spill_restore_mem (reg, cfa_off) - rtx reg; - HOST_WIDE_INT cfa_off; +spill_restore_mem (rtx reg, HOST_WIDE_INT cfa_off) { int iter = spill_fill_data.next_iter; HOST_WIDE_INT disp = spill_fill_data.prev_off[iter] - cfa_off; @@ -2194,10 +2471,8 @@ spill_restore_mem (reg, cfa_off) } static void -do_spill (move_fn, reg, cfa_off, frame_reg) - rtx (*move_fn) PARAMS ((rtx, rtx, rtx)); - rtx reg, frame_reg; - HOST_WIDE_INT cfa_off; +do_spill (rtx (*move_fn) (rtx, rtx, rtx), rtx reg, HOST_WIDE_INT cfa_off, + rtx frame_reg) { int iter = spill_fill_data.next_iter; rtx mem, insn; @@ -2213,7 +2488,7 @@ do_spill (move_fn, reg, cfa_off, frame_reg) RTX_FRAME_RELATED_P (insn) = 1; - /* Don't even pretend that the unwind code can intuit its way + /* Don't even pretend that the unwind code can intuit its way through a pair of interleaved post_modify iterators. Just provide the correct answer. 
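
ia64_compute_frame_size, shown in this hunk, accumulates the local, spill, and extra-spill areas and rounds the total to the 16-byte stack alignment. A sketch of just that arithmetic, assuming IA64_STACK_ALIGN rounds up to 16 bytes (the real function also assigns spill registers):

#include <stdio.h>

static long compute_frame_size (long locals, long spill, long extra_spill,
                                long outgoing_args)
{
  long total = locals + spill + extra_spill + outgoing_args;
  return (total + 15) & -16;   /* round up to 16-byte alignment */
}

int main (void)
{
  printf ("%ld\n", compute_frame_size (40, 24, 8, 16));   /* prints 96 */
  return 0;
}
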
*/ @@ -2239,10 +2514,7 @@ do_spill (move_fn, reg, cfa_off, frame_reg) } static void -do_restore (move_fn, reg, cfa_off) - rtx (*move_fn) PARAMS ((rtx, rtx, rtx)); - rtx reg; - HOST_WIDE_INT cfa_off; +do_restore (rtx (*move_fn) (rtx, rtx, rtx), rtx reg, HOST_WIDE_INT cfa_off) { int iter = spill_fill_data.next_iter; rtx insn; @@ -2254,28 +2526,22 @@ do_restore (move_fn, reg, cfa_off) /* Wrapper functions that discards the CONST_INT spill offset. These exist so that we can give gr_spill/gr_fill the offset they need and - use a consistant function interface. */ + use a consistent function interface. */ static rtx -gen_movdi_x (dest, src, offset) - rtx dest, src; - rtx offset ATTRIBUTE_UNUSED; +gen_movdi_x (rtx dest, rtx src, rtx offset ATTRIBUTE_UNUSED) { return gen_movdi (dest, src); } static rtx -gen_fr_spill_x (dest, src, offset) - rtx dest, src; - rtx offset ATTRIBUTE_UNUSED; +gen_fr_spill_x (rtx dest, rtx src, rtx offset ATTRIBUTE_UNUSED) { return gen_fr_spill (dest, src); } static rtx -gen_fr_restore_x (dest, src, offset) - rtx dest, src; - rtx offset ATTRIBUTE_UNUSED; +gen_fr_restore_x (rtx dest, rtx src, rtx offset ATTRIBUTE_UNUSED) { return gen_fr_restore (dest, src); } @@ -2303,7 +2569,7 @@ gen_fr_restore_x (dest, src, offset) adds instruction. */ void -ia64_expand_prologue () +ia64_expand_prologue (void) { rtx insn, ar_pfs_save_reg, ar_unat_save_reg; int i, epilogue_p, regno, alt_regno, cfa_off, n_varargs; @@ -2382,7 +2648,7 @@ ia64_expand_prologue () regno = next_scratch_gr_reg (); ar_pfs_save_reg = gen_rtx_REG (DImode, regno); - insn = emit_insn (gen_alloc (ar_pfs_save_reg, + insn = emit_insn (gen_alloc (ar_pfs_save_reg, GEN_INT (current_frame_info.n_input_regs), GEN_INT (current_frame_info.n_local_regs), GEN_INT (current_frame_info.n_output_regs), @@ -2412,7 +2678,7 @@ ia64_expand_prologue () else { regno = next_scratch_gr_reg (); - offset = gen_rtx_REG (DImode, regno); + offset = gen_rtx_REG (DImode, regno); emit_move_insn (offset, frame_size_rtx); } @@ -2630,7 +2896,7 @@ ia64_expand_prologue () { if (cfa_off & 15) abort (); - reg = gen_rtx_REG (TFmode, regno); + reg = gen_rtx_REG (XFmode, regno); do_spill (gen_fr_spill_x, reg, cfa_off, reg); cfa_off -= 16; } @@ -2649,8 +2915,7 @@ ia64_expand_prologue () insn to prevent such scheduling. */ void -ia64_expand_epilogue (sibcall_p) - int sibcall_p; +ia64_expand_epilogue (int sibcall_p) { rtx insn, reg, alt_reg, ar_unat_save_reg; int regno, alt_regno, cfa_off; @@ -2664,7 +2929,7 @@ ia64_expand_epilogue (sibcall_p) setup_spill_pointers (current_frame_info.n_spilled, hard_frame_pointer_rtx, 0); else - setup_spill_pointers (current_frame_info.n_spilled, stack_pointer_rtx, + setup_spill_pointers (current_frame_info.n_spilled, stack_pointer_rtx, current_frame_info.total_size); if (current_frame_info.total_size != 0) @@ -2716,7 +2981,7 @@ ia64_expand_epilogue (sibcall_p) } else ar_unat_save_reg = NULL_RTX; - + if (current_frame_info.reg_save_ar_pfs != 0) { alt_reg = gen_rtx_REG (DImode, current_frame_info.reg_save_ar_pfs); @@ -2766,7 +3031,7 @@ ia64_expand_epilogue (sibcall_p) do_restore (gen_gr_restore, reg, cfa_off); cfa_off -= 8; } - + /* Restore the branch registers. Handle B0 specially, as it may have gotten stored in some GR register. 
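
The gen_movdi_x, gen_fr_spill_x, and gen_fr_restore_x wrappers above exist only so that do_spill and do_restore can call through a single rtx (*)(rtx, rtx, rtx) function pointer; a generator that does not need the offset simply discards it. The same adapter pattern in miniature, with all names invented:

#include <stdio.h>

typedef int (*op_fn) (int a, int b, int offset);

static int add3 (int a, int b, int offset) { return a + b + offset; }

static int add2_x (int a, int b, int offset)
{
  (void) offset;               /* discarded, like gen_movdi_x's offset */
  return a + b;
}

static int apply (op_fn fn, int a, int b, int offset)
{
  return fn (a, b, offset);    /* one consistent interface for callers */
}

int main (void)
{
  printf ("%d %d\n", apply (add3, 1, 2, 10), apply (add2_x, 1, 2, 10));
  return 0;
}
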
*/ if (TEST_HARD_REG_BIT (current_frame_info.mask, BR_REG (0))) @@ -2783,7 +3048,7 @@ ia64_expand_epilogue (sibcall_p) reg = gen_rtx_REG (DImode, BR_REG (0)); emit_move_insn (reg, alt_reg); } - + for (regno = BR_REG (1); regno <= BR_REG (7); ++regno) if (TEST_HARD_REG_BIT (current_frame_info.mask, regno)) { @@ -2801,7 +3066,7 @@ ia64_expand_epilogue (sibcall_p) { if (cfa_off & 15) abort (); - reg = gen_rtx_REG (TFmode, regno); + reg = gen_rtx_REG (XFmode, regno); do_restore (gen_fr_restore_x, reg, cfa_off); cfa_off -= 16; } @@ -2867,17 +3132,17 @@ ia64_expand_epilogue (sibcall_p) if (cfun->machine->ia64_eh_epilogue_bsp) emit_insn (gen_set_bsp (cfun->machine->ia64_eh_epilogue_bsp)); - + if (! sibcall_p) emit_jump_insn (gen_return_internal (gen_rtx_REG (DImode, BR_REG (0)))); else { int fp = GR_REG (2); /* We need a throw away register here, r0 and r1 are reserved, so r2 is the - first available call clobbered register. If there was a frame_pointer - register, we may have swapped the names of r2 and HARD_FRAME_POINTER_REGNUM, + first available call clobbered register. If there was a frame_pointer + register, we may have swapped the names of r2 and HARD_FRAME_POINTER_REGNUM, so we have to make sure we're using the string "r2" when emitting - the register name for the assmbler. */ + the register name for the assembler. */ if (current_frame_info.reg_fp && current_frame_info.reg_fp == GR_REG (2)) fp = HARD_FRAME_POINTER_REGNUM; @@ -2900,7 +3165,7 @@ ia64_expand_epilogue (sibcall_p) function. */ int -ia64_direct_return () +ia64_direct_return (void) { if (reload_completed && ! frame_pointer_needed) { @@ -2921,9 +3186,7 @@ ia64_direct_return () during early compilation. */ rtx -ia64_return_addr_rtx (count, frame) - HOST_WIDE_INT count; - rtx frame ATTRIBUTE_UNUSED; +ia64_return_addr_rtx (HOST_WIDE_INT count, rtx frame ATTRIBUTE_UNUSED) { if (count != 0) return NULL; @@ -2934,8 +3197,7 @@ ia64_return_addr_rtx (count, frame) address is saved. */ void -ia64_split_return_addr_rtx (dest) - rtx dest; +ia64_split_return_addr_rtx (rtx dest) { rtx src; @@ -2984,9 +3246,7 @@ ia64_split_return_addr_rtx (dest) } int -ia64_hard_regno_rename_ok (from, to) - int from; - int to; +ia64_hard_regno_rename_ok (int from, int to) { /* Don't clobber any of the registers we reserved for the prologue. */ if (to == current_frame_info.reg_fp @@ -3020,18 +3280,15 @@ ia64_hard_regno_rename_ok (from, to) aligned objects and detect the cases when @fptr is needed. */ static bool -ia64_assemble_integer (x, size, aligned_p) - rtx x; - unsigned int size; - int aligned_p; +ia64_assemble_integer (rtx x, unsigned int size, int aligned_p) { - if (size == (TARGET_ILP32 ? 4 : 8) + if (size == POINTER_SIZE / BITS_PER_UNIT && aligned_p && !(TARGET_NO_PIC || TARGET_AUTO_PIC) && GET_CODE (x) == SYMBOL_REF - && SYMBOL_REF_FLAG (x)) + && SYMBOL_REF_FUNCTION_P (x)) { - if (TARGET_ILP32) + if (POINTER_SIZE == 32) fputs ("\tdata4\t@fptr(", asm_out_file); else fputs ("\tdata8\t@fptr(", asm_out_file); @@ -3045,9 +3302,7 @@ ia64_assemble_integer (x, size, aligned_p) /* Emit the function prologue. 
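
The ia64_assemble_integer change just above keys the directive off POINTER_SIZE rather than TARGET_ILP32, emitting a data4 or data8 @fptr relocation for function symbols. Reduced to its output decision, with a hypothetical emitter standing in for the asm_out_file machinery:

#include <stdio.h>

/* Emit a function-pointer reference through @fptr at the current
   pointer width; SYM and the printing are illustrative.  */
static void emit_fptr (const char *sym, int pointer_size_bits)
{
  fputs (pointer_size_bits == 32 ? "\tdata4\t@fptr(" : "\tdata8\t@fptr(",
         stdout);
  printf ("%s)\n", sym);
}

int main (void)
{
  emit_fptr ("my_function", 64);
  return 0;
}
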
*/ static void -ia64_output_function_prologue (file, size) - FILE *file; - HOST_WIDE_INT size ATTRIBUTE_UNUSED; +ia64_output_function_prologue (FILE *file, HOST_WIDE_INT size ATTRIBUTE_UNUSED) { int mask, grsave, grsave_prev; @@ -3097,7 +3352,7 @@ ia64_output_function_prologue (file, size) grsave = current_frame_info.reg_save_pr; } - if (mask) + if (mask && TARGET_GNU_AS) fprintf (file, "\t.prologue %d, %d\n", mask, ia64_dbx_register_number (grsave)); else @@ -3114,8 +3369,7 @@ ia64_output_function_prologue (file, size) /* Emit the .body directive at the scheduled end of the prologue. */ static void -ia64_output_function_end_prologue (file) - FILE *file; +ia64_output_function_end_prologue (FILE *file) { if (!flag_unwind_tables && (!flag_exceptions || USING_SJLJ_EXCEPTIONS)) return; @@ -3126,9 +3380,8 @@ ia64_output_function_end_prologue (file) /* Emit the function epilogue. */ static void -ia64_output_function_epilogue (file, size) - FILE *file ATTRIBUTE_UNUSED; - HOST_WIDE_INT size ATTRIBUTE_UNUSED; +ia64_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED, + HOST_WIDE_INT size ATTRIBUTE_UNUSED) { int i; @@ -3153,8 +3406,7 @@ ia64_output_function_epilogue (file, size) } int -ia64_dbx_register_number (regno) - int regno; +ia64_dbx_register_number (int regno) { /* In ia64_expand_prologue we quite literally renamed the frame pointer from its home at loc79 to something inside the register frame. We @@ -3179,11 +3431,24 @@ ia64_dbx_register_number (regno) } void -ia64_initialize_trampoline (addr, fnaddr, static_chain) - rtx addr, fnaddr, static_chain; +ia64_initialize_trampoline (rtx addr, rtx fnaddr, rtx static_chain) { rtx addr_reg, eight = GEN_INT (8); + /* The Intel assembler requires that the global __ia64_trampoline symbol + be declared explicitly */ + if (!TARGET_GNU_AS) + { + static bool declared_ia64_trampoline = false; + + if (!declared_ia64_trampoline) + { + declared_ia64_trampoline = true; + (*targetm.asm_out.globalize_label) (asm_out_file, + "__ia64_trampoline"); + } + } + /* Load up our iterator. */ addr_reg = gen_reg_rtx (Pmode); emit_move_insn (addr_reg, addr); @@ -3212,12 +3477,9 @@ ia64_initialize_trampoline (addr, fnaddr, static_chain) We generate the actual spill instructions during prologue generation. */ void -ia64_setup_incoming_varargs (cum, int_mode, type, pretend_size, second_time) - CUMULATIVE_ARGS cum; - int int_mode; - tree type; - int * pretend_size; - int second_time ATTRIBUTE_UNUSED; +ia64_setup_incoming_varargs (CUMULATIVE_ARGS cum, int int_mode, tree type, + int * pretend_size, + int second_time ATTRIBUTE_UNUSED) { /* Skip the current argument. */ ia64_function_arg_advance (&cum, int_mode, type, 1); @@ -3239,9 +3501,7 @@ ia64_setup_incoming_varargs (cum, int_mode, type, pretend_size, second_time) SFmode). 128-bit quad-precision floats are excluded. */ static enum machine_mode -hfa_element_mode (type, nested) - tree type; - int nested; +hfa_element_mode (tree type, int nested) { enum machine_mode element_mode = VOIDmode; enum machine_mode mode; @@ -3263,16 +3523,15 @@ hfa_element_mode (type, nested) types though. 
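
The trampoline hunk above globalizes the (real) __ia64_trampoline symbol exactly once when the Intel assembler is in use, guarded by a function-local static flag. The emit-once pattern in isolation, with printf standing in for targetm.asm_out.globalize_label:

#include <stdio.h>

static void declare_trampoline_once (void)
{
  static int declared = 0;
  if (!declared)
    {
      declared = 1;
      printf ("\t.global __ia64_trampoline\n");
    }
}

int main (void)
{
  declare_trampoline_once ();
  declare_trampoline_once ();  /* second call emits nothing */
  return 0;
}
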
*/ case COMPLEX_TYPE: if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_COMPLEX_FLOAT - && (TYPE_MODE (type) != TCmode || INTEL_EXTENDED_IEEE_FORMAT)) - return mode_for_size (GET_MODE_UNIT_SIZE (TYPE_MODE (type)) - * BITS_PER_UNIT, MODE_FLOAT, 0); + && TYPE_MODE (type) != TCmode) + return GET_MODE_INNER (TYPE_MODE (type)); else return VOIDmode; case REAL_TYPE: /* We want to return VOIDmode for raw REAL_TYPEs, but the actual mode if this is contained within an aggregate. */ - if (nested && (TYPE_MODE (type) != TFmode || INTEL_EXTENDED_IEEE_FORMAT)) + if (nested && TYPE_MODE (type) != TFmode) return TYPE_MODE (type); else return VOIDmode; @@ -3315,40 +3574,62 @@ hfa_element_mode (type, nested) return VOIDmode; } +/* Return the number of words required to hold a quantity of TYPE and MODE + when passed as an argument. */ +static int +ia64_function_arg_words (tree type, enum machine_mode mode) +{ + int words; + + if (mode == BLKmode) + words = int_size_in_bytes (type); + else + words = GET_MODE_SIZE (mode); + + return (words + UNITS_PER_WORD - 1) / UNITS_PER_WORD; /* round up */ +} + +/* Return the number of registers that should be skipped so the current + argument (described by TYPE and WORDS) will be properly aligned. + + Integer and float arguments larger than 8 bytes start at the next + even boundary. Aggregates larger than 8 bytes start at the next + even boundary if the aggregate has 16 byte alignment. Note that + in the 32-bit ABI, TImode and TFmode have only 8-byte alignment + but are still to be aligned in registers. + + ??? The ABI does not specify how to handle aggregates with + alignment from 9 to 15 bytes, or greater than 16. We handle them + all as if they had 16 byte alignment. Such aggregates can occur + only if gcc extensions are used. */ +static int +ia64_function_arg_offset (CUMULATIVE_ARGS *cum, tree type, int words) +{ + if ((cum->words & 1) == 0) + return 0; + + if (type + && TREE_CODE (type) != INTEGER_TYPE + && TREE_CODE (type) != REAL_TYPE) + return TYPE_ALIGN (type) > 8 * BITS_PER_UNIT; + else + return words > 1; +} + /* Return rtx for register where argument is passed, or zero if it is passed on the stack. */ - /* ??? 128-bit quad-precision floats are always passed in general registers. */ rtx -ia64_function_arg (cum, mode, type, named, incoming) - CUMULATIVE_ARGS *cum; - enum machine_mode mode; - tree type; - int named; - int incoming; +ia64_function_arg (CUMULATIVE_ARGS *cum, enum machine_mode mode, tree type, + int named, int incoming) { int basereg = (incoming ? GR_ARG_FIRST : AR_ARG_FIRST); - int words = (((mode == BLKmode ? int_size_in_bytes (type) - : GET_MODE_SIZE (mode)) + UNITS_PER_WORD - 1) - / UNITS_PER_WORD); - int offset = 0; + int words = ia64_function_arg_words (type, mode); + int offset = ia64_function_arg_offset (cum, type, words); enum machine_mode hfa_mode = VOIDmode; - /* Integer and float arguments larger than 8 bytes start at the next even - boundary. Aggregates larger than 8 bytes start at the next even boundary - if the aggregate has 16 byte alignment. Net effect is that types with - alignment greater than 8 start at the next even boundary. */ - /* ??? The ABI does not specify how to handle aggregates with alignment from - 9 to 15 bytes, or greater than 16. We handle them all as if they had - 16 byte alignment. Such aggregates can occur only if gcc extensions are - used. */ - if ((type ? (TYPE_ALIGN (type) > 8 * BITS_PER_UNIT) - : (words > 1)) - && (cum->words & 1)) - offset = 1; - /* If all argument slots are used, then it must go on the stack. 
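
The two new helpers above factor out arithmetic that was previously duplicated across ia64_function_arg, ia64_function_arg_partial_nregs, and ia64_function_arg_advance. A standalone rendering of that arithmetic for scalar (non-aggregate) arguments, with the aggregate-alignment branch omitted:

#include <stdio.h>

#define UNITS_PER_WORD 8

/* words = ceiling of the byte size over 8-byte words.  */
static int arg_words (int size_in_bytes)
{
  return (size_in_bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
}

/* One pad slot when a multi-word scalar would start in an odd slot.  */
static int arg_offset (int slots_used, int words)
{
  if ((slots_used & 1) == 0)
    return 0;                  /* already on an even slot boundary */
  return words > 1;
}

int main (void)
{
  int words = arg_words (16);  /* a 16-byte scalar occupies 2 words */
  printf ("words=%d pad=%d\n", words, arg_offset (3, words));  /* 2 1 */
  return 0;
}
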
*/ if (cum->words + offset >= MAX_ARGUMENT_SLOTS) return 0; @@ -3408,6 +3689,7 @@ ia64_function_arg (cum, mode, type, named, incoming) for (; offset < byte_size && int_regs < MAX_ARGUMENT_SLOTS; i++) { enum machine_mode gr_mode = DImode; + unsigned int gr_size; /* If we have an odd 4 byte hunk because we ran out of FR regs, then this goes in a GR reg left adjusted/little endian, right @@ -3421,22 +3703,25 @@ ia64_function_arg (cum, mode, type, named, incoming) adjusted/little endian. */ else if (byte_size - offset == 4) gr_mode = SImode; - /* Complex floats need to have float mode. */ - if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT) - gr_mode = hfa_mode; loc[i] = gen_rtx_EXPR_LIST (VOIDmode, gen_rtx_REG (gr_mode, (basereg + int_regs)), GEN_INT (offset)); - offset += GET_MODE_SIZE (gr_mode); - int_regs += GET_MODE_SIZE (gr_mode) <= UNITS_PER_WORD - ? 1 : GET_MODE_SIZE (gr_mode) / UNITS_PER_WORD; + + gr_size = GET_MODE_SIZE (gr_mode); + offset += gr_size; + if (gr_size == UNITS_PER_WORD + || (gr_size < UNITS_PER_WORD && offset % UNITS_PER_WORD == 0)) + int_regs++; + else if (gr_size > UNITS_PER_WORD) + int_regs += gr_size / UNITS_PER_WORD; } - /* If we ended up using just one location, just return that one loc. */ + /* If we ended up using just one location, just return that one loc, but + change the mode back to the argument mode. */ if (i == 1) - return XEXP (loc[0], 0); + return gen_rtx_REG (mode, REGNO (XEXP (loc[0], 0))); else return gen_rtx_PARALLEL (mode, gen_rtvec_v (i, loc)); } @@ -3444,8 +3729,8 @@ ia64_function_arg (cum, mode, type, named, incoming) /* Integral and aggregates go in general registers. If we have run out of FR registers, then FP values must also go in general registers. This can happen when we have a SFmode HFA. */ - else if (((mode == TFmode) && ! INTEL_EXTENDED_IEEE_FORMAT) - || (! FLOAT_MODE_P (mode) || cum->fp_regs == MAX_ARGUMENT_SLOTS)) + else if (mode == TFmode || mode == TCmode + || (! FLOAT_MODE_P (mode) || cum->fp_regs == MAX_ARGUMENT_SLOTS)) { int byte_size = ((mode == BLKmode) ? int_size_in_bytes (type) : GET_MODE_SIZE (mode)); @@ -3467,24 +3752,37 @@ ia64_function_arg (cum, mode, type, named, incoming) } /* If there is a prototype, then FP values go in a FR register when - named, and in a GR registeer when unnamed. */ + named, and in a GR register when unnamed. */ else if (cum->prototype) { - if (! named) - return gen_rtx_REG (mode, basereg + cum->words + offset); - else + if (named) return gen_rtx_REG (mode, FR_ARG_FIRST + cum->fp_regs); + /* In big-endian mode, an anonymous SFmode value must be represented + as (parallel:SF [(expr_list (reg:DI n) (const_int 0))]) to force + the value into the high half of the general register. */ + else if (BYTES_BIG_ENDIAN && mode == SFmode) + return gen_rtx_PARALLEL (mode, + gen_rtvec (1, + gen_rtx_EXPR_LIST (VOIDmode, + gen_rtx_REG (DImode, basereg + cum->words + offset), + const0_rtx))); + else + return gen_rtx_REG (mode, basereg + cum->words + offset); } /* If there is no prototype, then FP values go in both FR and GR registers. */ else { + /* See comment above. */ + enum machine_mode inner_mode = + (BYTES_BIG_ENDIAN && mode == SFmode) ? DImode : mode; + rtx fp_reg = gen_rtx_EXPR_LIST (VOIDmode, gen_rtx_REG (mode, (FR_ARG_FIRST + cum->fp_regs)), const0_rtx); rtx gr_reg = gen_rtx_EXPR_LIST (VOIDmode, - gen_rtx_REG (mode, + gen_rtx_REG (inner_mode, (basereg + cum->words + offset)), const0_rtx); @@ -3498,23 +3796,11 @@ ia64_function_arg (cum, mode, type, named, incoming) in memory. 
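
For a homogeneous FP aggregate, the code above builds a PARALLEL pairing each element with a floating-point argument register and its byte offset. The sketch below prints the pairing an HFA of three floats would receive; the register base is symbolic, merely standing in for FR_ARG_FIRST:

#include <stdio.h>

int main (void)
{
  int byte_size = 12;          /* struct { float x, y, z; } */
  int elem_size = 4;
  int fp_reg_base = 8;         /* symbolic stand-in for FR_ARG_FIRST */
  int offset, i = 0;

  for (offset = 0; offset < byte_size; offset += elem_size, i++)
    printf ("elem %d: (reg f%d, offset %d)\n", i, fp_reg_base + i, offset);
  return 0;
}
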
*/ int -ia64_function_arg_partial_nregs (cum, mode, type, named) - CUMULATIVE_ARGS *cum; - enum machine_mode mode; - tree type; - int named ATTRIBUTE_UNUSED; -{ - int words = (((mode == BLKmode ? int_size_in_bytes (type) - : GET_MODE_SIZE (mode)) + UNITS_PER_WORD - 1) - / UNITS_PER_WORD); - int offset = 0; - - /* Arguments with alignment larger than 8 bytes start at the next even - boundary. */ - if ((type ? (TYPE_ALIGN (type) > 8 * BITS_PER_UNIT) - : (words > 1)) - && (cum->words & 1)) - offset = 1; +ia64_function_arg_partial_nregs (CUMULATIVE_ARGS *cum, enum machine_mode mode, + tree type, int named ATTRIBUTE_UNUSED) +{ + int words = ia64_function_arg_words (type, mode); + int offset = ia64_function_arg_offset (cum, type, words); /* If all argument slots are used, then it must go on the stack. */ if (cum->words + offset >= MAX_ARGUMENT_SLOTS) @@ -3535,29 +3821,17 @@ ia64_function_arg_partial_nregs (cum, mode, type, named) ia64_function_arg. */ void -ia64_function_arg_advance (cum, mode, type, named) - CUMULATIVE_ARGS *cum; - enum machine_mode mode; - tree type; - int named; -{ - int words = (((mode == BLKmode ? int_size_in_bytes (type) - : GET_MODE_SIZE (mode)) + UNITS_PER_WORD - 1) - / UNITS_PER_WORD); - int offset = 0; +ia64_function_arg_advance (CUMULATIVE_ARGS *cum, enum machine_mode mode, + tree type, int named) +{ + int words = ia64_function_arg_words (type, mode); + int offset = ia64_function_arg_offset (cum, type, words); enum machine_mode hfa_mode = VOIDmode; /* If all arg slots are already full, then there is nothing to do. */ if (cum->words >= MAX_ARGUMENT_SLOTS) return; - /* Arguments with alignment larger than 8 bytes start at the next even - boundary. */ - if ((type ? (TYPE_ALIGN (type) > 8 * BITS_PER_UNIT) - : (words > 1)) - && (cum->words & 1)) - offset = 1; - cum->words += words + offset; /* Check for and handle homogeneous FP aggregates. */ @@ -3607,7 +3881,7 @@ ia64_function_arg_advance (cum, mode, type, named) cum->int_regs = cum->words; /* If there is a prototype, then FP values go in a FR register when - named, and in a GR registeer when unnamed. */ + named, and in a GR register when unnamed. */ else if (cum->prototype) { if (! named) @@ -3619,7 +3893,7 @@ ia64_function_arg_advance (cum, mode, type, named) /* If there is no prototype, then FP values go in both FR and GR registers. */ else - { + { /* ??? Complex types should not reach here. */ cum->fp_regs += (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT ? 2 : 1); cum->int_regs = cum->words; @@ -3630,34 +3904,49 @@ ia64_function_arg_advance (cum, mode, type, named) /* ??? At present this is a GCC extension to the IA-64 ABI. */ int -ia64_function_arg_pass_by_reference (cum, mode, type, named) - CUMULATIVE_ARGS *cum ATTRIBUTE_UNUSED; - enum machine_mode mode ATTRIBUTE_UNUSED; - tree type; - int named ATTRIBUTE_UNUSED; +ia64_function_arg_pass_by_reference (CUMULATIVE_ARGS *cum ATTRIBUTE_UNUSED, + enum machine_mode mode ATTRIBUTE_UNUSED, + tree type, int named ATTRIBUTE_UNUSED) { return type && TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST; } + +/* True if it is OK to do sibling call optimization for the specified + call expression EXP. DECL will be the called function, or NULL if + this is an indirect call. */ +static bool +ia64_function_ok_for_sibcall (tree decl, tree exp ATTRIBUTE_UNUSED) +{ + /* We must always return with our current GP. This means we can + only sibcall to functions defined in the current module. */ + return decl && (*targetm.binds_local_p) (decl); +} /* Implement va_arg. 
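
ia64_function_arg_partial_nregs, at the top of this hunk, answers one question: when an argument straddles the end of the eight argument slots, how many of its words still travel in registers. The same computation in isolation:

#include <stdio.h>

#define MAX_ARGUMENT_SLOTS 8

static int partial_nregs (int used, int offset, int words)
{
  if (used + offset >= MAX_ARGUMENT_SLOTS)
    return 0;                                   /* all on the stack */
  if (used + offset + words <= MAX_ARGUMENT_SLOTS)
    return 0;                                   /* all in registers */
  return MAX_ARGUMENT_SLOTS - (used + offset);  /* the register part */
}

int main (void)
{
  /* 6 slots used, 4-word argument: 2 words in registers, 2 on stack.  */
  printf ("%d\n", partial_nregs (6, 0, 4));
  return 0;
}
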
*/ rtx -ia64_va_arg (valist, type) - tree valist, type; +ia64_va_arg (tree valist, tree type) { tree t; /* Variable sized types are passed by reference. */ if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST) { - rtx addr = std_expand_builtin_va_arg (valist, build_pointer_type (type)); - return gen_rtx_MEM (ptr_mode, force_reg (Pmode, addr)); + rtx addr = force_reg (ptr_mode, + std_expand_builtin_va_arg (valist, build_pointer_type (type))); +#ifdef POINTERS_EXTEND_UNSIGNED + addr = convert_memory_address (Pmode, addr); +#endif + return gen_rtx_MEM (ptr_mode, addr); } - /* Arguments with alignment larger than 8 bytes start at the next even - boundary. */ - if (TYPE_ALIGN (type) > 8 * BITS_PER_UNIT) + /* Aggregate arguments with alignment larger than 8 bytes start at + the next even boundary. Integer and floating point arguments + do so if they are larger than 8 bytes, whether or not they are + also aligned larger than 8 bytes. */ + if ((TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == INTEGER_TYPE) + ? int_size_in_bytes (type) > 8 : TYPE_ALIGN (type) > 8 * BITS_PER_UNIT) { t = build (PLUS_EXPR, TREE_TYPE (valist), valist, build_int_2 (2 * UNITS_PER_WORD - 1, 0)); @@ -3675,8 +3964,7 @@ ia64_va_arg (valist, type) in a register. */ int -ia64_return_in_memory (valtype) - tree valtype; +ia64_return_in_memory (tree valtype) { enum machine_mode mode; enum machine_mode hfa_mode; @@ -3712,9 +4000,7 @@ ia64_return_in_memory (valtype) /* Return rtx for register that holds the function return value. */ rtx -ia64_function_value (valtype, func) - tree valtype; - tree func ATTRIBUTE_UNUSED; +ia64_function_value (tree valtype, tree func ATTRIBUTE_UNUSED) { enum machine_mode mode; enum machine_mode hfa_mode; @@ -3747,8 +4033,7 @@ ia64_function_value (valtype, func) else return gen_rtx_PARALLEL (mode, gen_rtvec_v (i, loc)); } - else if (FLOAT_TYPE_P (valtype) && - ((mode != TFmode) || INTEL_EXTENDED_IEEE_FORMAT)) + else if (FLOAT_TYPE_P (valtype) && mode != TFmode && mode != TCmode) return gen_rtx_REG (mode, FR_ARG_FIRST); else { @@ -3777,15 +4062,27 @@ ia64_function_value (valtype, func) } } +/* This is called from dwarf2out.c via ASM_OUTPUT_DWARF_DTPREL. + We need to emit DTP-relative relocations. */ + +void +ia64_output_dwarf_dtprel (FILE *file, int size, rtx x) +{ + if (size != 8) + abort (); + fputs ("\tdata8.ua\t@dtprel(", file); + output_addr_const (file, x); + fputs (")", file); +} + /* Print a memory address as an operand to reference that memory location. */ /* ??? Do we need this? It gets used only for 'a' operands. We could perhaps also call this from ia64_print_operand for memory addresses. */ void -ia64_print_operand_address (stream, address) - FILE * stream ATTRIBUTE_UNUSED; - rtx address ATTRIBUTE_UNUSED; +ia64_print_operand_address (FILE * stream ATTRIBUTE_UNUSED, + rtx address ATTRIBUTE_UNUSED) { } @@ -3810,10 +4107,7 @@ ia64_print_operand_address (stream, address) r Print register name, or constant 0 as r0. HP compatibility for Linux kernel. */ void -ia64_print_operand (file, x, code) - FILE * file; - rtx x; - int code; +ia64_print_operand (FILE * file, rtx x, int code) { const char *str; @@ -3921,9 +4215,7 @@ ia64_print_operand (file, x, code) break; } - putc (',', file); - putc (' ', file); - fprintf (file, HOST_WIDE_INT_PRINT_DEC, value); + fprintf (file, ", " HOST_WIDE_INT_PRINT_DEC, value); return; } @@ -3974,7 +4266,7 @@ ia64_print_operand (file, x, code) case '+': { const char *which; - + /* For conditional branches, returns or calls, substitute sptk, dptk, dpnt, or spnt for %s. 
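
Variable-sized types are passed by reference, so the va_arg code at the top of this hunk fetches a pointer from the va area (extending it to Pmode when POINTERS_EXTEND_UNSIGNED is defined) and dereferences it. The convention acted out with ordinary stdarg and an invented struct:

#include <stdarg.h>
#include <stdio.h>
#include <string.h>

struct big { char data[100]; };

static void show (int n, ...)
{
  va_list ap;
  va_start (ap, n);
  {
    struct big *p = va_arg (ap, struct big *);  /* fetch the pointer */
    printf ("%s\n", p->data);                   /* then use the object */
  }
  va_end (ap);
}

int main (void)
{
  struct big b;
  strcpy (b.data, "passed by reference");
  show (1, &b);
  return 0;
}
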
*/ x = find_reg_note (current_output_insn, REG_BR_PROB, 0); @@ -4047,13 +4339,88 @@ ia64_print_operand (file, x, code) return; } -/* Calulate the cost of moving data from a register in class FROM to +/* Compute a (partial) cost for rtx X. Return true if the complete + cost has been computed, and false if subexpressions should be + scanned. In either case, *TOTAL contains the cost result. */ +/* ??? This is incomplete. */ + +static bool +ia64_rtx_costs (rtx x, int code, int outer_code, int *total) +{ + switch (code) + { + case CONST_INT: + switch (outer_code) + { + case SET: + *total = CONST_OK_FOR_J (INTVAL (x)) ? 0 : COSTS_N_INSNS (1); + return true; + case PLUS: + if (CONST_OK_FOR_I (INTVAL (x))) + *total = 0; + else if (CONST_OK_FOR_J (INTVAL (x))) + *total = 1; + else + *total = COSTS_N_INSNS (1); + return true; + default: + if (CONST_OK_FOR_K (INTVAL (x)) || CONST_OK_FOR_L (INTVAL (x))) + *total = 0; + else + *total = COSTS_N_INSNS (1); + return true; + } + + case CONST_DOUBLE: + *total = COSTS_N_INSNS (1); + return true; + + case CONST: + case SYMBOL_REF: + case LABEL_REF: + *total = COSTS_N_INSNS (3); + return true; + + case MULT: + /* For multiplies wider than HImode, we have to go to the FPU, + which normally involves copies. Plus there's the latency + of the multiply itself, and the latency of the instructions to + transfer integer regs to FP regs. */ + /* ??? Check for FP mode. */ + if (GET_MODE_SIZE (GET_MODE (x)) > 2) + *total = COSTS_N_INSNS (10); + else + *total = COSTS_N_INSNS (2); + return true; + + case PLUS: + case MINUS: + case ASHIFT: + case ASHIFTRT: + case LSHIFTRT: + *total = COSTS_N_INSNS (1); + return true; + + case DIV: + case UDIV: + case MOD: + case UMOD: + /* We make divide expensive, so that divide-by-constant will be + optimized to a multiply. */ + *total = COSTS_N_INSNS (60); + return true; + + default: + return false; + } +} + +/* Calculate the cost of moving data from a register in class FROM to one in class TO, using MODE. */ int -ia64_register_move_cost (mode, from, to) - enum machine_mode mode; - enum reg_class from, to; +ia64_register_move_cost (enum machine_mode mode, enum reg_class from, + enum reg_class to) { /* ADDL_REGS is the same as GR_REGS for movement purposes. */ if (to == ADDL_REGS) @@ -4069,11 +4436,11 @@ ia64_register_move_cost (mode, from, to) to = from, from = tmp; } - /* Moving from FR<->GR in TFmode must be more expensive than 2, + /* Moving from FR<->GR in XFmode must be more expensive than 2, so that we get secondary memory reloads. Between FR_REGS, we have to make this at least as expensive as MEMORY_MOVE_COST to avoid spectacularly poor register class preferencing. */ - if (mode == TFmode) + if (mode == XFmode) { if (to != GR_REGS || from != GR_REGS) return MEMORY_MOVE_COST (mode, to, 0); @@ -4125,10 +4492,8 @@ ia64_register_move_cost (mode, from, to) is required. */ enum reg_class -ia64_secondary_reload_class (class, mode, x) - enum reg_class class; - enum machine_mode mode ATTRIBUTE_UNUSED; - rtx x; +ia64_secondary_reload_class (enum reg_class class, + enum machine_mode mode ATTRIBUTE_UNUSED, rtx x) { int regno = -1; @@ -4162,10 +4527,10 @@ ia64_secondary_reload_class (class, mode, x) break; case FR_REGS: - /* Need to go through general regsters to get to other class regs. */ + /* Need to go through general registers to get to other class regs. */ if (regno >= 0 && ! (FR_REGNO_P (regno) || GENERAL_REGNO_P (regno))) return GR_REGS; - + /* This can happen when a paradoxical subreg is an operand to the muldi3 pattern. */ /* ??? 
This shouldn't be necessary after instruction scheduling is @@ -4206,28 +4571,19 @@ ia64_secondary_reload_class (class, mode, x) return GR_REGS; break; - case GR_REGS: - /* Since we have no offsettable memory addresses, we need a temporary - to hold the address of the second word. */ - if (mode == TImode) - return GR_REGS; - break; - default: break; } return NO_REGS; } + /* Emit text to declare externally defined variables and functions, because the Intel assembler does not support undefined externals. */ void -ia64_asm_output_external (file, decl, name) - FILE *file; - tree decl; - const char *name; +ia64_asm_output_external (FILE *file, tree decl, const char *name) { int save_referenced; @@ -4237,7 +4593,7 @@ ia64_asm_output_external (file, decl, name) if (TARGET_GNU_AS && (!TARGET_HPUX_LD || TREE_CODE (decl) != FUNCTION_DECL - || strstr(name, "__builtin_") == name)) + || strstr (name, "__builtin_") == name)) return; /* ??? The Intel assembler creates a reference that needs to be satisfied by @@ -4253,7 +4609,7 @@ ia64_asm_output_external (file, decl, name) return; if (TARGET_HPUX_LD) - ia64_hpux_add_extern_decl (name); + ia64_hpux_add_extern_decl (decl); else { /* assemble_name will set TREE_SYMBOL_REFERENCED, so we must save and @@ -4269,8 +4625,7 @@ ia64_asm_output_external (file, decl, name) /* Parse the -mfixed-range= option string. */ static void -fix_range (const_str) - const char *const_str; +fix_range (const char *const_str) { int i, first, last; char *str, *dash, *comma; @@ -4333,7 +4688,7 @@ fix_range (const_str) } static struct machine_function * -ia64_init_machine_status () +ia64_init_machine_status (void) { return ggc_alloc_cleared (sizeof (struct machine_function)); } @@ -4341,8 +4696,25 @@ ia64_init_machine_status () /* Handle TARGET_OPTIONS switches. */ void -ia64_override_options () +ia64_override_options (void) { + static struct pta + { + const char *const name; /* processor name or nickname. */ + const enum processor_type processor; + } + const processor_alias_table[] = + { + {"itanium", PROCESSOR_ITANIUM}, + {"itanium1", PROCESSOR_ITANIUM}, + {"merced", PROCESSOR_ITANIUM}, + {"itanium2", PROCESSOR_ITANIUM2}, + {"mckinley", PROCESSOR_ITANIUM2}, + }; + + int const pta_size = ARRAY_SIZE (processor_alias_table); + int i; + if (TARGET_AUTO_PIC) target_flags |= MASK_CONST_GP; @@ -4358,6 +4730,18 @@ ia64_override_options () target_flags &= ~MASK_INLINE_INT_DIV_THR; } + if (TARGET_INLINE_SQRT_LAT && TARGET_INLINE_SQRT_THR) + { + warning ("cannot optimize square root for both latency and throughput"); + target_flags &= ~MASK_INLINE_SQRT_THR; + } + + if (TARGET_INLINE_SQRT_LAT) + { + warning ("not yet implemented: latency-optimized inline square root"); + target_flags &= ~MASK_INLINE_SQRT_LAT; + } + if (ia64_fixed_range_string) fix_range (ia64_fixed_range_string); @@ -4371,35 +4755,32 @@ ia64_override_options () ia64_tls_size = tmp; } + if (!ia64_tune_string) + ia64_tune_string = "itanium2"; + + for (i = 0; i < pta_size; i++) + if (! strcmp (ia64_tune_string, processor_alias_table[i].name)) + { + ia64_tune = processor_alias_table[i].processor; + break; + } + + if (i == pta_size) + error ("bad value (%s) for -tune= switch", ia64_tune_string); + ia64_flag_schedule_insns2 = flag_schedule_insns_after_reload; flag_schedule_insns_after_reload = 0; ia64_section_threshold = g_switch_set ? g_switch_value : IA64_DEFAULT_GVALUE; init_machine_status = ia64_init_machine_status; - - /* Tell the compiler which flavor of TFmode we're using. 
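
fix_range, rewritten above, walks the -mfixed-range= string as dash-separated register pairs, with pairs separated by commas. A stripped-down walk over the same syntax; the real function additionally maps names to hard register numbers and marks them in fixed_regs and call_used_regs:

#include <stdio.h>
#include <string.h>

int main (void)
{
  char str[] = "f32-f127,r4-r7";   /* sample option value */
  char *p = str;

  while (*p)
    {
      char *dash = strchr (p, '-');
      char *comma = strchr (p, ',');
      if (!dash)
        {
          fprintf (stderr,
                   "value of -mfixed-range must have form REG1-REG2\n");
          return 1;
        }
      *dash = '\0';                /* split the pair */
      if (comma)
        *comma = '\0';             /* split off the rest of the list */
      printf ("fix %s through %s\n", p, dash + 1);
      p = comma ? comma + 1 : dash + 1 + strlen (dash + 1);
    }
  return 0;
}
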
*/ - if (INTEL_EXTENDED_IEEE_FORMAT) - real_format_for_mode[TFmode - QFmode] = &ieee_extended_intel_128_format; } -static enum attr_itanium_requires_unit0 ia64_safe_itanium_requires_unit0 PARAMS((rtx)); -static enum attr_itanium_class ia64_safe_itanium_class PARAMS((rtx)); -static enum attr_type ia64_safe_type PARAMS((rtx)); - -static enum attr_itanium_requires_unit0 -ia64_safe_itanium_requires_unit0 (insn) - rtx insn; -{ - if (recog_memoized (insn) >= 0) - return get_attr_itanium_requires_unit0 (insn); - else - return ITANIUM_REQUIRES_UNIT0_NO; -} +static enum attr_itanium_class ia64_safe_itanium_class (rtx); +static enum attr_type ia64_safe_type (rtx); static enum attr_itanium_class -ia64_safe_itanium_class (insn) - rtx insn; +ia64_safe_itanium_class (rtx insn) { if (recog_memoized (insn) >= 0) return get_attr_itanium_class (insn); @@ -4408,8 +4789,7 @@ ia64_safe_itanium_class (insn) } static enum attr_type -ia64_safe_type (insn) - rtx insn; +ia64_safe_type (rtx insn) { if (recog_memoized (insn) >= 0) return get_attr_type (insn); @@ -4487,26 +4867,21 @@ struct reg_flags unsigned int is_sibcall : 1; /* Is this a sibling or normal call? */ }; -static void rws_update PARAMS ((struct reg_write_state *, int, - struct reg_flags, int)); -static int rws_access_regno PARAMS ((int, struct reg_flags, int)); -static int rws_access_reg PARAMS ((rtx, struct reg_flags, int)); -static void update_set_flags PARAMS ((rtx, struct reg_flags *, int *, rtx *)); -static int set_src_needs_barrier PARAMS ((rtx, struct reg_flags, int, rtx)); -static int rtx_needs_barrier PARAMS ((rtx, struct reg_flags, int)); -static void init_insn_group_barriers PARAMS ((void)); -static int group_barrier_needed_p PARAMS ((rtx)); -static int safe_group_barrier_needed_p PARAMS ((rtx)); +static void rws_update (struct reg_write_state *, int, struct reg_flags, int); +static int rws_access_regno (int, struct reg_flags, int); +static int rws_access_reg (rtx, struct reg_flags, int); +static void update_set_flags (rtx, struct reg_flags *, int *, rtx *); +static int set_src_needs_barrier (rtx, struct reg_flags, int, rtx); +static int rtx_needs_barrier (rtx, struct reg_flags, int); +static void init_insn_group_barriers (void); +static int group_barrier_needed_p (rtx); +static int safe_group_barrier_needed_p (rtx); /* Update *RWS for REGNO, which is being written by the current instruction, with predicate PRED, and associated register flags in FLAGS. */ static void -rws_update (rws, regno, flags, pred) - struct reg_write_state *rws; - int regno; - struct reg_flags flags; - int pred; +rws_update (struct reg_write_state *rws, int regno, struct reg_flags flags, int pred) { if (pred) rws[regno].write_count++; @@ -4524,10 +4899,7 @@ rws_update (rws, regno, flags, pred) a dependency with an earlier instruction in the same group. */ static int -rws_access_regno (regno, flags, pred) - int regno; - struct reg_flags flags; - int pred; +rws_access_regno (int regno, struct reg_flags flags, int pred) { int need_barrier = 0; @@ -4562,7 +4934,7 @@ rws_access_regno (regno, flags, pred) /* ??? This assumes that P and P+1 are always complementary predicates for P even. 
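
The new processor_alias_table in ia64_override_options resolves -mtune= with a linear name scan, defaulting to itanium2. The same lookup as a standalone program; the alias set is copied from the hunk, and the diagnostics are simplified:

#include <stdio.h>
#include <string.h>

enum processor_type { PROCESSOR_ITANIUM, PROCESSOR_ITANIUM2 };

static const struct { const char *name; enum processor_type proc; } pta[] =
{
  {"itanium",  PROCESSOR_ITANIUM},
  {"itanium1", PROCESSOR_ITANIUM},
  {"merced",   PROCESSOR_ITANIUM},
  {"itanium2", PROCESSOR_ITANIUM2},
  {"mckinley", PROCESSOR_ITANIUM2},
};

int main (int argc, char **argv)
{
  const char *tune = argc > 1 ? argv[1] : "itanium2";   /* the default */
  size_t i;

  for (i = 0; i < sizeof pta / sizeof pta[0]; i++)
    if (!strcmp (tune, pta[i].name))
      {
        printf ("tuning for processor %d\n", (int) pta[i].proc);
        return 0;
      }
  fprintf (stderr, "bad value (%s) for -tune= switch\n", tune);
  return 1;
}
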
*/ if (flags.is_and && rws_sum[regno].written_by_and) - ; + ; else if (flags.is_or && rws_sum[regno].written_by_or) ; else if ((rws_sum[regno].first_pred ^ 1) != pred) @@ -4645,10 +5017,7 @@ rws_access_regno (regno, flags, pred) } static int -rws_access_reg (reg, flags, pred) - rtx reg; - struct reg_flags flags; - int pred; +rws_access_reg (rtx reg, struct reg_flags flags, int pred) { int regno = REGNO (reg); int n = HARD_REGNO_NREGS (REGNO (reg), GET_MODE (reg)); @@ -4668,11 +5037,7 @@ rws_access_reg (reg, flags, pred) the condition, stored in *PFLAGS, *PPRED and *PCOND. */ static void -update_set_flags (x, pflags, ppred, pcond) - rtx x; - struct reg_flags *pflags; - int *ppred; - rtx *pcond; +update_set_flags (rtx x, struct reg_flags *pflags, int *ppred, rtx *pcond) { rtx src = SET_SRC (x); @@ -4686,7 +5051,7 @@ update_set_flags (x, pflags, ppred, pcond) case IF_THEN_ELSE: if (SET_DEST (x) == pc_rtx) /* X is a conditional branch. */ - return; + return; else { int is_complemented = 0; @@ -4749,13 +5114,9 @@ update_set_flags (x, pflags, ppred, pcond) source of a given SET rtx found in X needs a barrier. FLAGS and PRED are as in rtx_needs_barrier. COND is an rtx that holds the condition for this insn. */ - + static int -set_src_needs_barrier (x, flags, pred, cond) - rtx x; - struct reg_flags flags; - int pred; - rtx cond; +set_src_needs_barrier (rtx x, struct reg_flags flags, int pred, rtx cond) { int need_barrier = 0; rtx dst; @@ -4790,15 +5151,12 @@ set_src_needs_barrier (x, flags, pred, cond) return need_barrier; } -/* Handle an access to rtx X of type FLAGS using predicate register PRED. - Return 1 is this access creates a dependency with an earlier instruction - in the same group. */ +/* Handle an access to rtx X of type FLAGS using predicate register + PRED. Return 1 if this access creates a dependency with an earlier + instruction in the same group. */ static int -rtx_needs_barrier (x, flags, pred) - rtx x; - struct reg_flags flags; - int pred; +rtx_needs_barrier (rtx x, struct reg_flags flags, int pred) { int i, j; int is_complemented = 0; @@ -4814,7 +5172,7 @@ rtx_needs_barrier (x, flags, pred) switch (GET_CODE (x)) { - case SET: + case SET: update_set_flags (x, &new_flags, &pred, &cond); need_barrier = set_src_needs_barrier (x, new_flags, pred, cond); if (GET_CODE (SET_SRC (x)) != CALL) @@ -4984,7 +5342,7 @@ rtx_needs_barrier (x, flags, pred) case NEG: case NOT: case SIGN_EXTEND: case ZERO_EXTEND: case TRUNCATE: case FLOAT_EXTEND: case FLOAT_TRUNCATE: case FLOAT: case FIX: case UNSIGNED_FLOAT: case UNSIGNED_FIX: case ABS: - case SQRT: case FFS: + case SQRT: case FFS: case POPCOUNT: need_barrier = rtx_needs_barrier (XEXP (x, 0), flags, pred); break; @@ -5017,14 +5375,13 @@ rtx_needs_barrier (x, flags, pred) new_flags, pred); break; } - + case UNSPEC_FR_SPILL: case UNSPEC_FR_RESTORE: - case UNSPEC_POPCNT: - need_barrier = rtx_needs_barrier (XVECEXP (x, 0, 0), flags, pred); - break; - + case UNSPEC_GETF_EXP: + case UNSPEC_SETF_EXP: case UNSPEC_ADDP4: + case UNSPEC_FR_SQRT_RECIP_APPROX: need_barrier = rtx_needs_barrier (XVECEXP (x, 0, 0), flags, pred); break; @@ -5122,7 +5479,7 @@ rtx_needs_barrier (x, flags, pred) sequence of insns. */ static void -init_insn_group_barriers () +init_insn_group_barriers (void) { memset (rws_sum, 0, sizeof (rws_sum)); first_instruction = 1; @@ -5133,8 +5490,7 @@ init_insn_group_barriers () Return nonzero if so. 
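
Stripped of predication and the AND/OR special cases, rws_access_regno is a per-register state machine: a second write in the same instruction group, or a read after a write, requires a stop bit. A miniature version of that core:

#include <stdio.h>

#define NUM_REGS 16

static int write_count[NUM_REGS];

static int access_reg (int regno, int is_write)
{
  int need_barrier = 0;
  if (is_write)
    {
      if (write_count[regno] > 0)
        need_barrier = 1;              /* write after write */
      write_count[regno]++;
    }
  else if (write_count[regno] > 0)
    need_barrier = 1;                  /* read after write */
  return need_barrier;
}

int main (void)
{
  printf ("%d", access_reg (3, 1));    /* first write: 0 */
  printf ("%d", access_reg (3, 0));    /* read after write: 1 */
  printf ("%d\n", access_reg (3, 1));  /* write after write: 1 */
  return 0;
}
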
*/ static int -group_barrier_needed_p (insn) - rtx insn; +group_barrier_needed_p (rtx insn) { rtx pat; int need_barrier = 0; @@ -5230,7 +5586,10 @@ group_barrier_needed_p (insn) abort (); } - if (first_instruction) + if (first_instruction && INSN_P (insn) + && ia64_safe_itanium_class (insn) != ITANIUM_CLASS_IGNORE + && GET_CODE (PATTERN (insn)) != USE + && GET_CODE (PATTERN (insn)) != CLOBBER) { need_barrier = 0; first_instruction = 0; @@ -5242,8 +5601,7 @@ group_barrier_needed_p (insn) /* Like group_barrier_needed_p, but do not clobber the current state. */ static int -safe_group_barrier_needed_p (insn) - rtx insn; +safe_group_barrier_needed_p (rtx insn) { struct reg_write_state rws_saved[NUM_REGS]; int saved_first_instruction; @@ -5260,17 +5618,15 @@ safe_group_barrier_needed_p (insn) return t; } -/* INSNS is an chain of instructions. Scan the chain, and insert stop bits - as necessary to eliminate dependendencies. This function assumes that - a final instruction scheduling pass has been run which has already - inserted most of the necessary stop bits. This function only inserts - new ones at basic block boundaries, since these are invisible to the - scheduler. */ +/* Scan the current function and insert stop bits as necessary to + eliminate dependencies. This function assumes that a final + instruction scheduling pass has been run which has already + inserted most of the necessary stop bits. This function only + inserts new ones at basic block boundaries, since these are + invisible to the scheduler. */ static void -emit_insn_group_barriers (dump, insns) - FILE *dump; - rtx insns; +emit_insn_group_barriers (FILE *dump) { rtx insn; rtx last_label = 0; @@ -5278,7 +5634,7 @@ emit_insn_group_barriers (dump, insns) init_insn_group_barriers (); - for (insn = insns; insn; insn = NEXT_INSN (insn)) + for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) { if (GET_CODE (insn) == CODE_LABEL) { @@ -5326,15 +5682,13 @@ emit_insn_group_barriers (dump, insns) This function has to emit all necessary group barriers. */ static void -emit_all_insn_group_barriers (dump, insns) - FILE *dump ATTRIBUTE_UNUSED; - rtx insns; +emit_all_insn_group_barriers (FILE *dump ATTRIBUTE_UNUSED) { rtx insn; init_insn_group_barriers (); - for (insn = insns; insn; insn = NEXT_INSN (insn)) + for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) { if (GET_CODE (insn) == BARRIER) { @@ -5363,10 +5717,11 @@ emit_all_insn_group_barriers (dump, insns) } } } + -static int errata_find_address_regs PARAMS ((rtx *, void *)); -static void errata_emit_nops PARAMS ((rtx)); -static void fixup_errata PARAMS ((void)); +static int errata_find_address_regs (rtx *, void *); +static void errata_emit_nops (rtx); +static void fixup_errata (void); /* This structure is used to track some details about the previous insns groups so we can determine if it may be necessary to insert NOPs to @@ -5384,9 +5739,7 @@ static int group_idx; conditionally set in the previous group is used as an address register. It ensures that for_each_rtx returns 1 in that case. */ static int -errata_find_address_regs (xp, data) - rtx *xp; - void *data ATTRIBUTE_UNUSED; +errata_find_address_regs (rtx *xp, void *data ATTRIBUTE_UNUSED) { rtx x = *xp; if (GET_CODE (x) != MEM) @@ -5409,8 +5762,7 @@ errata_find_address_regs (xp, data) last_group and emits additional NOPs if necessary to work around an Itanium A/B step erratum. 
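
emit_insn_group_barriers, updated above, re-initializes its write state at every label, because basic-block boundaries are invisible to the scheduler, and records a stop bit wherever the state machine reports a conflict. The skeleton of that walk over a toy insn stream, with the conflict test standing in for group_barrier_needed_p:

#include <stdio.h>
#include <string.h>

enum kind { LABEL, WRITE, READ };
struct insn { enum kind kind; int reg; };

int main (void)
{
  static const struct insn stream[] =
    { {WRITE, 3}, {READ, 3}, {LABEL, 0}, {WRITE, 3} };
  int written[8];
  size_t i;

  memset (written, 0, sizeof written);
  for (i = 0; i < sizeof stream / sizeof stream[0]; i++)
    {
      if (stream[i].kind == LABEL)
        {
          memset (written, 0, sizeof written);  /* block boundary: reset */
          continue;
        }
      if (written[stream[i].reg])
        {
          printf ("stop bit before insn %lu\n", (unsigned long) i);
          memset (written, 0, sizeof written);  /* barrier opens a group */
        }
      if (stream[i].kind == WRITE)
        written[stream[i].reg] = 1;
    }
  return 0;
}
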
*/ static void -errata_emit_nops (insn) - rtx insn; +errata_emit_nops (rtx insn) { struct group *this_group = last_group + group_idx; struct group *prev_group = last_group + (group_idx ^ 1); @@ -5483,7 +5835,7 @@ errata_emit_nops (insn) /* Emit extra nops if they are required to work around hardware errata. */ static void -fixup_errata () +fixup_errata (void) { rtx insn; @@ -5508,153 +5860,106 @@ fixup_errata () } } -/* Instruction scheduling support. */ -/* Describe one bundle. */ -struct bundle -{ - /* Zero if there's no possibility of a stop in this bundle other than - at the end, otherwise the position of the optional stop bit. */ - int possible_stop; - /* The types of the three slots. */ - enum attr_type t[3]; - /* The pseudo op to be emitted into the assembler output. */ - const char *name; -}; +/* Instruction scheduling support. */ #define NR_BUNDLES 10 -/* A list of all available bundles. */ +/* A list of names of all available bundles. */ -static const struct bundle bundle[NR_BUNDLES] = +static const char *bundle_name [NR_BUNDLES] = { - { 2, { TYPE_M, TYPE_I, TYPE_I }, ".mii" }, - { 1, { TYPE_M, TYPE_M, TYPE_I }, ".mmi" }, - { 0, { TYPE_M, TYPE_F, TYPE_I }, ".mfi" }, - { 0, { TYPE_M, TYPE_M, TYPE_F }, ".mmf" }, + ".mii", + ".mmi", + ".mfi", + ".mmf", #if NR_BUNDLES == 10 - { 0, { TYPE_B, TYPE_B, TYPE_B }, ".bbb" }, - { 0, { TYPE_M, TYPE_B, TYPE_B }, ".mbb" }, + ".bbb", + ".mbb", #endif - { 0, { TYPE_M, TYPE_I, TYPE_B }, ".mib" }, - { 0, { TYPE_M, TYPE_M, TYPE_B }, ".mmb" }, - { 0, { TYPE_M, TYPE_F, TYPE_B }, ".mfb" }, - /* .mfi needs to occur earlier than .mlx, so that we only generate it if - it matches an L type insn. Otherwise we'll try to generate L type - nops. */ - { 0, { TYPE_M, TYPE_L, TYPE_X }, ".mlx" } + ".mib", + ".mmb", + ".mfb", + ".mlx" }; -/* Describe a packet of instructions. Packets consist of two bundles that - are visible to the hardware in one scheduling window. */ +/* Nonzero if we should insert stop bits into the schedule. */ -struct ia64_packet -{ - const struct bundle *t1, *t2; - /* Precomputed value of the first split issue in this packet if a cycle - starts at its beginning. */ - int first_split; - /* For convenience, the insn types are replicated here so we don't have - to go through T1 and T2 all the time. */ - enum attr_type t[6]; -}; +int ia64_final_schedule = 0; -/* An array containing all possible packets. */ -#define NR_PACKETS (NR_BUNDLES * NR_BUNDLES) -static struct ia64_packet packets[NR_PACKETS]; +/* Codes of the corresponding quieryied units: */ -/* Map attr_type to a string with the name. */ +static int _0mii_, _0mmi_, _0mfi_, _0mmf_; +static int _0bbb_, _0mbb_, _0mib_, _0mmb_, _0mfb_, _0mlx_; -static const char *const type_names[] = -{ - "UNKNOWN", "A", "I", "M", "F", "B", "L", "X", "S" -}; +static int _1mii_, _1mmi_, _1mfi_, _1mmf_; +static int _1bbb_, _1mbb_, _1mib_, _1mmb_, _1mfb_, _1mlx_; -/* Nonzero if we should insert stop bits into the schedule. 
*/ -int ia64_final_schedule = 0; +static int pos_1, pos_2, pos_3, pos_4, pos_5, pos_6; -static int itanium_split_issue PARAMS ((const struct ia64_packet *, int)); -static rtx ia64_single_set PARAMS ((rtx)); -static int insn_matches_slot PARAMS ((const struct ia64_packet *, enum attr_type, int, rtx)); -static void ia64_emit_insn_before PARAMS ((rtx, rtx)); -static void maybe_rotate PARAMS ((FILE *)); -static void finish_last_head PARAMS ((FILE *, int)); -static void rotate_one_bundle PARAMS ((FILE *)); -static void rotate_two_bundles PARAMS ((FILE *)); -static void nop_cycles_until PARAMS ((int, FILE *)); -static void cycle_end_fill_slots PARAMS ((FILE *)); -static int packet_matches_p PARAMS ((const struct ia64_packet *, int, int *)); -static int get_split PARAMS ((const struct ia64_packet *, int)); -static int find_best_insn PARAMS ((rtx *, enum attr_type *, int, - const struct ia64_packet *, int)); -static void find_best_packet PARAMS ((int *, const struct ia64_packet **, - rtx *, enum attr_type *, int)); -static int itanium_reorder PARAMS ((FILE *, rtx *, rtx *, int)); -static void dump_current_packet PARAMS ((FILE *)); -static void schedule_stop PARAMS ((FILE *)); -static rtx gen_nop_type PARAMS ((enum attr_type)); -static void ia64_emit_nops PARAMS ((void)); +/* The following variable value is an insn group barrier. */ -/* Map a bundle number to its pseudo-op. */ +static rtx dfa_stop_insn; -const char * -get_bundle_name (b) - int b; -{ - return bundle[b].name; -} +/* The following variable value is the last issued insn. */ -/* Compute the slot which will cause a split issue in packet P if the - current cycle begins at slot BEGIN. */ +static rtx last_scheduled_insn; -static int -itanium_split_issue (p, begin) - const struct ia64_packet *p; - int begin; -{ - int type_count[TYPE_S]; - int i; - int split = 6; +/* The following variable value is size of the DFA state. */ - if (begin < 3) - { - /* Always split before and after MMF. */ - if (p->t[0] == TYPE_M && p->t[1] == TYPE_M && p->t[2] == TYPE_F) - return 3; - if (p->t[3] == TYPE_M && p->t[4] == TYPE_M && p->t[5] == TYPE_F) - return 3; - /* Always split after MBB and BBB. */ - if (p->t[1] == TYPE_B) - return 3; - /* Split after first bundle in MIB BBB combination. */ - if (p->t[2] == TYPE_B && p->t[3] == TYPE_B) - return 3; - } +static size_t dfa_state_size; - memset (type_count, 0, sizeof type_count); - for (i = begin; i < split; i++) - { - enum attr_type t0 = p->t[i]; - /* An MLX bundle reserves the same units as an MFI bundle. */ - enum attr_type t = (t0 == TYPE_L ? TYPE_F - : t0 == TYPE_X ? TYPE_I - : t0); +/* The following variable value is pointer to a DFA state used as + temporary variable. */ - /* Itanium can execute up to 3 branches, 2 floating point, 2 memory, and - 2 integer per cycle. */ - int max = (t == TYPE_B ? 3 : 2); - if (type_count[t] == max) - return i; +static state_t temp_dfa_state = NULL; - type_count[t]++; - } - return split; +/* The following variable value is DFA state after issuing the last + insn. */ + +static state_t prev_cycle_state = NULL; + +/* The following array element values are TRUE if the corresponding + insn requires to add stop bits before it. */ + +static char *stops_p; + +/* The following variable is used to set up the mentioned above array. */ + +static int stop_before_p = 0; + +/* The following variable value is length of the arrays `clocks' and + `add_cycles'. */ + +static int clocks_length; + +/* The following array element values are cycles on which the + corresponding insn will be issued. 
The array is used only for + Itanium1. */ + +static int *clocks; + +/* The following array element values are numbers of cycles should be + added to improve insn scheduling for MM_insns for Itanium1. */ + +static int *add_cycles; + +static rtx ia64_single_set (rtx); +static void ia64_emit_insn_before (rtx, rtx); + +/* Map a bundle number to its pseudo-op. */ + +const char * +get_bundle_name (int b) +{ + return bundle_name[b]; } + /* Return the maximum number of instructions a cpu can issue. */ static int -ia64_issue_rate () +ia64_issue_rate (void) { return 6; } @@ -5662,8 +5967,7 @@ ia64_issue_rate () /* Helper function - like single_set, but look inside COND_EXEC. */ static rtx -ia64_single_set (insn) - rtx insn; +ia64_single_set (rtx insn) { rtx x = PATTERN (insn), ret; if (GET_CODE (x) == COND_EXEC) @@ -5693,1273 +5997,1438 @@ ia64_single_set (insn) a dependency LINK or INSN on DEP_INSN. COST is the current cost. */ static int -ia64_adjust_cost (insn, link, dep_insn, cost) - rtx insn, link, dep_insn; - int cost; +ia64_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) { - enum attr_type dep_type; enum attr_itanium_class dep_class; enum attr_itanium_class insn_class; - rtx dep_set, set, src, addr; - - if (GET_CODE (PATTERN (insn)) == CLOBBER - || GET_CODE (PATTERN (insn)) == USE - || GET_CODE (PATTERN (dep_insn)) == CLOBBER - || GET_CODE (PATTERN (dep_insn)) == USE - /* @@@ Not accurate for indirect calls. */ - || GET_CODE (insn) == CALL_INSN - || ia64_safe_type (insn) == TYPE_S) - return 0; - if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT - || REG_NOTE_KIND (link) == REG_DEP_ANTI) - return 0; + if (REG_NOTE_KIND (link) != REG_DEP_OUTPUT) + return cost; - dep_type = ia64_safe_type (dep_insn); - dep_class = ia64_safe_itanium_class (dep_insn); insn_class = ia64_safe_itanium_class (insn); - - /* Compares that feed a conditional branch can execute in the same - cycle. */ - dep_set = ia64_single_set (dep_insn); - set = ia64_single_set (insn); - - if (dep_type != TYPE_F - && dep_set - && GET_CODE (SET_DEST (dep_set)) == REG - && PR_REG (REGNO (SET_DEST (dep_set))) - && GET_CODE (insn) == JUMP_INSN) + dep_class = ia64_safe_itanium_class (dep_insn); + if (dep_class == ITANIUM_CLASS_ST || dep_class == ITANIUM_CLASS_STF + || insn_class == ITANIUM_CLASS_ST || insn_class == ITANIUM_CLASS_STF) return 0; - if (dep_set && GET_CODE (SET_DEST (dep_set)) == MEM) - { - /* ??? Can't find any information in the documenation about whether - a sequence - st [rx] = ra - ld rb = [ry] - splits issue. Assume it doesn't. */ - return 0; - } - - src = set ? SET_SRC (set) : 0; - addr = 0; - if (set) - { - if (GET_CODE (SET_DEST (set)) == MEM) - addr = XEXP (SET_DEST (set), 0); - else if (GET_CODE (SET_DEST (set)) == SUBREG - && GET_CODE (SUBREG_REG (SET_DEST (set))) == MEM) - addr = XEXP (SUBREG_REG (SET_DEST (set)), 0); - else - { - addr = src; - if (GET_CODE (addr) == UNSPEC && XVECLEN (addr, 0) > 0) - addr = XVECEXP (addr, 0, 0); - while (GET_CODE (addr) == SUBREG || GET_CODE (addr) == ZERO_EXTEND) - addr = XEXP (addr, 0); - - /* Note that LO_SUM is used for GOT loads. */ - if (GET_CODE (addr) == MEM || GET_CODE (addr) == LO_SUM) - addr = XEXP (addr, 0); - else - addr = 0; - } - } - - if (addr && GET_CODE (addr) == POST_MODIFY) - addr = XEXP (addr, 0); - - set = ia64_single_set (dep_insn); - - if ((dep_class == ITANIUM_CLASS_IALU - || dep_class == ITANIUM_CLASS_ILOG - || dep_class == ITANIUM_CLASS_LD) - && (insn_class == ITANIUM_CLASS_LD - || insn_class == ITANIUM_CLASS_ST)) - { - if (! addr || ! 
set) - abort (); - /* This isn't completely correct - an IALU that feeds an address has - a latency of 1 cycle if it's issued in an M slot, but 2 cycles - otherwise. Unfortunately there's no good way to describe this. */ - if (reg_overlap_mentioned_p (SET_DEST (set), addr)) - return cost + 1; - } - - if ((dep_class == ITANIUM_CLASS_IALU - || dep_class == ITANIUM_CLASS_ILOG - || dep_class == ITANIUM_CLASS_LD) - && (insn_class == ITANIUM_CLASS_MMMUL - || insn_class == ITANIUM_CLASS_MMSHF - || insn_class == ITANIUM_CLASS_MMSHFI)) - return 3; - - if (dep_class == ITANIUM_CLASS_FMAC - && (insn_class == ITANIUM_CLASS_FMISC - || insn_class == ITANIUM_CLASS_FCVTFX - || insn_class == ITANIUM_CLASS_XMPY)) - return 7; - - if ((dep_class == ITANIUM_CLASS_FMAC - || dep_class == ITANIUM_CLASS_FMISC - || dep_class == ITANIUM_CLASS_FCVTFX - || dep_class == ITANIUM_CLASS_XMPY) - && insn_class == ITANIUM_CLASS_STF) - return 8; - - /* Intel docs say only LD, ST, IALU, ILOG, ISHF consumers have latency 4, - but HP engineers say any non-MM operation. */ - if ((dep_class == ITANIUM_CLASS_MMMUL - || dep_class == ITANIUM_CLASS_MMSHF - || dep_class == ITANIUM_CLASS_MMSHFI) - && insn_class != ITANIUM_CLASS_MMMUL - && insn_class != ITANIUM_CLASS_MMSHF - && insn_class != ITANIUM_CLASS_MMSHFI) - return 4; - return cost; } -/* Describe the current state of the Itanium pipeline. */ -static struct -{ - /* The first slot that is used in the current cycle. */ - int first_slot; - /* The next slot to fill. */ - int cur; - /* The packet we have selected for the current issue window. */ - const struct ia64_packet *packet; - /* The position of the split issue that occurs due to issue width - limitations (6 if there's no split issue). */ - int split; - /* Record data about the insns scheduled so far in the same issue - window. The elements up to but not including FIRST_SLOT belong - to the previous cycle, the ones starting with FIRST_SLOT belong - to the current cycle. */ - enum attr_type types[6]; - rtx insns[6]; - int stopbit[6]; - /* Nonzero if we decided to schedule a stop bit. */ - int last_was_stop; -} sched_data; - -/* Temporary arrays; they have enough elements to hold all insns that - can be ready at the same time while scheduling of the current block. - SCHED_READY can hold ready insns, SCHED_TYPES their types. */ -static rtx *sched_ready; -static enum attr_type *sched_types; - -/* Determine whether an insn INSN of type ITYPE can fit into slot SLOT - of packet P. */ - -static int -insn_matches_slot (p, itype, slot, insn) - const struct ia64_packet *p; - enum attr_type itype; - int slot; - rtx insn; -{ - enum attr_itanium_requires_unit0 u0; - enum attr_type stype = p->t[slot]; - - if (insn) - { - u0 = ia64_safe_itanium_requires_unit0 (insn); - if (u0 == ITANIUM_REQUIRES_UNIT0_YES) - { - int i; - for (i = sched_data.first_slot; i < slot; i++) - if (p->t[i] == stype - || (stype == TYPE_F && p->t[i] == TYPE_L) - || (stype == TYPE_I && p->t[i] == TYPE_X)) - return 0; - } - if (GET_CODE (insn) == CALL_INSN) - { - /* Reject calls in multiway branch packets. We want to limit - the number of multiway branches we generate (since the branch - predictor is limited), and this seems to work fairly well. - (If we didn't do this, we'd have to add another test here to - force calls into the third slot of the bundle.) 
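
The replacement ia64_adjust_cost visible above drops all of these removed special cases: it only touches output dependencies, and it makes them free whenever either end is a store-class insn. That policy in isolation, with symbolic stand-ins for the dependency kinds and itanium_class values:

#include <stdio.h>

enum dep_kind { DEP_TRUE, DEP_OUTPUT, DEP_ANTI };
enum insn_class { CLASS_ALU, CLASS_ST, CLASS_STF };

static int adjust_cost (enum dep_kind kind, enum insn_class producer,
                        enum insn_class consumer, int cost)
{
  if (kind != DEP_OUTPUT)
    return cost;                      /* leave true/anti deps alone */
  if (producer == CLASS_ST || producer == CLASS_STF
      || consumer == CLASS_ST || consumer == CLASS_STF)
    return 0;                         /* store-related output dep is free */
  return cost;
}

int main (void)
{
  printf ("%d\n", adjust_cost (DEP_OUTPUT, CLASS_ST, CLASS_ALU, 3)); /* 0 */
  return 0;
}
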
*/ - if (slot < 3) - { - if (p->t[1] == TYPE_B) - return 0; - } - else - { - if (p->t[4] == TYPE_B) - return 0; - } - } - } - - if (itype == stype) - return 1; - if (itype == TYPE_A) - return stype == TYPE_M || stype == TYPE_I; - return 0; -} - /* Like emit_insn_before, but skip cycle_display notes. ??? When cycle display notes are implemented, update this. */ static void -ia64_emit_insn_before (insn, before) - rtx insn, before; +ia64_emit_insn_before (rtx insn, rtx before) { emit_insn_before (insn, before); } -/* When rotating a bundle out of the issue window, insert a bundle selector - insn in front of it. DUMP is the scheduling dump file or NULL. START - is either 0 or 3, depending on whether we want to emit a bundle selector - for the first bundle or the second bundle in the current issue window. - - The selector insns are emitted this late because the selected packet can - be changed until parts of it get rotated out. */ +/* The following function marks insns who produce addresses for load + and store insns. Such insns will be placed into M slots because it + decrease latency time for Itanium1 (see function + `ia64_produce_address_p' and the DFA descriptions). */ static void -finish_last_head (dump, start) - FILE *dump; - int start; +ia64_dependencies_evaluation_hook (rtx head, rtx tail) { - const struct ia64_packet *p = sched_data.packet; - const struct bundle *b = start == 0 ? p->t1 : p->t2; - int bundle_type = b - bundle; - rtx insn; - int i; - - if (! ia64_final_schedule) - return; - - for (i = start; sched_data.insns[i] == 0; i++) - if (i == start + 3) - abort (); - insn = sched_data.insns[i]; - - if (dump) - fprintf (dump, "// Emitting template before %d: %s\n", - INSN_UID (insn), b->name); + rtx insn, link, next, next_tail; - ia64_emit_insn_before (gen_bundle_selector (GEN_INT (bundle_type)), insn); + next_tail = NEXT_INSN (tail); + for (insn = head; insn != next_tail; insn = NEXT_INSN (insn)) + if (INSN_P (insn)) + insn->call = 0; + for (insn = head; insn != next_tail; insn = NEXT_INSN (insn)) + if (INSN_P (insn) + && ia64_safe_itanium_class (insn) == ITANIUM_CLASS_IALU) + { + for (link = INSN_DEPEND (insn); link != 0; link = XEXP (link, 1)) + { + next = XEXP (link, 0); + if ((ia64_safe_itanium_class (next) == ITANIUM_CLASS_ST + || ia64_safe_itanium_class (next) == ITANIUM_CLASS_STF) + && ia64_st_address_bypass_p (insn, next)) + break; + else if ((ia64_safe_itanium_class (next) == ITANIUM_CLASS_LD + || ia64_safe_itanium_class (next) + == ITANIUM_CLASS_FLD) + && ia64_ld_address_bypass_p (insn, next)) + break; + } + insn->call = link != 0; + } } -/* We can't schedule more insns this cycle. Fix up the scheduling state - and advance FIRST_SLOT and CUR. - We have to distribute the insns that are currently found between - FIRST_SLOT and CUR into the slots of the packet we have selected. So - far, they are stored successively in the fields starting at FIRST_SLOT; - now they must be moved to the correct slots. - DUMP is the current scheduling dump file, or NULL. */ +/* We're beginning a new block. Initialize data structures as necessary. 
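+ After reload this amounts to resetting the insn group barrier state
+ and the last scheduled insn; with ENABLE_CHECKING it also verifies
+ that no SCHED_GROUP_P insns survived into this scheduling pass.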
*/ static void -cycle_end_fill_slots (dump) - FILE *dump; +ia64_sched_init (FILE *dump ATTRIBUTE_UNUSED, + int sched_verbose ATTRIBUTE_UNUSED, + int max_ready ATTRIBUTE_UNUSED) { - const struct ia64_packet *packet = sched_data.packet; - int slot, i; - enum attr_type tmp_types[6]; - rtx tmp_insns[6]; - - memcpy (tmp_types, sched_data.types, 6 * sizeof (enum attr_type)); - memcpy (tmp_insns, sched_data.insns, 6 * sizeof (rtx)); +#ifdef ENABLE_CHECKING + rtx insn; - for (i = slot = sched_data.first_slot; i < sched_data.cur; i++) - { - enum attr_type t = tmp_types[i]; - if (t != ia64_safe_type (tmp_insns[i])) + if (reload_completed) + for (insn = NEXT_INSN (current_sched_info->prev_head); + insn != current_sched_info->next_tail; + insn = NEXT_INSN (insn)) + if (SCHED_GROUP_P (insn)) abort (); - while (! insn_matches_slot (packet, t, slot, tmp_insns[i])) - { - if (slot > sched_data.split) - abort (); - if (dump) - fprintf (dump, "// Packet needs %s, have %s\n", - type_names[packet->t[slot]], type_names[t]); - sched_data.types[slot] = packet->t[slot]; - sched_data.insns[slot] = 0; - sched_data.stopbit[slot] = 0; - - /* ??? TYPE_L instructions always fill up two slots, but we don't - support TYPE_L nops. */ - if (packet->t[slot] == TYPE_L) - abort (); - - slot++; - } - - /* Do _not_ use T here. If T == TYPE_A, then we'd risk changing the - actual slot type later. */ - sched_data.types[slot] = packet->t[slot]; - sched_data.insns[slot] = tmp_insns[i]; - sched_data.stopbit[slot] = 0; - slot++; - - /* TYPE_L instructions always fill up two slots. */ - if (t == TYPE_L) - { - sched_data.types[slot] = packet->t[slot]; - sched_data.insns[slot] = 0; - sched_data.stopbit[slot] = 0; - slot++; - } - } - - /* This isn't right - there's no need to pad out until the forced split; - the CPU will automatically split if an insn isn't ready. */ -#if 0 - while (slot < sched_data.split) - { - sched_data.types[slot] = packet->t[slot]; - sched_data.insns[slot] = 0; - sched_data.stopbit[slot] = 0; - slot++; - } #endif - - sched_data.first_slot = sched_data.cur = slot; + last_scheduled_insn = NULL_RTX; + init_insn_group_barriers (); } -/* Bundle rotations, as described in the Itanium optimization manual. - We can rotate either one or both bundles out of the issue window. - DUMP is the current scheduling dump file, or NULL. */ - -static void -rotate_one_bundle (dump) - FILE *dump; -{ - if (dump) - fprintf (dump, "// Rotating one bundle.\n"); - - finish_last_head (dump, 0); - if (sched_data.cur > 3) - { - sched_data.cur -= 3; - sched_data.first_slot -= 3; - memmove (sched_data.types, - sched_data.types + 3, - sched_data.cur * sizeof *sched_data.types); - memmove (sched_data.stopbit, - sched_data.stopbit + 3, - sched_data.cur * sizeof *sched_data.stopbit); - memmove (sched_data.insns, - sched_data.insns + 3, - sched_data.cur * sizeof *sched_data.insns); - sched_data.packet - = &packets[(sched_data.packet->t2 - bundle) * NR_BUNDLES]; - } - else - { - sched_data.cur = 0; - sched_data.first_slot = 0; - } -} +/* We are about to being issuing insns for this clock cycle. + Override the default sort algorithm to better slot instructions. 
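+
+ The ready queue is an array whose last element is issued first.
+ As a rough sketch of the reordering done below (shown from lowest
+ to highest issue priority):
+
+   before:  [ insn1  asm1  insn2  asm2 ]
+   after:   [ asm1   asm2  insn1  insn2 ]
+
+ i.e. asm statements are kept at the low-priority end and, in the
+ final schedule, insns needing a stop bit are moved below those
+ that need none.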
*/ -static void -rotate_two_bundles (dump) - FILE *dump; +static int +ia64_dfa_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, + int *pn_ready, int clock_var ATTRIBUTE_UNUSED, + int reorder_type) { - if (dump) - fprintf (dump, "// Rotating two bundles.\n"); - - if (sched_data.cur == 0) - return; - - finish_last_head (dump, 0); - if (sched_data.cur > 3) - finish_last_head (dump, 3); - sched_data.cur = 0; - sched_data.first_slot = 0; -} - -/* We're beginning a new block. Initialize data structures as necessary. */ + int n_asms; + int n_ready = *pn_ready; + rtx *e_ready = ready + n_ready; + rtx *insnp; -static void -ia64_sched_init (dump, sched_verbose, max_ready) - FILE *dump ATTRIBUTE_UNUSED; - int sched_verbose ATTRIBUTE_UNUSED; - int max_ready; -{ - static int initialized = 0; + if (sched_verbose) + fprintf (dump, "// ia64_dfa_sched_reorder (type %d):\n", reorder_type); - if (! initialized) + if (reorder_type == 0) { - int b1, b2, i; - - initialized = 1; - - for (i = b1 = 0; b1 < NR_BUNDLES; b1++) - { - const struct bundle *t1 = bundle + b1; - for (b2 = 0; b2 < NR_BUNDLES; b2++, i++) - { - const struct bundle *t2 = bundle + b2; + /* First, move all USEs, CLOBBERs and other crud out of the way. */ + n_asms = 0; + for (insnp = ready; insnp < e_ready; insnp++) + if (insnp < e_ready) + { + rtx insn = *insnp; + enum attr_type t = ia64_safe_type (insn); + if (t == TYPE_UNKNOWN) + { + if (GET_CODE (PATTERN (insn)) == ASM_INPUT + || asm_noperands (PATTERN (insn)) >= 0) + { + rtx lowest = ready[n_asms]; + ready[n_asms] = insn; + *insnp = lowest; + n_asms++; + } + else + { + rtx highest = ready[n_ready - 1]; + ready[n_ready - 1] = insn; + *insnp = highest; + return 1; + } + } + } - packets[i].t1 = t1; - packets[i].t2 = t2; - } - } - for (i = 0; i < NR_PACKETS; i++) + if (n_asms < n_ready) { - int j; - for (j = 0; j < 3; j++) - packets[i].t[j] = packets[i].t1->t[j]; - for (j = 0; j < 3; j++) - packets[i].t[j + 3] = packets[i].t2->t[j]; - packets[i].first_split = itanium_split_issue (packets + i, 0); + /* Some normal insns to process. Skip the asms. */ + ready += n_asms; + n_ready -= n_asms; } - + else if (n_ready > 0) + return 1; } - init_insn_group_barriers (); - - memset (&sched_data, 0, sizeof sched_data); - sched_types = (enum attr_type *) xmalloc (max_ready - * sizeof (enum attr_type)); - sched_ready = (rtx *) xmalloc (max_ready * sizeof (rtx)); -} - -/* See if the packet P can match the insns we have already scheduled. Return - nonzero if so. In *PSLOT, we store the first slot that is available for - more instructions if we choose this packet. - SPLIT holds the last slot we can use, there's a split issue after it so - scheduling beyond it would cause us to use more than one cycle. */ + if (ia64_final_schedule) + { + int deleted = 0; + int nr_need_stop = 0; -static int -packet_matches_p (p, split, pslot) - const struct ia64_packet *p; - int split; - int *pslot; -{ - int filled = sched_data.cur; - int first = sched_data.first_slot; - int i, slot; - - /* First, check if the first of the two bundles must be a specific one (due - to stop bits). */ - if (first > 0 && sched_data.stopbit[0] && p->t1->possible_stop != 1) - return 0; - if (first > 1 && sched_data.stopbit[1] && p->t1->possible_stop != 2) - return 0; + for (insnp = ready; insnp < e_ready; insnp++) + if (safe_group_barrier_needed_p (*insnp)) + nr_need_stop++; - for (i = 0; i < first; i++) - if (! 
insn_matches_slot (p, sched_data.types[i], i, - sched_data.insns[i])) - return 0; - for (i = slot = first; i < filled; i++) - { - while (slot < split) - { - if (insn_matches_slot (p, sched_data.types[i], slot, - sched_data.insns[i])) - break; - slot++; - } - if (slot == split) + if (reorder_type == 1 && n_ready == nr_need_stop) return 0; - slot++; + if (reorder_type == 0) + return 1; + insnp = e_ready; + /* Move down everything that needs a stop bit, preserving + relative order. */ + while (insnp-- > ready + deleted) + while (insnp >= ready + deleted) + { + rtx insn = *insnp; + if (! safe_group_barrier_needed_p (insn)) + break; + memmove (ready + 1, ready, (insnp - ready) * sizeof (rtx)); + *ready = insn; + deleted++; + } + n_ready -= deleted; + ready += deleted; } - if (pslot) - *pslot = slot; return 1; } -/* A frontend for itanium_split_issue. For a packet P and a slot - number FIRST that describes the start of the current clock cycle, - return the slot number of the first split issue. This function - uses the cached number found in P if possible. */ +/* We are about to being issuing insns for this clock cycle. Override + the default sort algorithm to better slot instructions. */ static int -get_split (p, first) - const struct ia64_packet *p; - int first; +ia64_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready, + int clock_var) { - if (first == 0) - return p->first_split; - return itanium_split_issue (p, first); + return ia64_dfa_sched_reorder (dump, sched_verbose, ready, + pn_ready, clock_var, 0); } -/* Given N_READY insns in the array READY, whose types are found in the - corresponding array TYPES, return the insn that is best suited to be - scheduled in slot SLOT of packet P. */ +/* Like ia64_sched_reorder, but called after issuing each insn. + Override the default sort algorithm to better slot instructions. */ static int -find_best_insn (ready, types, n_ready, p, slot) - rtx *ready; - enum attr_type *types; - int n_ready; - const struct ia64_packet *p; - int slot; -{ - int best = -1; - int best_pri = 0; - while (n_ready-- > 0) - { - rtx insn = ready[n_ready]; - if (! insn) - continue; - if (best >= 0 && INSN_PRIORITY (ready[n_ready]) < best_pri) - break; - /* If we have equally good insns, one of which has a stricter - slot requirement, prefer the one with the stricter requirement. */ - if (best >= 0 && types[n_ready] == TYPE_A) - continue; - if (insn_matches_slot (p, types[n_ready], slot, insn)) - { - best = n_ready; - best_pri = INSN_PRIORITY (ready[best]); +ia64_sched_reorder2 (FILE *dump ATTRIBUTE_UNUSED, + int sched_verbose ATTRIBUTE_UNUSED, rtx *ready, + int *pn_ready, int clock_var) +{ + if (ia64_tune == PROCESSOR_ITANIUM && reload_completed && last_scheduled_insn) + clocks [INSN_UID (last_scheduled_insn)] = clock_var; + return ia64_dfa_sched_reorder (dump, sched_verbose, ready, pn_ready, + clock_var, 1); +} - /* If there's no way we could get a stricter requirement, stop - looking now. */ - if (types[n_ready] != TYPE_A - && ia64_safe_itanium_requires_unit0 (ready[n_ready])) - break; - break; - } +/* We are about to issue INSN. Return the number of insns left on the + ready queue that can be issued this cycle. 
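+ As a side effect the current DFA state is saved in prev_cycle_state
+ (so that ia64_dfa_new_cycle can replay a stop bit at a cycle
+ boundary) and, after reload, stops_p records whether a stop was
+ scheduled immediately before INSN.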
*/ + +static int +ia64_variable_issue (FILE *dump ATTRIBUTE_UNUSED, + int sched_verbose ATTRIBUTE_UNUSED, + rtx insn ATTRIBUTE_UNUSED, + int can_issue_more ATTRIBUTE_UNUSED) +{ + last_scheduled_insn = insn; + memcpy (prev_cycle_state, curr_state, dfa_state_size); + if (reload_completed) + { + if (group_barrier_needed_p (insn)) + abort (); + if (GET_CODE (insn) == CALL_INSN) + init_insn_group_barriers (); + stops_p [INSN_UID (insn)] = stop_before_p; + stop_before_p = 0; } - return best; + return 1; } -/* Select the best packet to use given the current scheduler state and the - current ready list. - READY is an array holding N_READY ready insns; TYPES is a corresponding - array that holds their types. Store the best packet in *PPACKET and the - number of insns that can be scheduled in the current cycle in *PBEST. */ +/* We are choosing insn from the ready queue. Return nonzero if INSN + can be chosen. */ -static void -find_best_packet (pbest, ppacket, ready, types, n_ready) - int *pbest; - const struct ia64_packet **ppacket; - rtx *ready; - enum attr_type *types; - int n_ready; -{ - int first = sched_data.first_slot; - int best = 0; - int lowest_end = 6; - const struct ia64_packet *best_packet = NULL; - int i; - - for (i = 0; i < NR_PACKETS; i++) - { - const struct ia64_packet *p = packets + i; - int slot; - int split = get_split (p, first); - int win = 0; - int first_slot, last_slot; - int b_nops = 0; +static int +ia64_first_cycle_multipass_dfa_lookahead_guard (rtx insn) +{ + if (insn == NULL_RTX || !INSN_P (insn)) + abort (); + return (!reload_completed + || !safe_group_barrier_needed_p (insn)); +} - if (! packet_matches_p (p, split, &first_slot)) - continue; +/* The following variable value is pseudo-insn used by the DFA insn + scheduler to change the DFA state when the simulated clock is + increased. */ - memcpy (sched_ready, ready, n_ready * sizeof (rtx)); +static rtx dfa_pre_cycle_insn; - win = 0; - last_slot = 6; - for (slot = first_slot; slot < split; slot++) - { - int insn_nr; +/* We are about to being issuing INSN. Return nonzero if we can not + issue it on given cycle CLOCK and return zero if we should not sort + the ready queue on the next clock start. */ - /* Disallow a degenerate case where the first bundle doesn't - contain anything but NOPs! */ - if (first_slot == 0 && win == 0 && slot == 3) - { - win = -1; - break; - } +static int +ia64_dfa_new_cycle (FILE *dump, int verbose, rtx insn, int last_clock, + int clock, int *sort_p) +{ + int setup_clocks_p = FALSE; - insn_nr = find_best_insn (sched_ready, types, n_ready, p, slot); - if (insn_nr >= 0) - { - sched_ready[insn_nr] = 0; - last_slot = slot; - win++; - } - else if (p->t[slot] == TYPE_B) - b_nops++; - } - /* We must disallow MBB/BBB packets if any of their B slots would be - filled with nops. */ - if (last_slot < 3) + if (insn == NULL_RTX || !INSN_P (insn)) + abort (); + if ((reload_completed && safe_group_barrier_needed_p (insn)) + || (last_scheduled_insn + && (GET_CODE (last_scheduled_insn) == CALL_INSN + || GET_CODE (PATTERN (last_scheduled_insn)) == ASM_INPUT + || asm_noperands (PATTERN (last_scheduled_insn)) >= 0))) + { + init_insn_group_barriers (); + if (verbose && dump) + fprintf (dump, "// Stop should be before %d%s\n", INSN_UID (insn), + last_clock == clock ? 
" + cycle advance" : ""); + stop_before_p = 1; + if (last_clock == clock) { - if (p->t[1] == TYPE_B && (b_nops || last_slot < 2)) - win = -1; + state_transition (curr_state, dfa_stop_insn); + if (TARGET_EARLY_STOP_BITS) + *sort_p = (last_scheduled_insn == NULL_RTX + || GET_CODE (last_scheduled_insn) != CALL_INSN); + else + *sort_p = 0; + return 1; } + else if (reload_completed) + setup_clocks_p = TRUE; + if (GET_CODE (PATTERN (last_scheduled_insn)) == ASM_INPUT + || asm_noperands (PATTERN (last_scheduled_insn)) >= 0) + state_reset (curr_state); else { - if (p->t[4] == TYPE_B && (b_nops || last_slot < 5)) - win = -1; + memcpy (curr_state, prev_cycle_state, dfa_state_size); + state_transition (curr_state, dfa_stop_insn); + state_transition (curr_state, dfa_pre_cycle_insn); + state_transition (curr_state, NULL); } + } + else if (reload_completed) + setup_clocks_p = TRUE; + if (setup_clocks_p && ia64_tune == PROCESSOR_ITANIUM + && GET_CODE (PATTERN (insn)) != ASM_INPUT + && asm_noperands (PATTERN (insn)) < 0) + { + enum attr_itanium_class c = ia64_safe_itanium_class (insn); - if (win > best - || (win == best && last_slot < lowest_end)) + if (c != ITANIUM_CLASS_MMMUL && c != ITANIUM_CLASS_MMSHF) { - best = win; - lowest_end = last_slot; - best_packet = p; + rtx link; + int d = -1; + + for (link = LOG_LINKS (insn); link; link = XEXP (link, 1)) + if (REG_NOTE_KIND (link) == 0) + { + enum attr_itanium_class dep_class; + rtx dep_insn = XEXP (link, 0); + + dep_class = ia64_safe_itanium_class (dep_insn); + if ((dep_class == ITANIUM_CLASS_MMMUL + || dep_class == ITANIUM_CLASS_MMSHF) + && last_clock - clocks [INSN_UID (dep_insn)] < 4 + && (d < 0 + || last_clock - clocks [INSN_UID (dep_insn)] < d)) + d = last_clock - clocks [INSN_UID (dep_insn)]; + } + if (d >= 0) + add_cycles [INSN_UID (insn)] = 3 - d; } } - *pbest = best; - *ppacket = best_packet; + return 0; } -/* Reorder the ready list so that the insns that can be issued in this cycle - are found in the correct order at the end of the list. - DUMP is the scheduling dump file, or NULL. READY points to the start, - E_READY to the end of the ready list. MAY_FAIL determines what should be - done if no insns can be scheduled in this cycle: if it is zero, we abort, - otherwise we return 0. - Return 1 if any insns can be scheduled in this cycle. */ + -static int -itanium_reorder (dump, ready, e_ready, may_fail) - FILE *dump; - rtx *ready; - rtx *e_ready; - int may_fail; -{ - const struct ia64_packet *best_packet; - int n_ready = e_ready - ready; - int first = sched_data.first_slot; - int i, best, best_split, filled; +/* The following page contains abstract data `bundle states' which are + used for bundling insns (inserting nops and template generation). */ + +/* The following describes state of insn bundling. */ + +struct bundle_state +{ + /* Unique bundle state number to identify them in the debugging + output */ + int unique_num; + rtx insn; /* corresponding insn, NULL for the 1st and the last state */ + /* number nops before and after the insn */ + short before_nops_num, after_nops_num; + int insn_num; /* insn number (0 - for initial state, 1 - for the 1st + insn */ + int cost; /* cost of the state in cycles */ + int accumulated_insns_num; /* number of all previous insns including + nops. 
L is considered as 2 insns */
+ int branch_deviation; /* deviation of previous branches from 3rd slots */
+ struct bundle_state *next; /* next state with the same insn_num */
+ struct bundle_state *originator; /* originator (previous insn state) */
+ /* All bundle states are in the following chain. */
+ struct bundle_state *allocated_states_chain;
+ /* The DFA State after issuing the insn and the nops. */
+ state_t dfa_state;
+};
- for (i = 0; i < n_ready; i++)
- sched_types[i] = ia64_safe_type (ready[i]);
+/* The following maps an insn number to the corresponding bundle state. */
- find_best_packet (&best, &best_packet, ready, sched_types, n_ready);
+static struct bundle_state **index_to_bundle_states;
- if (best == 0)
- {
- if (may_fail)
- return 0;
- abort ();
- }
+/* The unique number of the next bundle state. */
- if (dump)
- {
- fprintf (dump, "// Selected bundles: %s %s (%d insns)\n",
- best_packet->t1->name,
- best_packet->t2 ? best_packet->t2->name : NULL, best);
- }
+static int bundle_states_num;
- best_split = itanium_split_issue (best_packet, first);
- packet_matches_p (best_packet, best_split, &filled);
+/* All allocated bundle states are in the following chain. */
- for (i = filled; i < best_split; i++)
- {
- int insn_nr;
+static struct bundle_state *allocated_bundle_states_chain;
- insn_nr = find_best_insn (ready, sched_types, n_ready, best_packet, i);
- if (insn_nr >= 0)
- {
- rtx insn = ready[insn_nr];
- memmove (ready + insn_nr, ready + insn_nr + 1,
- (n_ready - insn_nr - 1) * sizeof (rtx));
- memmove (sched_types + insn_nr, sched_types + insn_nr + 1,
- (n_ready - insn_nr - 1) * sizeof (enum attr_type));
- ready[--n_ready] = insn;
- }
- }
+/* All allocated but not used bundle states are in the following
+ chain. */
- sched_data.packet = best_packet;
- sched_data.split = best_split;
- return 1;
-}
+static struct bundle_state *free_bundle_state_chain;
-/* Dump information about the current scheduling state to file DUMP. */
-static void
-dump_current_packet (dump)
- FILE *dump;
+/* The following function returns a free bundle state. */
+
+static struct bundle_state *
+get_free_bundle_state (void)
 {
- int i;
- fprintf (dump, "// %d slots filled:", sched_data.cur);
- for (i = 0; i < sched_data.first_slot; i++)
+ struct bundle_state *result;
+
+ if (free_bundle_state_chain != NULL)
 {
- rtx insn = sched_data.insns[i];
- fprintf (dump, " %s", type_names[sched_data.types[i]]);
- if (insn)
- fprintf (dump, "/%s", type_names[ia64_safe_type (insn)]);
- if (sched_data.stopbit[i])
- fprintf (dump, " ;;");
+ result = free_bundle_state_chain;
+ free_bundle_state_chain = result->next;
 }
- fprintf (dump, " :::");
- for (i = sched_data.first_slot; i < sched_data.cur; i++)
+ else
 {
- rtx insn = sched_data.insns[i];
- enum attr_type t = ia64_safe_type (insn);
- fprintf (dump, " (%d) %s", INSN_UID (insn), type_names[t]);
+ result = xmalloc (sizeof (struct bundle_state));
+ result->dfa_state = xmalloc (dfa_state_size);
+ result->allocated_states_chain = allocated_bundle_states_chain;
+ allocated_bundle_states_chain = result;
 }
- fprintf (dump, "\n");
+ result->unique_num = bundle_states_num++;
+ return result;
+
+}
-/* Schedule a stop bit. DUMP is the current scheduling dump file, or
- NULL. */
+/* The following function frees the given bundle state.
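+ The state is not deallocated: it is pushed onto
+ free_bundle_state_chain for reuse by get_free_bundle_state, and
+ its memory is only released by finish_bundle_states.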
*/ static void -schedule_stop (dump) - FILE *dump; +free_bundle_state (struct bundle_state *state) { - const struct ia64_packet *best = sched_data.packet; - int i; - int best_stop = 6; + state->next = free_bundle_state_chain; + free_bundle_state_chain = state; +} - if (dump) - fprintf (dump, "// Stop bit, cur = %d.\n", sched_data.cur); +/* Start work with abstract data `bundle states'. */ - if (sched_data.cur == 0) - { - if (dump) - fprintf (dump, "// At start of bundle, so nothing to do.\n"); +static void +initiate_bundle_states (void) +{ + bundle_states_num = 0; + free_bundle_state_chain = NULL; + allocated_bundle_states_chain = NULL; +} - rotate_two_bundles (NULL); - return; - } +/* Finish work with abstract data `bundle states'. */ - for (i = -1; i < NR_PACKETS; i++) +static void +finish_bundle_states (void) +{ + struct bundle_state *curr_state, *next_state; + + for (curr_state = allocated_bundle_states_chain; + curr_state != NULL; + curr_state = next_state) { - /* This is a slight hack to give the current packet the first chance. - This is done to avoid e.g. switching from MIB to MBB bundles. */ - const struct ia64_packet *p = (i >= 0 ? packets + i : sched_data.packet); - int split = get_split (p, sched_data.first_slot); - const struct bundle *compare; - int next, stoppos; + next_state = curr_state->allocated_states_chain; + free (curr_state->dfa_state); + free (curr_state); + } +} - if (! packet_matches_p (p, split, &next)) - continue; +/* Hash table of the bundle states. The key is dfa_state and insn_num + of the bundle states. */ - compare = next > 3 ? p->t2 : p->t1; +static htab_t bundle_state_table; - stoppos = 3; - if (compare->possible_stop) - stoppos = compare->possible_stop; - if (next > 3) - stoppos += 3; +/* The function returns hash of BUNDLE_STATE. */ - if (stoppos < next || stoppos >= best_stop) - { - if (compare->possible_stop == 0) - continue; - stoppos = (next > 3 ? 6 : 3); - } - if (stoppos < next || stoppos >= best_stop) - continue; +static unsigned +bundle_state_hash (const void *bundle_state) +{ + const struct bundle_state *state = (struct bundle_state *) bundle_state; + unsigned result, i; + + for (result = i = 0; i < dfa_state_size; i++) + result += (((unsigned char *) state->dfa_state) [i] + << ((i % CHAR_BIT) * 3 + CHAR_BIT)); + return result + state->insn_num; +} + +/* The function returns nonzero if the bundle state keys are equal. */ + +static int +bundle_state_eq_p (const void *bundle_state_1, const void *bundle_state_2) +{ + const struct bundle_state * state1 = (struct bundle_state *) bundle_state_1; + const struct bundle_state * state2 = (struct bundle_state *) bundle_state_2; + + return (state1->insn_num == state2->insn_num + && memcmp (state1->dfa_state, state2->dfa_state, + dfa_state_size) == 0); +} - if (dump) - fprintf (dump, "// switching from %s %s to %s %s (stop at %d)\n", - best->t1->name, best->t2->name, p->t1->name, p->t2->name, - stoppos); +/* The function inserts the BUNDLE_STATE into the hash table. The + function returns nonzero if the bundle has been inserted into the + table. The table contains the best bundle state with given key. 
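+
+ Schematically, a candidate state A replaces the tabled state B with
+ the same key exactly when A is lexicographically better; the
+ condition below is equivalent to this sketch (not literal code):
+
+   A->cost < B->cost
+   || (A->cost == B->cost
+       && (A->accumulated_insns_num < B->accumulated_insns_num
+           || (A->accumulated_insns_num == B->accumulated_insns_num
+               && A->branch_deviation < B->branch_deviation)))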
*/ - best_stop = stoppos; - best = p; +static int +insert_bundle_state (struct bundle_state *bundle_state) +{ + void **entry_ptr; + + entry_ptr = htab_find_slot (bundle_state_table, bundle_state, 1); + if (*entry_ptr == NULL) + { + bundle_state->next = index_to_bundle_states [bundle_state->insn_num]; + index_to_bundle_states [bundle_state->insn_num] = bundle_state; + *entry_ptr = (void *) bundle_state; + return TRUE; } + else if (bundle_state->cost < ((struct bundle_state *) *entry_ptr)->cost + || (bundle_state->cost == ((struct bundle_state *) *entry_ptr)->cost + && (((struct bundle_state *)*entry_ptr)->accumulated_insns_num + > bundle_state->accumulated_insns_num + || (((struct bundle_state *) + *entry_ptr)->accumulated_insns_num + == bundle_state->accumulated_insns_num + && ((struct bundle_state *) + *entry_ptr)->branch_deviation + > bundle_state->branch_deviation)))) - sched_data.packet = best; - cycle_end_fill_slots (dump); - while (sched_data.cur < best_stop) { - sched_data.types[sched_data.cur] = best->t[sched_data.cur]; - sched_data.insns[sched_data.cur] = 0; - sched_data.stopbit[sched_data.cur] = 0; - sched_data.cur++; + struct bundle_state temp; + + temp = *(struct bundle_state *) *entry_ptr; + *(struct bundle_state *) *entry_ptr = *bundle_state; + ((struct bundle_state *) *entry_ptr)->next = temp.next; + *bundle_state = temp; } - sched_data.stopbit[sched_data.cur - 1] = 1; - sched_data.first_slot = best_stop; + return FALSE; +} - if (dump) - dump_current_packet (dump); +/* Start work with the hash table. */ + +static void +initiate_bundle_state_table (void) +{ + bundle_state_table = htab_create (50, bundle_state_hash, bundle_state_eq_p, + (htab_del) 0); } -/* If necessary, perform one or two rotations on the scheduling state. - This should only be called if we are starting a new cycle. */ +/* Finish work with the hash table. */ static void -maybe_rotate (dump) - FILE *dump; +finish_bundle_state_table (void) { - cycle_end_fill_slots (dump); - if (sched_data.cur == 6) - rotate_two_bundles (dump); - else if (sched_data.cur >= 3) - rotate_one_bundle (dump); - sched_data.first_slot = sched_data.cur; + htab_delete (bundle_state_table); } -/* The clock cycle when ia64_sched_reorder was last called. */ -static int prev_cycle; + -/* The first insn scheduled in the previous cycle. This is the saved - value of sched_data.first_slot. */ -static int prev_first; +/* The following variable is a insn `nop' used to check bundle states + with different number of inserted nops. */ -/* Emit NOPs to fill the delay between PREV_CYCLE and CLOCK_VAR. Used to - pad out the delay between MM (shifts, etc.) and integer operations. */ +static rtx ia64_nop; -static void -nop_cycles_until (clock_var, dump) - int clock_var; - FILE *dump; +/* The following function tries to issue NOPS_NUM nops for the current + state without advancing processor cycle. If it failed, the + function returns FALSE and frees the current state. */ + +static int +try_issue_nops (struct bundle_state *curr_state, int nops_num) { - int prev_clock = prev_cycle; - int cycles_left = clock_var - prev_clock; - bool did_stop = false; + int i; - /* Finish the previous cycle; pad it out with NOPs. */ - if (sched_data.cur == 3) + for (i = 0; i < nops_num; i++) + if (state_transition (curr_state->dfa_state, ia64_nop) >= 0) + { + free_bundle_state (curr_state); + return FALSE; + } + return TRUE; +} + +/* The following function tries to issue INSN for the current + state without advancing processor cycle. 
If it failed, the + function returns FALSE and frees the current state. */ + +static int +try_issue_insn (struct bundle_state *curr_state, rtx insn) +{ + if (insn && state_transition (curr_state->dfa_state, insn) >= 0) { - sched_emit_insn (gen_insn_group_barrier (GEN_INT (3))); - did_stop = true; - maybe_rotate (dump); + free_bundle_state (curr_state); + return FALSE; } - else if (sched_data.cur > 0) - { - int need_stop = 0; - int split = itanium_split_issue (sched_data.packet, prev_first); - - if (sched_data.cur < 3 && split > 3) - { - split = 3; - need_stop = 1; - } + return TRUE; +} - if (split > sched_data.cur) - { - int i; - for (i = sched_data.cur; i < split; i++) - { - rtx t = sched_emit_insn (gen_nop_type (sched_data.packet->t[i])); - sched_data.types[i] = sched_data.packet->t[i]; - sched_data.insns[i] = t; - sched_data.stopbit[i] = 0; - } - sched_data.cur = split; - } +/* The following function tries to issue BEFORE_NOPS_NUM nops and INSN + starting with ORIGINATOR without advancing processor cycle. If + TRY_BUNDLE_END_P is TRUE, the function also/only (if + ONLY_BUNDLE_END_P is TRUE) tries to issue nops to fill all bundle. + If it was successful, the function creates new bundle state and + insert into the hash table and into `index_to_bundle_states'. */ - if (! need_stop && sched_data.cur > 0 && sched_data.cur < 6 - && cycles_left > 1) +static void +issue_nops_and_insn (struct bundle_state *originator, int before_nops_num, + rtx insn, int try_bundle_end_p, int only_bundle_end_p) +{ + struct bundle_state *curr_state; + + curr_state = get_free_bundle_state (); + memcpy (curr_state->dfa_state, originator->dfa_state, dfa_state_size); + curr_state->insn = insn; + curr_state->insn_num = originator->insn_num + 1; + curr_state->cost = originator->cost; + curr_state->originator = originator; + curr_state->before_nops_num = before_nops_num; + curr_state->after_nops_num = 0; + curr_state->accumulated_insns_num + = originator->accumulated_insns_num + before_nops_num; + curr_state->branch_deviation = originator->branch_deviation; + if (insn == NULL_RTX) + abort (); + else if (INSN_CODE (insn) == CODE_FOR_insn_group_barrier) + { + if (GET_MODE (insn) == TImode) + abort (); + if (!try_issue_nops (curr_state, before_nops_num)) + return; + if (!try_issue_insn (curr_state, insn)) + return; + memcpy (temp_dfa_state, curr_state->dfa_state, dfa_state_size); + if (state_transition (temp_dfa_state, dfa_pre_cycle_insn) >= 0 + && curr_state->accumulated_insns_num % 3 != 0) { - int i; - for (i = sched_data.cur; i < 6; i++) - { - rtx t = sched_emit_insn (gen_nop_type (sched_data.packet->t[i])); - sched_data.types[i] = sched_data.packet->t[i]; - sched_data.insns[i] = t; - sched_data.stopbit[i] = 0; - } - sched_data.cur = 6; - cycles_left--; - need_stop = 1; + free_bundle_state (curr_state); + return; } - - if (need_stop || sched_data.cur == 6) + } + else if (GET_MODE (insn) != TImode) + { + if (!try_issue_nops (curr_state, before_nops_num)) + return; + if (!try_issue_insn (curr_state, insn)) + return; + curr_state->accumulated_insns_num++; + if (GET_CODE (PATTERN (insn)) == ASM_INPUT + || asm_noperands (PATTERN (insn)) >= 0) + abort (); + if (ia64_safe_type (insn) == TYPE_L) + curr_state->accumulated_insns_num++; + } + else + { + state_transition (curr_state->dfa_state, dfa_pre_cycle_insn); + state_transition (curr_state->dfa_state, NULL); + curr_state->cost++; + if (!try_issue_nops (curr_state, before_nops_num)) + return; + if (!try_issue_insn (curr_state, insn)) + return; + 
curr_state->accumulated_insns_num++; + if (GET_CODE (PATTERN (insn)) == ASM_INPUT + || asm_noperands (PATTERN (insn)) >= 0) { - sched_emit_insn (gen_insn_group_barrier (GEN_INT (3))); - did_stop = true; + /* Finish bundle containing asm insn. */ + curr_state->after_nops_num + = 3 - curr_state->accumulated_insns_num % 3; + curr_state->accumulated_insns_num + += 3 - curr_state->accumulated_insns_num % 3; } - maybe_rotate (dump); + else if (ia64_safe_type (insn) == TYPE_L) + curr_state->accumulated_insns_num++; } - - cycles_left--; - while (cycles_left > 0) + if (ia64_safe_type (insn) == TYPE_B) + curr_state->branch_deviation + += 2 - (curr_state->accumulated_insns_num - 1) % 3; + if (try_bundle_end_p && curr_state->accumulated_insns_num % 3 != 0) { - sched_emit_insn (gen_bundle_selector (GEN_INT (0))); - sched_emit_insn (gen_nop_type (TYPE_M)); - sched_emit_insn (gen_nop_type (TYPE_I)); - if (cycles_left > 1) + if (!only_bundle_end_p && insert_bundle_state (curr_state)) { - sched_emit_insn (gen_insn_group_barrier (GEN_INT (2))); - cycles_left--; + state_t dfa_state; + struct bundle_state *curr_state1; + struct bundle_state *allocated_states_chain; + + curr_state1 = get_free_bundle_state (); + dfa_state = curr_state1->dfa_state; + allocated_states_chain = curr_state1->allocated_states_chain; + *curr_state1 = *curr_state; + curr_state1->dfa_state = dfa_state; + curr_state1->allocated_states_chain = allocated_states_chain; + memcpy (curr_state1->dfa_state, curr_state->dfa_state, + dfa_state_size); + curr_state = curr_state1; } - sched_emit_insn (gen_nop_type (TYPE_I)); - sched_emit_insn (gen_insn_group_barrier (GEN_INT (3))); - did_stop = true; - cycles_left--; + if (!try_issue_nops (curr_state, + 3 - curr_state->accumulated_insns_num % 3)) + return; + curr_state->after_nops_num + = 3 - curr_state->accumulated_insns_num % 3; + curr_state->accumulated_insns_num + += 3 - curr_state->accumulated_insns_num % 3; } - - if (did_stop) - init_insn_group_barriers (); + if (!insert_bundle_state (curr_state)) + free_bundle_state (curr_state); + return; } -/* We are about to being issuing insns for this clock cycle. - Override the default sort algorithm to better slot instructions. */ +/* The following function returns position in the two window bundle + for given STATE. */ static int -ia64_internal_sched_reorder (dump, sched_verbose, ready, pn_ready, - reorder_type, clock_var) - FILE *dump ATTRIBUTE_UNUSED; - int sched_verbose ATTRIBUTE_UNUSED; - rtx *ready; - int *pn_ready; - int reorder_type, clock_var; +get_max_pos (state_t state) { - int n_asms; - int n_ready = *pn_ready; - rtx *e_ready = ready + n_ready; - rtx *insnp; + if (cpu_unit_reservation_p (state, pos_6)) + return 6; + else if (cpu_unit_reservation_p (state, pos_5)) + return 5; + else if (cpu_unit_reservation_p (state, pos_4)) + return 4; + else if (cpu_unit_reservation_p (state, pos_3)) + return 3; + else if (cpu_unit_reservation_p (state, pos_2)) + return 2; + else if (cpu_unit_reservation_p (state, pos_1)) + return 1; + else + return 0; +} - if (sched_verbose) - { - fprintf (dump, "// ia64_sched_reorder (type %d):\n", reorder_type); - dump_current_packet (dump); - } +/* The function returns code of a possible template for given position + and state. The function should be called only with 2 values of + position equal to 3 or 6. */ - /* Work around the pipeline flush that will occurr if the results of - an MM instruction are accessed before the result is ready. 
Intel - documentation says this only happens with IALU, ISHF, ILOG, LD, - and ST consumers, but experimental evidence shows that *any* non-MM - type instruction will incurr the flush. */ - if (reorder_type == 0 && clock_var > 0 && ia64_final_schedule) +static int +get_template (state_t state, int pos) +{ + switch (pos) { - for (insnp = ready; insnp < e_ready; insnp++) - { - rtx insn = *insnp, link; - enum attr_itanium_class t = ia64_safe_itanium_class (insn); + case 3: + if (cpu_unit_reservation_p (state, _0mii_)) + return 0; + else if (cpu_unit_reservation_p (state, _0mmi_)) + return 1; + else if (cpu_unit_reservation_p (state, _0mfi_)) + return 2; + else if (cpu_unit_reservation_p (state, _0mmf_)) + return 3; + else if (cpu_unit_reservation_p (state, _0bbb_)) + return 4; + else if (cpu_unit_reservation_p (state, _0mbb_)) + return 5; + else if (cpu_unit_reservation_p (state, _0mib_)) + return 6; + else if (cpu_unit_reservation_p (state, _0mmb_)) + return 7; + else if (cpu_unit_reservation_p (state, _0mfb_)) + return 8; + else if (cpu_unit_reservation_p (state, _0mlx_)) + return 9; + else + abort (); + case 6: + if (cpu_unit_reservation_p (state, _1mii_)) + return 0; + else if (cpu_unit_reservation_p (state, _1mmi_)) + return 1; + else if (cpu_unit_reservation_p (state, _1mfi_)) + return 2; + else if (_1mmf_ >= 0 && cpu_unit_reservation_p (state, _1mmf_)) + return 3; + else if (cpu_unit_reservation_p (state, _1bbb_)) + return 4; + else if (cpu_unit_reservation_p (state, _1mbb_)) + return 5; + else if (cpu_unit_reservation_p (state, _1mib_)) + return 6; + else if (cpu_unit_reservation_p (state, _1mmb_)) + return 7; + else if (cpu_unit_reservation_p (state, _1mfb_)) + return 8; + else if (cpu_unit_reservation_p (state, _1mlx_)) + return 9; + else + abort (); + default: + abort (); + } +} - if (t == ITANIUM_CLASS_MMMUL - || t == ITANIUM_CLASS_MMSHF - || t == ITANIUM_CLASS_MMSHFI) - continue; +/* The following function returns an insn important for insn bundling + followed by INSN and before TAIL. */ - for (link = LOG_LINKS (insn); link; link = XEXP (link, 1)) - if (REG_NOTE_KIND (link) == 0) - { - rtx other = XEXP (link, 0); - enum attr_itanium_class t0 = ia64_safe_itanium_class (other); - if (t0 == ITANIUM_CLASS_MMSHF || t0 == ITANIUM_CLASS_MMMUL) - { - nop_cycles_until (clock_var, sched_verbose ? dump : NULL); - goto out; - } - } - } - } - out: +static rtx +get_next_important_insn (rtx insn, rtx tail) +{ + for (; insn && insn != tail; insn = NEXT_INSN (insn)) + if (INSN_P (insn) + && ia64_safe_itanium_class (insn) != ITANIUM_CLASS_IGNORE + && GET_CODE (PATTERN (insn)) != USE + && GET_CODE (PATTERN (insn)) != CLOBBER) + return insn; + return NULL_RTX; +} - prev_first = sched_data.first_slot; - prev_cycle = clock_var; +/* The following function does insn bundling. Bundling means + inserting templates and nop insns to fit insn groups into permitted + templates. Instruction scheduling uses NDFA (non-deterministic + finite automata) encoding informations about the templates and the + inserted nops. Nondeterminism of the automata permits follows + all possible insn sequences very fast. + + Unfortunately it is not possible to get information about inserting + nop insns and used templates from the automata states. The + automata only says that we can issue an insn possibly inserting + some nops before it and using some template. Therefore insn + bundling in this function is implemented by using DFA + (deterministic finite automata). 
We follow all possible insn
+ sequences by inserting 0-2 nops (that is what the NDFA describes for
+ insn scheduling) before/after each insn being bundled. We know the
+ start of a simulated processor cycle from insn scheduling (an insn
+ starting a new cycle has TImode).
+
+ A simple implementation of insn bundling would create an enormous
+ number of possible insn sequences satisfying information about new
+ cycle ticks taken from the insn scheduling. To make the algorithm
+ practical we use dynamic programming. Each decision (about
+ inserting nops and implicitly about previous decisions) is described
+ by structure bundle_state (see above). If we generate the same
+ bundle state (the key is the automaton state after issuing the insns
+ and nops for it), we reuse the already generated one. As a
+ consequence we reject some decisions which cannot improve the
+ solution and reduce memory for the algorithm.
+
+ When we reach the end of the EBB (extended basic block), we choose
+ the best insn sequence and then, moving back through the EBB, insert
+ templates for the best alternative. The templates are taken by
+ querying the automaton state for each insn in the chosen bundle
+ states.
+
+ So the algorithm makes two (forward and backward) passes through
+ the EBB. There is an additional forward pass through the EBB for
+ the Itanium1 processor. This pass inserts more nops to make the
+ dependency between a producer insn and an MMMUL/MMSHF insn at
+ least 4 cycles long. */
- if (reorder_type == 0)
- maybe_rotate (sched_verbose ? dump : NULL);
+static void
+bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
+{
+ struct bundle_state *curr_state, *next_state, *best_state;
+ rtx insn, next_insn;
+ int insn_num;
+ int i, bundle_end_p, only_bundle_end_p, asm_p;
+ int pos = 0, max_pos, template0, template1;
+ rtx b;
+ rtx nop;
+ enum attr_type type;
- /* First, move all USEs, CLOBBERs and other crud out of the way. */
- n_asms = 0;
- for (insnp = ready; insnp < e_ready; insnp++)
- if (insnp < e_ready)
+ insn_num = 0;
+ /* Count insns in the EBB. */
+ for (insn = NEXT_INSN (prev_head_insn);
+ insn && insn != tail;
+ insn = NEXT_INSN (insn))
+ if (INSN_P (insn))
+ insn_num++;
+ if (insn_num == 0)
+ return;
+ bundling_p = 1;
+ dfa_clean_insn_cache ();
+ initiate_bundle_state_table ();
+ index_to_bundle_states = xmalloc ((insn_num + 2)
+ * sizeof (struct bundle_state *));
+ /* First (forward) pass -- generation of bundle states. */
+ curr_state = get_free_bundle_state ();
+ curr_state->insn = NULL;
+ curr_state->before_nops_num = 0;
+ curr_state->after_nops_num = 0;
+ curr_state->insn_num = 0;
+ curr_state->cost = 0;
+ curr_state->accumulated_insns_num = 0;
+ curr_state->branch_deviation = 0;
+ curr_state->next = NULL;
+ curr_state->originator = NULL;
+ state_reset (curr_state->dfa_state);
+ index_to_bundle_states [0] = curr_state;
+ insn_num = 0;
+ /* Shift the cycle mark if it is put on an insn which could be ignored.
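+ An insn carrying TImode marks the start of a new simulated
+ processor cycle.  If such an insn is one the bundler skips (an
+ ITANIUM_CLASS_IGNORE insn, a USE or a CLOBBER), the mark is moved
+ forward to the next important insn so that the cycle boundary
+ stays visible during bundling.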
*/
+ for (insn = NEXT_INSN (prev_head_insn);
+ insn != tail;
+ insn = NEXT_INSN (insn))
+ if (INSN_P (insn)
+ && (ia64_safe_itanium_class (insn) == ITANIUM_CLASS_IGNORE
+ || GET_CODE (PATTERN (insn)) == USE
+ || GET_CODE (PATTERN (insn)) == CLOBBER)
+ && GET_MODE (insn) == TImode)
 {
- rtx insn = *insnp;
- enum attr_type t = ia64_safe_type (insn);
- if (t == TYPE_UNKNOWN)
+ PUT_MODE (insn, VOIDmode);
+ for (next_insn = NEXT_INSN (insn);
+ next_insn != tail;
+ next_insn = NEXT_INSN (next_insn))
+ if (INSN_P (next_insn)
+ && ia64_safe_itanium_class (next_insn) != ITANIUM_CLASS_IGNORE
+ && GET_CODE (PATTERN (next_insn)) != USE
+ && GET_CODE (PATTERN (next_insn)) != CLOBBER)
+ {
+ PUT_MODE (next_insn, TImode);
+ break;
+ }
+ }
+ /* Forward pass: generation of bundle states. */
+ for (insn = get_next_important_insn (NEXT_INSN (prev_head_insn), tail);
+ insn != NULL_RTX;
+ insn = next_insn)
+ {
+ if (!INSN_P (insn)
+ || ia64_safe_itanium_class (insn) == ITANIUM_CLASS_IGNORE
+ || GET_CODE (PATTERN (insn)) == USE
+ || GET_CODE (PATTERN (insn)) == CLOBBER)
+ abort ();
+ type = ia64_safe_type (insn);
+ next_insn = get_next_important_insn (NEXT_INSN (insn), tail);
+ insn_num++;
+ index_to_bundle_states [insn_num] = NULL;
+ for (curr_state = index_to_bundle_states [insn_num - 1];
+ curr_state != NULL;
+ curr_state = next_state)
+ {
+ pos = curr_state->accumulated_insns_num % 3;
+ next_state = curr_state->next;
+ /* We must fill up the current bundle in order to start a
+ subsequent asm insn in a new bundle. An asm insn is always
+ placed in a separate bundle. */
+ only_bundle_end_p
+ = (next_insn != NULL_RTX
+ && INSN_CODE (insn) == CODE_FOR_insn_group_barrier
+ && ia64_safe_type (next_insn) == TYPE_UNKNOWN);
+ /* We may fill up the current bundle if it is the cycle end
+ without a group barrier. */
+ bundle_end_p
+ = (only_bundle_end_p || next_insn == NULL_RTX
+ || (GET_MODE (next_insn) == TImode
+ && INSN_CODE (insn) != CODE_FOR_insn_group_barrier));
+ if (type == TYPE_F || type == TYPE_B || type == TYPE_L
+ || type == TYPE_S
+ /* We need to insert 2 nops for cases like M_MII. To
+ guarantee issuing all insns on the same cycle for
+ Itanium 1, we need to issue 2 nops after the first M
+ insn (MnnMII where n is a nop insn). */
+ || ((type == TYPE_M || type == TYPE_A)
+ && ia64_tune == PROCESSOR_ITANIUM
+ && !bundle_end_p && pos == 1))
+ issue_nops_and_insn (curr_state, 2, insn, bundle_end_p,
+ only_bundle_end_p);
+ issue_nops_and_insn (curr_state, 1, insn, bundle_end_p,
+ only_bundle_end_p);
+ issue_nops_and_insn (curr_state, 0, insn, bundle_end_p,
+ only_bundle_end_p);
+ }
+ if (index_to_bundle_states [insn_num] == NULL)
+ abort ();
+ for (curr_state = index_to_bundle_states [insn_num];
+ curr_state != NULL;
+ curr_state = curr_state->next)
+ if (verbose >= 2 && dump)
 {
- if (GET_CODE (PATTERN (insn)) == ASM_INPUT
- || asm_noperands (PATTERN (insn)) >= 0)
- {
- rtx lowest = ready[n_asms];
- ready[n_asms] = insn;
- *insnp = lowest;
- n_asms++;
- }
- else
- {
- rtx highest = ready[n_ready - 1];
- ready[n_ready - 1] = insn;
- *insnp = highest;
- if (ia64_final_schedule && group_barrier_needed_p (insn))
- {
- schedule_stop (sched_verbose ? dump : NULL);
- sched_data.last_was_stop = 1;
- maybe_rotate (sched_verbose ? dump : NULL);
- }
-
- return 1;
- }
+ /* This structure is taken from generated code of the
+ pipeline hazard recognizer (see file insn-attrtab.c).
+ Please don't forget to change the structure if a new
+ automaton is added to the .md file.
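+ Only the bundling automaton state is printed below: the oneb
+ field when tuning for Itanium1 and the twob field otherwise.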
*/ + struct DFA_chip + { + unsigned short one_automaton_state; + unsigned short oneb_automaton_state; + unsigned short two_automaton_state; + unsigned short twob_automaton_state; + }; + + fprintf + (dump, + "// Bundle state %d (orig %d, cost %d, nops %d/%d, insns %d, branch %d, state %d) for %d\n", + curr_state->unique_num, + (curr_state->originator == NULL + ? -1 : curr_state->originator->unique_num), + curr_state->cost, + curr_state->before_nops_num, curr_state->after_nops_num, + curr_state->accumulated_insns_num, curr_state->branch_deviation, + (ia64_tune == PROCESSOR_ITANIUM + ? ((struct DFA_chip *) curr_state->dfa_state)->oneb_automaton_state + : ((struct DFA_chip *) curr_state->dfa_state)->twob_automaton_state), + INSN_UID (insn)); } - } - if (n_asms < n_ready) - { - /* Some normal insns to process. Skip the asms. */ - ready += n_asms; - n_ready -= n_asms; } - else if (n_ready > 0) - { - /* Only asm insns left. */ - if (ia64_final_schedule && group_barrier_needed_p (ready[n_ready - 1])) + if (index_to_bundle_states [insn_num] == NULL) + /* We should find a solution because the 2nd insn scheduling has + found one. */ + abort (); + /* Find a state corresponding to the best insn sequence. */ + best_state = NULL; + for (curr_state = index_to_bundle_states [insn_num]; + curr_state != NULL; + curr_state = curr_state->next) + /* We are just looking at the states with fully filled up last + bundle. The first we prefer insn sequences with minimal cost + then with minimal inserted nops and finally with branch insns + placed in the 3rd slots. */ + if (curr_state->accumulated_insns_num % 3 == 0 + && (best_state == NULL || best_state->cost > curr_state->cost + || (best_state->cost == curr_state->cost + && (curr_state->accumulated_insns_num + < best_state->accumulated_insns_num + || (curr_state->accumulated_insns_num + == best_state->accumulated_insns_num + && curr_state->branch_deviation + < best_state->branch_deviation))))) + best_state = curr_state; + /* Second (backward) pass: adding nops and templates. */ + insn_num = best_state->before_nops_num; + template0 = template1 = -1; + for (curr_state = best_state; + curr_state->originator != NULL; + curr_state = curr_state->originator) + { + insn = curr_state->insn; + asm_p = (GET_CODE (PATTERN (insn)) == ASM_INPUT + || asm_noperands (PATTERN (insn)) >= 0); + insn_num++; + if (verbose >= 2 && dump) { - schedule_stop (sched_verbose ? dump : NULL); - sched_data.last_was_stop = 1; - maybe_rotate (sched_verbose ? dump : NULL); + struct DFA_chip + { + unsigned short one_automaton_state; + unsigned short oneb_automaton_state; + unsigned short two_automaton_state; + unsigned short twob_automaton_state; + }; + + fprintf + (dump, + "// Best %d (orig %d, cost %d, nops %d/%d, insns %d, branch %d, state %d) for %d\n", + curr_state->unique_num, + (curr_state->originator == NULL + ? -1 : curr_state->originator->unique_num), + curr_state->cost, + curr_state->before_nops_num, curr_state->after_nops_num, + curr_state->accumulated_insns_num, curr_state->branch_deviation, + (ia64_tune == PROCESSOR_ITANIUM + ? ((struct DFA_chip *) curr_state->dfa_state)->oneb_automaton_state + : ((struct DFA_chip *) curr_state->dfa_state)->twob_automaton_state), + INSN_UID (insn)); } - cycle_end_fill_slots (sched_verbose ? 
dump : NULL); - return 1; - } - - if (ia64_final_schedule) - { - int nr_need_stop = 0; - - for (insnp = ready; insnp < e_ready; insnp++) - if (safe_group_barrier_needed_p (*insnp)) - nr_need_stop++; - - /* Schedule a stop bit if - - all insns require a stop bit, or - - we are starting a new cycle and _any_ insns require a stop bit. - The reason for the latter is that if our schedule is accurate, then - the additional stop won't decrease performance at this point (since - there's a split issue at this point anyway), but it gives us more - freedom when scheduling the currently ready insns. */ - if ((reorder_type == 0 && nr_need_stop) - || (reorder_type == 1 && n_ready == nr_need_stop)) + /* Find the position in the current bundle window. The window can + contain at most two bundles. Two bundle window means that + the processor will make two bundle rotation. */ + max_pos = get_max_pos (curr_state->dfa_state); + if (max_pos == 6 + /* The following (negative template number) means that the + processor did one bundle rotation. */ + || (max_pos == 3 && template0 < 0)) { - schedule_stop (sched_verbose ? dump : NULL); - sched_data.last_was_stop = 1; - maybe_rotate (sched_verbose ? dump : NULL); - if (reorder_type == 1) - return 0; + /* We are at the end of the window -- find template(s) for + its bundle(s). */ + pos = max_pos; + if (max_pos == 3) + template0 = get_template (curr_state->dfa_state, 3); + else + { + template1 = get_template (curr_state->dfa_state, 3); + template0 = get_template (curr_state->dfa_state, 6); + } } - else + if (max_pos > 3 && template1 < 0) + /* It may happen when we have the stop inside a bundle. */ { - int deleted = 0; - insnp = e_ready; - /* Move down everything that needs a stop bit, preserving relative - order. */ - while (insnp-- > ready + deleted) - while (insnp >= ready + deleted) + if (pos > 3) + abort (); + template1 = get_template (curr_state->dfa_state, 3); + pos += 3; + } + if (!asm_p) + /* Emit nops after the current insn. */ + for (i = 0; i < curr_state->after_nops_num; i++) + { + nop = gen_nop (); + emit_insn_after (nop, insn); + pos--; + if (pos < 0) + abort (); + if (pos % 3 == 0) { - rtx insn = *insnp; - if (! safe_group_barrier_needed_p (insn)) - break; - memmove (ready + 1, ready, (insnp - ready) * sizeof (rtx)); - *ready = insn; - deleted++; + /* We are at the start of a bundle: emit the template + (it should be defined). */ + if (template0 < 0) + abort (); + b = gen_bundle_selector (GEN_INT (template0)); + ia64_emit_insn_before (b, nop); + /* If we have two bundle window, we make one bundle + rotation. Otherwise template0 will be undefined + (negative value). */ + template0 = template1; + template1 = -1; } - n_ready -= deleted; - ready += deleted; - if (deleted != nr_need_stop) + } + /* Move the position backward in the window. Group barrier has + no slot. Asm insn takes all bundle. */ + if (INSN_CODE (insn) != CODE_FOR_insn_group_barrier + && GET_CODE (PATTERN (insn)) != ASM_INPUT + && asm_noperands (PATTERN (insn)) < 0) + pos--; + /* Long insn takes 2 slots. */ + if (ia64_safe_type (insn) == TYPE_L) + pos--; + if (pos < 0) + abort (); + if (pos % 3 == 0 + && INSN_CODE (insn) != CODE_FOR_insn_group_barrier + && GET_CODE (PATTERN (insn)) != ASM_INPUT + && asm_noperands (PATTERN (insn)) < 0) + { + /* The current insn is at the bundle start: emit the + template. 
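+ pos counts slots downward through the (at most two bundle) issue
+ window, so pos % 3 == 0 means the insn sits at a bundle start;
+ template1 then shifts into template0, exactly as for the nops
+ emitted after the insn above.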
*/ + if (template0 < 0) abort (); + b = gen_bundle_selector (GEN_INT (template0)); + ia64_emit_insn_before (b, insn); + b = PREV_INSN (insn); + insn = b; + /* See comment above in analogous place for emiting nops + after the insn. */ + template0 = template1; + template1 = -1; + } + /* Emit nops after the current insn. */ + for (i = 0; i < curr_state->before_nops_num; i++) + { + nop = gen_nop (); + ia64_emit_insn_before (nop, insn); + nop = PREV_INSN (insn); + insn = nop; + pos--; + if (pos < 0) + abort (); + if (pos % 3 == 0) + { + /* See comment above in analogous place for emiting nops + after the insn. */ + if (template0 < 0) + abort (); + b = gen_bundle_selector (GEN_INT (template0)); + ia64_emit_insn_before (b, insn); + b = PREV_INSN (insn); + insn = b; + template0 = template1; + template1 = -1; + } } } - - return itanium_reorder (sched_verbose ? dump : NULL, - ready, e_ready, reorder_type == 1); + if (ia64_tune == PROCESSOR_ITANIUM) + /* Insert additional cycles for MM-insns (MMMUL and MMSHF). + Itanium1 has a strange design, if the distance between an insn + and dependent MM-insn is less 4 then we have a 6 additional + cycles stall. So we make the distance equal to 4 cycles if it + is less. */ + for (insn = get_next_important_insn (NEXT_INSN (prev_head_insn), tail); + insn != NULL_RTX; + insn = next_insn) + { + if (!INSN_P (insn) + || ia64_safe_itanium_class (insn) == ITANIUM_CLASS_IGNORE + || GET_CODE (PATTERN (insn)) == USE + || GET_CODE (PATTERN (insn)) == CLOBBER) + abort (); + next_insn = get_next_important_insn (NEXT_INSN (insn), tail); + if (INSN_UID (insn) < clocks_length && add_cycles [INSN_UID (insn)]) + /* We found a MM-insn which needs additional cycles. */ + { + rtx last; + int i, j, n; + int pred_stop_p; + + /* Now we are searching for a template of the bundle in + which the MM-insn is placed and the position of the + insn in the bundle (0, 1, 2). Also we are searching + for that there is a stop before the insn. */ + last = prev_active_insn (insn); + pred_stop_p = recog_memoized (last) == CODE_FOR_insn_group_barrier; + if (pred_stop_p) + last = prev_active_insn (last); + n = 0; + for (;; last = prev_active_insn (last)) + if (recog_memoized (last) == CODE_FOR_bundle_selector) + { + template0 = XINT (XVECEXP (PATTERN (last), 0, 0), 0); + if (template0 == 9) + /* The insn is in MLX bundle. Change the template + onto MFI because we will add nops before the + insn. It simplifies subsequent code a lot. */ + PATTERN (last) + = gen_bundle_selector (GEN_INT (2)); /* -> MFI */ + break; + } + else if (recog_memoized (last) != CODE_FOR_insn_group_barrier + && (ia64_safe_itanium_class (last) + != ITANIUM_CLASS_IGNORE)) + n++; + /* Some check of correctness: the stop is not at the + bundle start, there are no more 3 insns in the bundle, + and the MM-insn is not at the start of bundle with + template MLX. */ + if ((pred_stop_p && n == 0) || n > 2 + || (template0 == 9 && n != 0)) + abort (); + /* Put nops after the insn in the bundle. */ + for (j = 3 - n; j > 0; j --) + ia64_emit_insn_before (gen_nop (), insn); + /* It takes into account that we will add more N nops + before the insn lately -- please see code below. */ + add_cycles [INSN_UID (insn)]--; + if (!pred_stop_p || add_cycles [INSN_UID (insn)]) + ia64_emit_insn_before (gen_insn_group_barrier (GEN_INT (3)), + insn); + if (pred_stop_p) + add_cycles [INSN_UID (insn)]--; + for (i = add_cycles [INSN_UID (insn)]; i > 0; i--) + { + /* Insert "MII;" template. 
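+ Each loop iteration accounts for one idle cycle with an all-nop
+ MII bundle; when at least two cycles remain, the stop bit emitted
+ after the second slot turns the bundle into "MI;I;" so that a
+ single bundle covers two cycles.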
*/ + ia64_emit_insn_before (gen_bundle_selector (GEN_INT (0)), + insn); + ia64_emit_insn_before (gen_nop (), insn); + ia64_emit_insn_before (gen_nop (), insn); + if (i > 1) + { + /* To decrease code size, we use "MI;I;" + template. */ + ia64_emit_insn_before + (gen_insn_group_barrier (GEN_INT (3)), insn); + i--; + } + ia64_emit_insn_before (gen_nop (), insn); + ia64_emit_insn_before (gen_insn_group_barrier (GEN_INT (3)), + insn); + } + /* Put the MM-insn in the same slot of a bundle with the + same template as the original one. */ + ia64_emit_insn_before (gen_bundle_selector (GEN_INT (template0)), + insn); + /* To put the insn in the same slot, add necessary number + of nops. */ + for (j = n; j > 0; j --) + ia64_emit_insn_before (gen_nop (), insn); + /* Put the stop if the original bundle had it. */ + if (pred_stop_p) + ia64_emit_insn_before (gen_insn_group_barrier (GEN_INT (3)), + insn); + } + } + free (index_to_bundle_states); + finish_bundle_state_table (); + bundling_p = 0; + dfa_clean_insn_cache (); } -static int -ia64_sched_reorder (dump, sched_verbose, ready, pn_ready, clock_var) - FILE *dump; - int sched_verbose; - rtx *ready; - int *pn_ready; - int clock_var; +/* The following function is called at the end of scheduling BB or + EBB. After reload, it inserts stop bits and does insn bundling. */ + +static void +ia64_sched_finish (FILE *dump, int sched_verbose) { - return ia64_internal_sched_reorder (dump, sched_verbose, ready, - pn_ready, 0, clock_var); + if (sched_verbose) + fprintf (dump, "// Finishing schedule.\n"); + if (!reload_completed) + return; + if (reload_completed) + { + final_emit_insn_group_barriers (dump); + bundling (dump, sched_verbose, current_sched_info->prev_head, + current_sched_info->next_tail); + if (sched_verbose && dump) + fprintf (dump, "// finishing %d-%d\n", + INSN_UID (NEXT_INSN (current_sched_info->prev_head)), + INSN_UID (PREV_INSN (current_sched_info->next_tail))); + + return; + } } -/* Like ia64_sched_reorder, but called after issuing each insn. - Override the default sort algorithm to better slot instructions. */ +/* The following function inserts stop bits in scheduled BB or EBB. */ -static int -ia64_sched_reorder2 (dump, sched_verbose, ready, pn_ready, clock_var) - FILE *dump ATTRIBUTE_UNUSED; - int sched_verbose ATTRIBUTE_UNUSED; - rtx *ready; - int *pn_ready; - int clock_var; -{ - if (sched_data.last_was_stop) - return 0; +static void +final_emit_insn_group_barriers (FILE *dump ATTRIBUTE_UNUSED) +{ + rtx insn; + int need_barrier_p = 0; + rtx prev_insn = NULL_RTX; - /* Detect one special case and try to optimize it. - If we have 1.M;;MI 2.MIx, and slots 2.1 (M) and 2.2 (I) are both NOPs, - then we can get better code by transforming this to 1.MFB;; 2.MIx. */ - if (sched_data.first_slot == 1 - && sched_data.stopbit[0] - && ((sched_data.cur == 4 - && (sched_data.types[1] == TYPE_M || sched_data.types[1] == TYPE_A) - && (sched_data.types[2] == TYPE_I || sched_data.types[2] == TYPE_A) - && (sched_data.types[3] != TYPE_M && sched_data.types[3] != TYPE_A)) - || (sched_data.cur == 3 - && (sched_data.types[1] == TYPE_M - || sched_data.types[1] == TYPE_A) - && (sched_data.types[2] != TYPE_M - && sched_data.types[2] != TYPE_I - && sched_data.types[2] != TYPE_A)))) - - { - int i, best; - rtx stop = sched_data.insns[1]; + init_insn_group_barriers (); - /* Search backward for the stop bit that must be there. 
*/ - while (1) + for (insn = NEXT_INSN (current_sched_info->prev_head); + insn != current_sched_info->next_tail; + insn = NEXT_INSN (insn)) + { + if (GET_CODE (insn) == BARRIER) { - int insn_code; - - stop = PREV_INSN (stop); - if (GET_CODE (stop) != INSN) - abort (); - insn_code = recog_memoized (stop); - - /* Ignore .pred.rel.mutex. + rtx last = prev_active_insn (insn); - ??? Update this to ignore cycle display notes too - ??? once those are implemented */ - if (insn_code == CODE_FOR_pred_rel_mutex - || insn_code == CODE_FOR_prologue_use) + if (! last) continue; + if (GET_CODE (last) == JUMP_INSN + && GET_CODE (PATTERN (last)) == ADDR_DIFF_VEC) + last = prev_active_insn (last); + if (recog_memoized (last) != CODE_FOR_insn_group_barrier) + emit_insn_after (gen_insn_group_barrier (GEN_INT (3)), last); - if (insn_code == CODE_FOR_insn_group_barrier) - break; - abort (); + init_insn_group_barriers (); + need_barrier_p = 0; + prev_insn = NULL_RTX; } - - /* Adjust the stop bit's slot selector. */ - if (INTVAL (XVECEXP (PATTERN (stop), 0, 0)) != 1) - abort (); - XVECEXP (PATTERN (stop), 0, 0) = GEN_INT (3); - - sched_data.stopbit[0] = 0; - sched_data.stopbit[2] = 1; - - sched_data.types[5] = sched_data.types[3]; - sched_data.types[4] = sched_data.types[2]; - sched_data.types[3] = sched_data.types[1]; - sched_data.insns[5] = sched_data.insns[3]; - sched_data.insns[4] = sched_data.insns[2]; - sched_data.insns[3] = sched_data.insns[1]; - sched_data.stopbit[5] = sched_data.stopbit[4] = sched_data.stopbit[3] = 0; - sched_data.cur += 2; - sched_data.first_slot = 3; - for (i = 0; i < NR_PACKETS; i++) + else if (INSN_P (insn)) { - const struct ia64_packet *p = packets + i; - if (p->t[0] == TYPE_M && p->t[1] == TYPE_F && p->t[2] == TYPE_B) + if (recog_memoized (insn) == CODE_FOR_insn_group_barrier) { - sched_data.packet = p; - break; + init_insn_group_barriers (); + need_barrier_p = 0; + prev_insn = NULL_RTX; } - } - rotate_one_bundle (sched_verbose ? dump : NULL); - - best = 6; - for (i = 0; i < NR_PACKETS; i++) - { - const struct ia64_packet *p = packets + i; - int split = get_split (p, sched_data.first_slot); - int next; - - /* Disallow multiway branches here. 
*/
- if (p->t[1] == TYPE_B)
- continue;
-
- if (packet_matches_p (p, split, &next) && next < best)
+ else if (need_barrier_p || group_barrier_needed_p (insn))
 {
- best = next;
- sched_data.packet = p;
- sched_data.split = split;
+ if (TARGET_EARLY_STOP_BITS)
+ {
+ rtx last;
+
+ for (last = insn;
+ last != current_sched_info->prev_head;
+ last = PREV_INSN (last))
+ if (INSN_P (last) && GET_MODE (last) == TImode
+ && stops_p [INSN_UID (last)])
+ break;
+ if (last == current_sched_info->prev_head)
+ last = insn;
+ last = prev_active_insn (last);
+ if (last
+ && recog_memoized (last) != CODE_FOR_insn_group_barrier)
+ emit_insn_after (gen_insn_group_barrier (GEN_INT (3)),
+ last);
+ init_insn_group_barriers ();
+ for (last = NEXT_INSN (last);
+ last != insn;
+ last = NEXT_INSN (last))
+ if (INSN_P (last))
+ group_barrier_needed_p (last);
+ }
+ else
+ {
+ emit_insn_before (gen_insn_group_barrier (GEN_INT (3)),
+ insn);
+ init_insn_group_barriers ();
+ }
+ group_barrier_needed_p (insn);
+ prev_insn = NULL_RTX;
 }
+ else if (recog_memoized (insn) >= 0)
+ prev_insn = insn;
+ need_barrier_p = (GET_CODE (insn) == CALL_INSN
+ || GET_CODE (PATTERN (insn)) == ASM_INPUT
+ || asm_noperands (PATTERN (insn)) >= 0);
 }
- if (best == 6)
- abort ();
 }
+}
- if (*pn_ready > 0)
- {
- int more = ia64_internal_sched_reorder (dump, sched_verbose,
- ready, pn_ready, 1,
- clock_var);
- if (more)
- return more;
- /* Did we schedule a stop? If so, finish this cycle. */
- if (sched_data.cur == sched_data.first_slot)
- return 0;
- }
+
- if (sched_verbose)
- fprintf (dump, "// Can't issue more this cycle; updating type array.\n");
+/* If the following function returns TRUE, we will use the DFA
+ insn scheduler. */
- cycle_end_fill_slots (sched_verbose ? dump : NULL);
- if (sched_verbose)
- dump_current_packet (dump);
- return 0;
+static int
+ia64_use_dfa_pipeline_interface (void)
+{
+ return 1;
 }
-/* We are about to issue INSN. Return the number of insns left on the
- ready queue that can be issued this cycle. */
+/* The following function returns the depth of the first-cycle
+ multipass lookahead used by the DFA insn scheduler. */
 static int
-ia64_variable_issue (dump, sched_verbose, insn, can_issue_more)
- FILE *dump;
- int sched_verbose;
- rtx insn;
- int can_issue_more ATTRIBUTE_UNUSED;
+ia64_first_cycle_multipass_dfa_lookahead (void)
 {
- enum attr_type t = ia64_safe_type (insn);
+ return (reload_completed ? 6 : 4);
+}
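ia64_first_cycle_multipass_dfa_lookahead above only reports a depth (6 once bundling is at stake after reload, 4 before); the generic haifa scheduler performs the actual search. A toy model of what that lookahead buys, with invented names (pick_insn, required_unit) and a simple free/busy table standing in for the real DFA:

#include <stdio.h>

#define NUM_UNITS 4  /* toy issue ports: 0 = M, 1 = I, 2 = F, 3 = B */

/* Return the index of the first ready insn, scanning at most
   LOOKAHEAD candidates, whose required unit is still free this
   cycle; -1 if none fits.  A greedy scheduler is LOOKAHEAD == 1.  */
static int
pick_insn (const int *required_unit, int n, int lookahead,
           const int *busy)
{
  int i;

  for (i = 0; i < n && i < lookahead; i++)
    if (!busy[required_unit[i]])
      return i;
  return -1;
}

int
main (void)
{
  int required_unit[] = { 1, 1, 0, 2 };  /* I, I, M, F */
  int busy[NUM_UNITS] = { 0, 1, 0, 0 };  /* the I port is taken */

  /* With depth 1 we stall (-1); with depth 4 the M-unit insn at
     index 2 can still be issued this cycle.  */
  printf ("lookahead 1: %d\n", pick_insn (required_unit, 4, 1, busy));
  printf ("lookahead 4: %d\n", pick_insn (required_unit, 4, 4, busy));
  return 0;
}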
- if (sched_data.last_was_stop)
- {
- int t = sched_data.first_slot;
- if (t == 0)
- t = 3;
- ia64_emit_insn_before (gen_insn_group_barrier (GEN_INT (t)), insn);
- init_insn_group_barriers ();
- sched_data.last_was_stop = 0;
- }
+
+/* The following function initializes the variable `dfa_pre_cycle_insn'. */
- if (t == TYPE_UNKNOWN)
+static void
+ia64_init_dfa_pre_cycle_insn (void)
+{
+ if (temp_dfa_state == NULL)
 {
- if (sched_verbose)
- fprintf (dump, "// Ignoring type %s\n", type_names[t]);
- if (GET_CODE (PATTERN (insn)) == ASM_INPUT
- || asm_noperands (PATTERN (insn)) >= 0)
- {
- /* This must be some kind of asm. Clear the scheduling state. */
- rotate_two_bundles (sched_verbose ? dump : NULL);
- if (ia64_final_schedule)
- group_barrier_needed_p (insn);
- }
- return 1;
+ dfa_state_size = state_size ();
+ temp_dfa_state = xmalloc (dfa_state_size);
+ prev_cycle_state = xmalloc (dfa_state_size);
 }
+ dfa_pre_cycle_insn = make_insn_raw (gen_pre_cycle ());
+ PREV_INSN (dfa_pre_cycle_insn) = NEXT_INSN (dfa_pre_cycle_insn) = NULL_RTX;
+ recog_memoized (dfa_pre_cycle_insn);
+ dfa_stop_insn = make_insn_raw (gen_insn_group_barrier (GEN_INT (3)));
+ PREV_INSN (dfa_stop_insn) = NEXT_INSN (dfa_stop_insn) = NULL_RTX;
+ recog_memoized (dfa_stop_insn);
+}
+
+/* The following function returns the pseudo insn DFA_PRE_CYCLE_INSN
+ used by the DFA insn scheduler. */
+
+static rtx
+ia64_dfa_pre_cycle_insn (void)
+{
+ return dfa_pre_cycle_insn;
+}
+
+/* The following function returns TRUE if PRODUCER (of type ilog or
+ ld) produces an address for CONSUMER (of type st or stf). */
- /* This is _not_ just a sanity check. group_barrier_needed_p will update
- important state info. Don't delete this test. */
- if (ia64_final_schedule
- && group_barrier_needed_p (insn))
+int
+ia64_st_address_bypass_p (rtx producer, rtx consumer)
+{
+ rtx dest, reg, mem;
+
+ if (producer == NULL_RTX || consumer == NULL_RTX)
+ abort ();
+ dest = ia64_single_set (producer);
+ if (dest == NULL_RTX || (reg = SET_DEST (dest)) == NULL_RTX
+ || (GET_CODE (reg) != REG && GET_CODE (reg) != SUBREG))
 abort ();
+ if (GET_CODE (reg) == SUBREG)
+ reg = SUBREG_REG (reg);
+ dest = ia64_single_set (consumer);
+ if (dest == NULL_RTX || (mem = SET_DEST (dest)) == NULL_RTX
+ || GET_CODE (mem) != MEM)
+ abort ();
+ return reg_mentioned_p (reg, mem);
+}
- sched_data.stopbit[sched_data.cur] = 0;
- sched_data.insns[sched_data.cur] = insn;
- sched_data.types[sched_data.cur] = t;
+/* The following function returns TRUE if PRODUCER (of type ilog or
+ ld) produces an address for CONSUMER (of type ld or fld). */
- sched_data.cur++;
- if (sched_verbose)
- fprintf (dump, "// Scheduling insn %d of type %s\n",
- INSN_UID (insn), type_names[t]);
+int
+ia64_ld_address_bypass_p (rtx producer, rtx consumer)
+{
+ rtx dest, src, reg, mem;
- if (GET_CODE (insn) == CALL_INSN && ia64_final_schedule)
- {
- schedule_stop (sched_verbose ? dump : NULL);
- sched_data.last_was_stop = 1;
- }
+ if (producer == NULL_RTX || consumer == NULL_RTX)
+ abort ();
+ dest = ia64_single_set (producer);
+ if (dest == NULL_RTX || (reg = SET_DEST (dest)) == NULL_RTX
+ || (GET_CODE (reg) != REG && GET_CODE (reg) != SUBREG))
+ abort ();
+ if (GET_CODE (reg) == SUBREG)
+ reg = SUBREG_REG (reg);
+ src = ia64_single_set (consumer);
+ if (src == NULL_RTX || (mem = SET_SRC (src)) == NULL_RTX)
+ abort ();
+ if (GET_CODE (mem) == UNSPEC && XVECLEN (mem, 0) > 0)
+ mem = XVECEXP (mem, 0, 0);
+ while (GET_CODE (mem) == SUBREG || GET_CODE (mem) == ZERO_EXTEND)
+ mem = XEXP (mem, 0);
- return 1;
+ /* Note that LO_SUM is used for GOT loads. */
+ if (GET_CODE (mem) != LO_SUM && GET_CODE (mem) != MEM)
+ abort ();
+
+ return reg_mentioned_p (reg, mem);
 }
-/* Free data allocated by ia64_sched_init. */
+/* The following function returns TRUE if INSN produces an address
+ for a load/store insn. We will place such insns into the M slot
+ because that decreases their latency time. 
*/ -static void -ia64_sched_finish (dump, sched_verbose) - FILE *dump; - int sched_verbose; +int +ia64_produce_address_p (rtx insn) { - if (sched_verbose) - fprintf (dump, "// Finishing schedule.\n"); - rotate_two_bundles (NULL); - free (sched_types); - free (sched_ready); + return insn->call; } + /* Emit pseudo-ops for the assembler to describe predicate relations. At present this assumes that we only consider predicate pairs to @@ -6967,14 +7436,14 @@ ia64_sched_finish (dump, sched_verbose) straight-line code. */ static void -emit_predicate_relation_info () +emit_predicate_relation_info (void) { basic_block bb; FOR_EACH_BB_REVERSE (bb) { int r; - rtx head = bb->head; + rtx head = BB_HEAD (bb); /* We only need such notes at code labels. */ if (GET_CODE (head) != CODE_LABEL) @@ -6988,8 +7457,8 @@ emit_predicate_relation_info () { rtx p = gen_rtx_REG (BImode, r); rtx n = emit_insn_after (gen_pred_rel_mutex (p), head); - if (head == bb->end) - bb->end = n; + if (head == BB_END (bb)) + BB_END (bb) = n; head = n; } } @@ -7000,8 +7469,8 @@ emit_predicate_relation_info () the call. */ FOR_EACH_BB_REVERSE (bb) { - rtx insn = bb->head; - + rtx insn = BB_HEAD (bb); + while (1) { if (GET_CODE (insn) == CALL_INSN @@ -7010,130 +7479,23 @@ emit_predicate_relation_info () { rtx b = emit_insn_before (gen_safe_across_calls_all (), insn); rtx a = emit_insn_after (gen_safe_across_calls_normal (), insn); - if (bb->head == insn) - bb->head = b; - if (bb->end == insn) - bb->end = a; + if (BB_HEAD (bb) == insn) + BB_HEAD (bb) = b; + if (BB_END (bb) == insn) + BB_END (bb) = a; } - - if (insn == bb->end) + + if (insn == BB_END (bb)) break; insn = NEXT_INSN (insn); } } } -/* Generate a NOP instruction of type T. We will never generate L type - nops. */ - -static rtx -gen_nop_type (t) - enum attr_type t; -{ - switch (t) - { - case TYPE_M: - return gen_nop_m (); - case TYPE_I: - return gen_nop_i (); - case TYPE_B: - return gen_nop_b (); - case TYPE_F: - return gen_nop_f (); - case TYPE_X: - return gen_nop_x (); - default: - abort (); - } -} - -/* After the last scheduling pass, fill in NOPs. It's easier to do this - here than while scheduling. */ - -static void -ia64_emit_nops () -{ - rtx insn; - const struct bundle *b = 0; - int bundle_pos = 0; - - for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) - { - rtx pat; - enum attr_type t; - pat = INSN_P (insn) ? 
PATTERN (insn) : const0_rtx; - if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER) - continue; - if ((GET_CODE (pat) == UNSPEC && XINT (pat, 1) == UNSPEC_BUNDLE_SELECTOR) - || GET_CODE (insn) == CODE_LABEL) - { - if (b) - while (bundle_pos < 3) - { - emit_insn_before (gen_nop_type (b->t[bundle_pos]), insn); - bundle_pos++; - } - if (GET_CODE (insn) != CODE_LABEL) - b = bundle + INTVAL (XVECEXP (pat, 0, 0)); - else - b = 0; - bundle_pos = 0; - continue; - } - else if (GET_CODE (pat) == UNSPEC_VOLATILE - && XINT (pat, 1) == UNSPECV_INSN_GROUP_BARRIER) - { - int t = INTVAL (XVECEXP (pat, 0, 0)); - if (b) - while (bundle_pos < t) - { - emit_insn_before (gen_nop_type (b->t[bundle_pos]), insn); - bundle_pos++; - } - continue; - } - - if (bundle_pos == 3) - b = 0; - - if (b && INSN_P (insn)) - { - t = ia64_safe_type (insn); - if (asm_noperands (PATTERN (insn)) >= 0 - || GET_CODE (PATTERN (insn)) == ASM_INPUT) - { - while (bundle_pos < 3) - { - if (b->t[bundle_pos] != TYPE_L) - emit_insn_before (gen_nop_type (b->t[bundle_pos]), insn); - bundle_pos++; - } - continue; - } - - if (t == TYPE_UNKNOWN) - continue; - while (bundle_pos < 3) - { - if (t == b->t[bundle_pos] - || (t == TYPE_A && (b->t[bundle_pos] == TYPE_M - || b->t[bundle_pos] == TYPE_I))) - break; - - emit_insn_before (gen_nop_type (b->t[bundle_pos]), insn); - bundle_pos++; - } - if (bundle_pos < 3) - bundle_pos++; - } - } -} - /* Perform machine dependent operations on the rtl chain INSNS. */ -void -ia64_reorg (insns) - rtx insns; +static void +ia64_reorg (void) { /* We are freeing block_for_insn in the toplev to keep compatibility with old MDEP_REORGS that are not CFG based. Recompute it now. */ @@ -7151,17 +7513,91 @@ ia64_reorg (insns) { timevar_push (TV_SCHED2); ia64_final_schedule = 1; + + initiate_bundle_states (); + ia64_nop = make_insn_raw (gen_nop ()); + PREV_INSN (ia64_nop) = NEXT_INSN (ia64_nop) = NULL_RTX; + recog_memoized (ia64_nop); + clocks_length = get_max_uid () + 1; + stops_p = xcalloc (1, clocks_length); + if (ia64_tune == PROCESSOR_ITANIUM) + { + clocks = xcalloc (clocks_length, sizeof (int)); + add_cycles = xcalloc (clocks_length, sizeof (int)); + } + if (ia64_tune == PROCESSOR_ITANIUM2) + { + pos_1 = get_cpu_unit_code ("2_1"); + pos_2 = get_cpu_unit_code ("2_2"); + pos_3 = get_cpu_unit_code ("2_3"); + pos_4 = get_cpu_unit_code ("2_4"); + pos_5 = get_cpu_unit_code ("2_5"); + pos_6 = get_cpu_unit_code ("2_6"); + _0mii_ = get_cpu_unit_code ("2b_0mii."); + _0mmi_ = get_cpu_unit_code ("2b_0mmi."); + _0mfi_ = get_cpu_unit_code ("2b_0mfi."); + _0mmf_ = get_cpu_unit_code ("2b_0mmf."); + _0bbb_ = get_cpu_unit_code ("2b_0bbb."); + _0mbb_ = get_cpu_unit_code ("2b_0mbb."); + _0mib_ = get_cpu_unit_code ("2b_0mib."); + _0mmb_ = get_cpu_unit_code ("2b_0mmb."); + _0mfb_ = get_cpu_unit_code ("2b_0mfb."); + _0mlx_ = get_cpu_unit_code ("2b_0mlx."); + _1mii_ = get_cpu_unit_code ("2b_1mii."); + _1mmi_ = get_cpu_unit_code ("2b_1mmi."); + _1mfi_ = get_cpu_unit_code ("2b_1mfi."); + _1mmf_ = get_cpu_unit_code ("2b_1mmf."); + _1bbb_ = get_cpu_unit_code ("2b_1bbb."); + _1mbb_ = get_cpu_unit_code ("2b_1mbb."); + _1mib_ = get_cpu_unit_code ("2b_1mib."); + _1mmb_ = get_cpu_unit_code ("2b_1mmb."); + _1mfb_ = get_cpu_unit_code ("2b_1mfb."); + _1mlx_ = get_cpu_unit_code ("2b_1mlx."); + } + else + { + pos_1 = get_cpu_unit_code ("1_1"); + pos_2 = get_cpu_unit_code ("1_2"); + pos_3 = get_cpu_unit_code ("1_3"); + pos_4 = get_cpu_unit_code ("1_4"); + pos_5 = get_cpu_unit_code ("1_5"); + pos_6 = get_cpu_unit_code ("1_6"); + _0mii_ = get_cpu_unit_code 
("1b_0mii."); + _0mmi_ = get_cpu_unit_code ("1b_0mmi."); + _0mfi_ = get_cpu_unit_code ("1b_0mfi."); + _0mmf_ = get_cpu_unit_code ("1b_0mmf."); + _0bbb_ = get_cpu_unit_code ("1b_0bbb."); + _0mbb_ = get_cpu_unit_code ("1b_0mbb."); + _0mib_ = get_cpu_unit_code ("1b_0mib."); + _0mmb_ = get_cpu_unit_code ("1b_0mmb."); + _0mfb_ = get_cpu_unit_code ("1b_0mfb."); + _0mlx_ = get_cpu_unit_code ("1b_0mlx."); + _1mii_ = get_cpu_unit_code ("1b_1mii."); + _1mmi_ = get_cpu_unit_code ("1b_1mmi."); + _1mfi_ = get_cpu_unit_code ("1b_1mfi."); + _1mmf_ = get_cpu_unit_code ("1b_1mmf."); + _1bbb_ = get_cpu_unit_code ("1b_1bbb."); + _1mbb_ = get_cpu_unit_code ("1b_1mbb."); + _1mib_ = get_cpu_unit_code ("1b_1mib."); + _1mmb_ = get_cpu_unit_code ("1b_1mmb."); + _1mfb_ = get_cpu_unit_code ("1b_1mfb."); + _1mlx_ = get_cpu_unit_code ("1b_1mlx."); + } schedule_ebbs (rtl_dump_file); + finish_bundle_states (); + if (ia64_tune == PROCESSOR_ITANIUM) + { + free (add_cycles); + free (clocks); + } + free (stops_p); + emit_insn_group_barriers (rtl_dump_file); + ia64_final_schedule = 0; timevar_pop (TV_SCHED2); - - /* This relies on the NOTE_INSN_BASIC_BLOCK notes to be in the same - place as they were during scheduling. */ - emit_insn_group_barriers (rtl_dump_file, insns); - ia64_emit_nops (); } else - emit_all_insn_group_barriers (rtl_dump_file, insns); + emit_all_insn_group_barriers (rtl_dump_file); /* A call must not be the last instruction in a function, so that the return address is still within the function, so that unwinding works @@ -7174,11 +7610,12 @@ ia64_reorg (insns) insn = get_last_insn (); if (! INSN_P (insn)) insn = prev_active_insn (insn); - if (GET_CODE (insn) == INSN - && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE - && XINT (PATTERN (insn), 1) == UNSPECV_INSN_GROUP_BARRIER) - { - saw_stop = 1; + /* Skip over insns that expand to nothing. */ + while (GET_CODE (insn) == INSN && get_attr_empty (insn) == EMPTY_YES) + { + if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE + && XINT (PATTERN (insn), 1) == UNSPECV_INSN_GROUP_BARRIER) + saw_stop = 1; insn = prev_active_insn (insn); } if (GET_CODE (insn) == CALL_INSN) @@ -7197,8 +7634,7 @@ ia64_reorg (insns) /* Return true if REGNO is used by the epilogue. */ int -ia64_epilogue_uses (regno) - int regno; +ia64_epilogue_uses (int regno) { switch (regno) { @@ -7237,8 +7673,7 @@ ia64_epilogue_uses (regno) /* Return true if REGNO is used by the frame unwinder. */ int -ia64_eh_uses (regno) - int regno; +ia64_eh_uses (int regno) { if (! reload_completed) return 0; @@ -7262,18 +7697,7 @@ ia64_eh_uses (regno) return 0; } -/* For ia64, SYMBOL_REF_FLAG set means that it is a function. - - We add @ to the name if this goes in small data/bss. We can only put - a variable in small data/bss if it is defined in this module or a module - that we are statically linked with. We can't check the second condition, - but TREE_STATIC gives us the first one. */ - -/* ??? If we had IPA, we could check the second condition. We could support - programmer added section attributes if the variable is not defined in this - module. */ - -/* ??? See the v850 port for a cleaner way to do this. */ +/* Return true if this goes in small data/bss. */ /* ??? We could also support own long data here. Generating movl/add/ld8 instead of addl,ld8/ld8. This makes the code bigger, but should make the @@ -7281,12 +7705,19 @@ ia64_eh_uses (regno) types which can't go in sdata/sbss. 
*/ static bool -ia64_in_small_data_p (exp) - tree exp; +ia64_in_small_data_p (tree exp) { if (TARGET_NO_SDATA) return false; + /* We want to merge strings, so we never consider them small data. */ + if (TREE_CODE (exp) == STRING_CST) + return false; + + /* Functions are never small data. */ + if (TREE_CODE (exp) == FUNCTION_DECL) + return false; + if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp)) { const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp)); @@ -7306,93 +7737,6 @@ ia64_in_small_data_p (exp) return false; } - -static void -ia64_encode_section_info (decl, first) - tree decl; - int first ATTRIBUTE_UNUSED; -{ - const char *symbol_str; - bool is_local; - rtx symbol; - char encoding = 0; - - if (TREE_CODE (decl) == FUNCTION_DECL) - { - SYMBOL_REF_FLAG (XEXP (DECL_RTL (decl), 0)) = 1; - return; - } - - /* Careful not to prod global register variables. */ - if (TREE_CODE (decl) != VAR_DECL - || GET_CODE (DECL_RTL (decl)) != MEM - || GET_CODE (XEXP (DECL_RTL (decl), 0)) != SYMBOL_REF) - return; - - symbol = XEXP (DECL_RTL (decl), 0); - symbol_str = XSTR (symbol, 0); - - is_local = (*targetm.binds_local_p) (decl); - - if (TREE_CODE (decl) == VAR_DECL && DECL_THREAD_LOCAL (decl)) - encoding = " GLil"[decl_tls_model (decl)]; - /* Determine if DECL will wind up in .sdata/.sbss. */ - else if (is_local && ia64_in_small_data_p (decl)) - encoding = 's'; - - /* Finally, encode this into the symbol string. */ - if (encoding) - { - char *newstr; - size_t len; - - if (symbol_str[0] == ENCODE_SECTION_INFO_CHAR) - { - if (encoding == symbol_str[1]) - return; - /* ??? Sdata became thread or thread becaome not thread. Lose. */ - abort (); - } - - len = strlen (symbol_str); - newstr = alloca (len + 3); - newstr[0] = ENCODE_SECTION_INFO_CHAR; - newstr[1] = encoding; - memcpy (newstr + 2, symbol_str, len + 1); - - XSTR (symbol, 0) = ggc_alloc_string (newstr, len + 2); - } - - /* This decl is marked as being in small data/bss but it shouldn't be; - one likely explanation for this is that the decl has been moved into - a different section from the one it was in when encode_section_info - was first called. Remove the encoding. */ - else if (symbol_str[0] == ENCODE_SECTION_INFO_CHAR) - XSTR (symbol, 0) = ggc_strdup (symbol_str + 2); -} - -static const char * -ia64_strip_name_encoding (str) - const char *str; -{ - if (str[0] == ENCODE_SECTION_INFO_CHAR) - str += 2; - if (str[0] == '*') - str++; - return str; -} - -/* True if it is OK to do sibling call optimization for the specified - call expression EXP. DECL will be the called function, or NULL if - this is an indirect call. */ -bool -ia64_function_ok_for_sibcall (decl) - tree decl; -{ - /* We must always return with our current GP. This means we can - only sibcall to functions defined in the current module. */ - return decl && (*targetm.binds_local_p) (decl); -} /* Output assembly directives for prologue regions. */ @@ -7407,7 +7751,7 @@ static bool need_copy_state; /* The function emits unwind directives for the start of an epilogue. */ static void -process_epilogue () +process_epilogue (void) { /* If this isn't the last block of the function, then we need to label the current state, and copy it back in at the start of the next block. */ @@ -7425,9 +7769,7 @@ process_epilogue () which result in emitting an assembly directive required for unwinding. 
*/ static int -process_set (asm_out_file, pat) - FILE *asm_out_file; - rtx pat; +process_set (FILE *asm_out_file, rtx pat) { rtx src = SET_SRC (pat); rtx dest = SET_DEST (pat); @@ -7460,12 +7802,8 @@ process_set (asm_out_file, pat) if (op0 == dest && GET_CODE (op1) == CONST_INT) { if (INTVAL (op1) < 0) - { - fputs ("\t.fframe ", asm_out_file); - fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, - -INTVAL (op1)); - fputc ('\n', asm_out_file); - } + fprintf (asm_out_file, "\t.fframe "HOST_WIDE_INT_PRINT_DEC"\n", + -INTVAL (op1)); else process_epilogue (); } @@ -7641,9 +7979,7 @@ process_set (asm_out_file, pat) /* This function looks at a single insn and emits any directives required to unwind this insn. */ void -process_for_unwind_directive (asm_out_file, insn) - FILE *asm_out_file; - rtx insn; +process_for_unwind_directive (FILE *asm_out_file, rtx insn) { if (flag_unwind_tables || (flag_exceptions && !USING_SJLJ_EXCEPTIONS)) @@ -7700,7 +8036,7 @@ process_for_unwind_directive (asm_out_file, insn) void -ia64_init_builtins () +ia64_init_builtins (void) { tree psi_type_node = build_pointer_type (integer_type_node); tree pdi_type_node = build_pointer_type (long_integer_type_node); @@ -7744,6 +8080,36 @@ ia64_init_builtins () tree void_ftype_pdi = build_function_type_list (void_type_node, pdi_type_node, NULL_TREE); + tree fpreg_type; + tree float80_type; + + /* The __fpreg type. */ + fpreg_type = make_node (REAL_TYPE); + /* ??? The back end should know to load/save __fpreg variables using + the ldf.fill and stf.spill instructions. */ + TYPE_PRECISION (fpreg_type) = 96; + layout_type (fpreg_type); + (*lang_hooks.types.register_builtin_type) (fpreg_type, "__fpreg"); + + /* The __float80 type. */ + float80_type = make_node (REAL_TYPE); + TYPE_PRECISION (float80_type) = 96; + layout_type (float80_type); + (*lang_hooks.types.register_builtin_type) (float80_type, "__float80"); + + /* The __float128 type. */ + if (!TARGET_HPUX) + { + tree float128_type = make_node (REAL_TYPE); + TYPE_PRECISION (float128_type) = 128; + layout_type (float128_type); + (*lang_hooks.types.register_builtin_type) (float128_type, "__float128"); + } + else + /* Under HPUX, this is a synonym for "long double". */ + (*lang_hooks.types.register_builtin_type) (long_double_type_node, + "__float128"); + #define def_builtin(name, type, code) \ builtin_function ((name), (type), (code), BUILT_IN_MD, NULL, NULL_TREE) @@ -7772,8 +8138,8 @@ ia64_init_builtins () build_function_type (ptr_type_node, void_list_node), IA64_BUILTIN_BSP); - def_builtin ("__builtin_ia64_flushrs", - build_function_type (void_type_node, void_list_node), + def_builtin ("__builtin_ia64_flushrs", + build_function_type (void_type_node, void_list_node), IA64_BUILTIN_FLUSHRS); def_builtin ("__sync_fetch_and_add_si", si_ftype_psi_si, @@ -7844,11 +8210,8 @@ ia64_init_builtins () */ static rtx -ia64_expand_fetch_and_op (binoptab, mode, arglist, target) - optab binoptab; - enum machine_mode mode; - tree arglist; - rtx target; +ia64_expand_fetch_and_op (optab binoptab, enum machine_mode mode, + tree arglist, rtx target) { rtx ret, label, tmp, ccv, insn, mem, value; tree arg0, arg1; @@ -7884,13 +8247,14 @@ ia64_expand_fetch_and_op (binoptab, mode, arglist, target) } tmp = gen_reg_rtx (mode); - ccv = gen_rtx_REG (mode, AR_CCV_REGNUM); + /* ar.ccv must always be loaded with a zero-extended DImode value. 
*/ + ccv = gen_rtx_REG (DImode, AR_CCV_REGNUM); emit_move_insn (tmp, mem); label = gen_label_rtx (); emit_label (label); emit_move_insn (ret, tmp); - emit_move_insn (ccv, tmp); + convert_move (ccv, tmp, /*unsignedp=*/1); /* Perform the specific operation. Special case NAND by noticing one_cmpl_optab instead. */ @@ -7925,11 +8289,8 @@ ia64_expand_fetch_and_op (binoptab, mode, arglist, target) */ static rtx -ia64_expand_op_and_fetch (binoptab, mode, arglist, target) - optab binoptab; - enum machine_mode mode; - tree arglist; - rtx target; +ia64_expand_op_and_fetch (optab binoptab, enum machine_mode mode, + tree arglist, rtx target) { rtx old, label, tmp, ret, ccv, insn, mem, value; tree arg0, arg1; @@ -7953,14 +8314,15 @@ ia64_expand_op_and_fetch (binoptab, mode, arglist, target) emit_insn (gen_mf ()); tmp = gen_reg_rtx (mode); old = gen_reg_rtx (mode); - ccv = gen_rtx_REG (mode, AR_CCV_REGNUM); + /* ar.ccv must always be loaded with a zero-extended DImode value. */ + ccv = gen_rtx_REG (DImode, AR_CCV_REGNUM); emit_move_insn (tmp, mem); label = gen_label_rtx (); emit_label (label); emit_move_insn (old, tmp); - emit_move_insn (ccv, tmp); + convert_move (ccv, tmp, /*unsignedp=*/1); /* Perform the specific operation. Special case NAND by noticing one_cmpl_optab instead. */ @@ -7993,12 +8355,8 @@ ia64_expand_op_and_fetch (binoptab, mode, arglist, target) */ static rtx -ia64_expand_compare_and_swap (rmode, mode, boolp, arglist, target) - enum machine_mode rmode; - enum machine_mode mode; - int boolp; - tree arglist; - rtx target; +ia64_expand_compare_and_swap (enum machine_mode rmode, enum machine_mode mode, + int boolp, tree arglist, rtx target) { tree arg0, arg1, arg2; rtx mem, old, new, ccv, tmp, insn; @@ -8013,6 +8371,11 @@ ia64_expand_compare_and_swap (rmode, mode, boolp, arglist, target) mem = gen_rtx_MEM (mode, force_reg (ptr_mode, mem)); MEM_VOLATILE_P (mem) = 1; + if (GET_MODE (old) != mode) + old = convert_to_mode (mode, old, /*unsignedp=*/1); + if (GET_MODE (new) != mode) + new = convert_to_mode (mode, new, /*unsignedp=*/1); + if (! register_operand (old, mode)) old = copy_to_mode_reg (mode, old); if (! register_operand (new, mode)) @@ -8024,14 +8387,7 @@ ia64_expand_compare_and_swap (rmode, mode, boolp, arglist, target) tmp = gen_reg_rtx (mode); ccv = gen_rtx_REG (DImode, AR_CCV_REGNUM); - if (mode == DImode) - emit_move_insn (ccv, old); - else - { - rtx ccvtmp = gen_reg_rtx (DImode); - emit_insn (gen_zero_extendsidi2 (ccvtmp, old)); - emit_move_insn (ccv, ccvtmp); - } + convert_move (ccv, old, /*unsignedp=*/1); emit_insn (gen_mf ()); if (mode == SImode) insn = gen_cmpxchg_acq_si (tmp, mem, new, ccv); @@ -8052,10 +8408,8 @@ ia64_expand_compare_and_swap (rmode, mode, boolp, arglist, target) /* Expand lock_test_and_set. I.e. `xchgsz ret = [ptr], new'. */ static rtx -ia64_expand_lock_test_and_set (mode, arglist, target) - enum machine_mode mode; - tree arglist; - rtx target; +ia64_expand_lock_test_and_set (enum machine_mode mode, tree arglist, + rtx target) { tree arg0, arg1; rtx mem, new, ret, insn; @@ -8087,10 +8441,8 @@ ia64_expand_lock_test_and_set (mode, arglist, target) /* Expand lock_release. I.e. `stsz.rel [ptr] = r0'. 
*/ static rtx -ia64_expand_lock_release (mode, arglist, target) - enum machine_mode mode; - tree arglist; - rtx target ATTRIBUTE_UNUSED; +ia64_expand_lock_release (enum machine_mode mode, tree arglist, + rtx target ATTRIBUTE_UNUSED) { tree arg0; rtx mem; @@ -8107,12 +8459,9 @@ ia64_expand_lock_release (mode, arglist, target) } rtx -ia64_expand_builtin (exp, target, subtarget, mode, ignore) - tree exp; - rtx target; - rtx subtarget ATTRIBUTE_UNUSED; - enum machine_mode mode ATTRIBUTE_UNUSED; - int ignore ATTRIBUTE_UNUSED; +ia64_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, + enum machine_mode mode ATTRIBUTE_UNUSED, + int ignore ATTRIBUTE_UNUSED) { tree fndecl = TREE_OPERAND (TREE_OPERAND (exp, 0), 0); unsigned int fcode = DECL_FUNCTION_CODE (fndecl); @@ -8203,6 +8552,9 @@ ia64_expand_builtin (exp, target, subtarget, mode, ignore) if (! target || ! register_operand (target, DImode)) target = gen_reg_rtx (DImode); emit_insn (gen_bsp_value (target)); +#ifdef POINTERS_EXTEND_UNSIGNED + target = convert_memory_address (ptr_mode, target); +#endif return target; case IA64_BUILTIN_FLUSHRS: @@ -8268,9 +8620,7 @@ ia64_expand_builtin (exp, target, subtarget, mode, ignore) most significant bits of the stack slot. */ enum direction -ia64_hpux_function_arg_padding (mode, type) - enum machine_mode mode; - tree type; +ia64_hpux_function_arg_padding (enum machine_mode mode, tree type) { /* Exception to normal case for structures/unions/etc. */ @@ -8278,78 +8628,124 @@ ia64_hpux_function_arg_padding (mode, type) && int_size_in_bytes (type) < UNITS_PER_WORD) return upward; - /* This is the standard FUNCTION_ARG_PADDING with !BYTES_BIG_ENDIAN - hardwired to be true. */ - - return((mode == BLKmode - ? (type && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST - && int_size_in_bytes (type) < (PARM_BOUNDARY / BITS_PER_UNIT)) - : GET_MODE_BITSIZE (mode) < PARM_BOUNDARY) - ? downward : upward); + /* Fall back to the default. */ + return DEFAULT_FUNCTION_ARG_PADDING (mode, type); } /* Linked list of all external functions that are to be emitted by GCC. We output the name if and only if TREE_SYMBOL_REFERENCED is set in order to avoid putting out names that are never really used. */ -struct extern_func_list +struct extern_func_list GTY(()) { - struct extern_func_list *next; /* next external */ - char *name; /* name of the external */ -} *extern_func_head = 0; + struct extern_func_list *next; + tree decl; +}; + +static GTY(()) struct extern_func_list *extern_func_head; static void -ia64_hpux_add_extern_decl (name) - const char *name; +ia64_hpux_add_extern_decl (tree decl) { - struct extern_func_list *p; + struct extern_func_list *p = ggc_alloc (sizeof (struct extern_func_list)); - p = (struct extern_func_list *) xmalloc (sizeof (struct extern_func_list)); - p->name = xmalloc (strlen (name) + 1); - strcpy(p->name, name); + p->decl = decl; p->next = extern_func_head; extern_func_head = p; } /* Print out the list of used global functions. */ -void -ia64_hpux_asm_file_end (file) - FILE *file; +static void +ia64_hpux_file_end (void) { - while (extern_func_head) + struct extern_func_list *p; + + for (p = extern_func_head; p; p = p->next) { - const char *real_name; - tree decl; + tree decl = p->decl; + tree id = DECL_ASSEMBLER_NAME (decl); - real_name = (* targetm.strip_name_encoding) (extern_func_head->name); - decl = maybe_get_identifier (real_name); + if (!id) + abort (); - if (!decl - || (! 
TREE_ASM_WRITTEN (decl) && TREE_SYMBOL_REFERENCED (decl))) + if (!TREE_ASM_WRITTEN (decl) && TREE_SYMBOL_REFERENCED (id)) { - if (decl) - TREE_ASM_WRITTEN (decl) = 1; - (*targetm.asm_out.globalize_label) (file, extern_func_head->name); - fprintf (file, "%s", TYPE_ASM_OP); - assemble_name (file, extern_func_head->name); - putc (',', file); - fprintf (file, TYPE_OPERAND_FMT, "function"); - putc ('\n', file); + const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0); + + TREE_ASM_WRITTEN (decl) = 1; + (*targetm.asm_out.globalize_label) (asm_out_file, name); + fputs (TYPE_ASM_OP, asm_out_file); + assemble_name (asm_out_file, name); + fprintf (asm_out_file, "," TYPE_OPERAND_FMT "\n", "function"); } - extern_func_head = extern_func_head->next; } + + extern_func_head = 0; } +/* Rename all the TFmode libfuncs using the HPUX conventions. */ + +static void +ia64_hpux_init_libfuncs (void) +{ + set_optab_libfunc (add_optab, TFmode, "_U_Qfadd"); + set_optab_libfunc (sub_optab, TFmode, "_U_Qfsub"); + set_optab_libfunc (smul_optab, TFmode, "_U_Qfmpy"); + set_optab_libfunc (sdiv_optab, TFmode, "_U_Qfdiv"); + set_optab_libfunc (smin_optab, TFmode, "_U_Qfmin"); + set_optab_libfunc (smax_optab, TFmode, "_U_Qfmax"); + set_optab_libfunc (abs_optab, TFmode, "_U_Qfabs"); + set_optab_libfunc (neg_optab, TFmode, "_U_Qfneg"); + + /* ia64_expand_compare uses this. */ + cmptf_libfunc = init_one_libfunc ("_U_Qfcmp"); + + /* These should never be used. */ + set_optab_libfunc (eq_optab, TFmode, 0); + set_optab_libfunc (ne_optab, TFmode, 0); + set_optab_libfunc (gt_optab, TFmode, 0); + set_optab_libfunc (ge_optab, TFmode, 0); + set_optab_libfunc (lt_optab, TFmode, 0); + set_optab_libfunc (le_optab, TFmode, 0); + + set_conv_libfunc (sext_optab, TFmode, SFmode, "_U_Qfcnvff_sgl_to_quad"); + set_conv_libfunc (sext_optab, TFmode, DFmode, "_U_Qfcnvff_dbl_to_quad"); + set_conv_libfunc (sext_optab, TFmode, XFmode, "_U_Qfcnvff_f80_to_quad"); + set_conv_libfunc (trunc_optab, SFmode, TFmode, "_U_Qfcnvff_quad_to_sgl"); + set_conv_libfunc (trunc_optab, DFmode, TFmode, "_U_Qfcnvff_quad_to_dbl"); + set_conv_libfunc (trunc_optab, XFmode, TFmode, "_U_Qfcnvff_quad_to_f80"); + + set_conv_libfunc (sfix_optab, SImode, TFmode, "_U_Qfcnvfxt_quad_to_sgl"); + set_conv_libfunc (sfix_optab, DImode, TFmode, "_U_Qfcnvfxt_quad_to_dbl"); + set_conv_libfunc (ufix_optab, SImode, TFmode, "_U_Qfcnvfxut_quad_to_sgl"); + set_conv_libfunc (ufix_optab, DImode, TFmode, "_U_Qfcnvfxut_quad_to_dbl"); + + set_conv_libfunc (sfloat_optab, TFmode, SImode, "_U_Qfcnvxf_sgl_to_quad"); + set_conv_libfunc (sfloat_optab, TFmode, DImode, "_U_Qfcnvxf_dbl_to_quad"); +} + +/* Rename the division and modulus functions in VMS. */ + +static void +ia64_vms_init_libfuncs (void) +{ + set_optab_libfunc (sdiv_optab, SImode, "OTS$DIV_I"); + set_optab_libfunc (sdiv_optab, DImode, "OTS$DIV_L"); + set_optab_libfunc (udiv_optab, SImode, "OTS$DIV_UI"); + set_optab_libfunc (udiv_optab, DImode, "OTS$DIV_UL"); + set_optab_libfunc (smod_optab, SImode, "OTS$REM_I"); + set_optab_libfunc (smod_optab, DImode, "OTS$REM_L"); + set_optab_libfunc (umod_optab, SImode, "OTS$REM_UI"); + set_optab_libfunc (umod_optab, DImode, "OTS$REM_UL"); +} /* Switch to the section to which we should output X. The only thing special we do here is to honor small data. 
*/ static void -ia64_select_rtx_section (mode, x, align) - enum machine_mode mode; - rtx x; - unsigned HOST_WIDE_INT align; +ia64_select_rtx_section (enum machine_mode mode, rtx x, + unsigned HOST_WIDE_INT align) { if (GET_MODE_SIZE (mode) > 0 && GET_MODE_SIZE (mode) <= ia64_section_threshold) @@ -8362,27 +8758,20 @@ ia64_select_rtx_section (mode, x, align) Pretend flag_pic is always set. */ static void -ia64_rwreloc_select_section (exp, reloc, align) - tree exp; - int reloc; - unsigned HOST_WIDE_INT align; +ia64_rwreloc_select_section (tree exp, int reloc, unsigned HOST_WIDE_INT align) { default_elf_select_section_1 (exp, reloc, align, true); } static void -ia64_rwreloc_unique_section (decl, reloc) - tree decl; - int reloc; +ia64_rwreloc_unique_section (tree decl, int reloc) { default_unique_section_1 (decl, reloc, true); } static void -ia64_rwreloc_select_rtx_section (mode, x, align) - enum machine_mode mode; - rtx x; - unsigned HOST_WIDE_INT align; +ia64_rwreloc_select_rtx_section (enum machine_mode mode, rtx x, + unsigned HOST_WIDE_INT align) { int save_pic = flag_pic; flag_pic = 1; @@ -8391,32 +8780,50 @@ ia64_rwreloc_select_rtx_section (mode, x, align) } static unsigned int -ia64_rwreloc_section_type_flags (decl, name, reloc) - tree decl; - const char *name; - int reloc; +ia64_rwreloc_section_type_flags (tree decl, const char *name, int reloc) { return default_section_type_flags_1 (decl, name, reloc, true); } +/* Returns true if FNTYPE (a FUNCTION_TYPE or a METHOD_TYPE) returns a + structure type and that the address of that type should be passed + in out0, rather than in r8. */ + +static bool +ia64_struct_retval_addr_is_first_parm_p (tree fntype) +{ + tree ret_type = TREE_TYPE (fntype); + + /* The Itanium C++ ABI requires that out0, rather than r8, be used + as the structure return address parameter, if the return value + type has a non-trivial copy constructor or destructor. It is not + clear if this same convention should be used for other + programming languages. Until G++ 3.4, we incorrectly used r8 for + these return values. */ + return (abi_version_at_least (2) + && ret_type + && TYPE_MODE (ret_type) == BLKmode + && TREE_ADDRESSABLE (ret_type) + && strcmp (lang_hooks.name, "GNU C++") == 0); +} /* Output the assembler code for a thunk function. THUNK_DECL is the declaration for the thunk function itself, FUNCTION is the decl for the target function. DELTA is an immediate constant offset to be - added to THIS. If VCALL_OFFSET is non-zero, the word at + added to THIS. If VCALL_OFFSET is nonzero, the word at *(*this + vcall_offset) should be added to THIS. */ static void -ia64_output_mi_thunk (file, thunk, delta, vcall_offset, function) - FILE *file; - tree thunk ATTRIBUTE_UNUSED; - HOST_WIDE_INT delta; - HOST_WIDE_INT vcall_offset; - tree function; +ia64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED, + HOST_WIDE_INT delta, HOST_WIDE_INT vcall_offset, + tree function) { rtx this, insn, funexp; + unsigned int this_parmno; + unsigned int this_regno; reload_completed = 1; + epilogue_completed = 1; no_new_pseudos = 1; /* Set things up as ia64_expand_prologue might. */ @@ -8427,16 +8834,32 @@ ia64_output_mi_thunk (file, thunk, delta, vcall_offset, function) current_frame_info.n_input_regs = 1; current_frame_info.need_regstk = (TARGET_REG_NAMES != 0); - if (!TARGET_REG_NAMES) - reg_names[IN_REG (0)] = ia64_reg_numbers[0]; - /* Mark the end of the (empty) prologue. 
*/
- emit_note (NULL, NOTE_INSN_PROLOGUE_END);
+ emit_note (NOTE_INSN_PROLOGUE_END);
+
+ /* Figure out whether "this" will be the first parameter (the
+ typical case) or the second parameter (as happens when the
+ virtual function returns certain class objects). */
+ this_parmno
+ = (ia64_struct_retval_addr_is_first_parm_p (TREE_TYPE (thunk))
+ ? 1 : 0);
+ this_regno = IN_REG (this_parmno);
+ if (!TARGET_REG_NAMES)
+ reg_names[this_regno] = ia64_reg_numbers[this_parmno];
- this = gen_rtx_REG (Pmode, IN_REG (0));
+ this = gen_rtx_REG (Pmode, this_regno);
 if (TARGET_ILP32)
- emit_insn (gen_ptr_extend (this,
- gen_rtx_REG (ptr_mode, IN_REG (0))));
+ {
+ rtx tmp = gen_rtx_REG (ptr_mode, this_regno);
+ REG_POINTER (tmp) = 1;
+ if (delta && CONST_OK_FOR_I (delta))
+ {
+ emit_insn (gen_ptr_extend_plus_imm (this, tmp, GEN_INT (delta)));
+ delta = 0;
+ }
+ else
+ emit_insn (gen_ptr_extend (this, tmp));
+ }
 /* Apply the constant offset, if required. */
 if (delta)
@@ -8461,19 +8884,30 @@ ia64_output_mi_thunk (file, thunk, delta, vcall_offset, function)
 if (TARGET_ILP32)
 {
 rtx t = gen_rtx_REG (ptr_mode, 2);
+ REG_POINTER (t) = 1;
 emit_move_insn (t, gen_rtx_MEM (ptr_mode, this));
- emit_insn (gen_ptr_extend (tmp, t));
+ if (CONST_OK_FOR_I (vcall_offset))
+ {
+ emit_insn (gen_ptr_extend_plus_imm (tmp, t,
+ vcall_offset_rtx));
+ vcall_offset = 0;
+ }
+ else
+ emit_insn (gen_ptr_extend (tmp, t));
 }
 else
 emit_move_insn (tmp, gen_rtx_MEM (Pmode, this));
- if (!CONST_OK_FOR_J (vcall_offset))
+ if (vcall_offset)
 {
- rtx tmp2 = gen_rtx_REG (Pmode, next_scratch_gr_reg ());
- emit_move_insn (tmp2, vcall_offset_rtx);
- vcall_offset_rtx = tmp2;
+ if (!CONST_OK_FOR_J (vcall_offset))
+ {
+ rtx tmp2 = gen_rtx_REG (Pmode, next_scratch_gr_reg ());
+ emit_move_insn (tmp2, vcall_offset_rtx);
+ vcall_offset_rtx = tmp2;
+ }
+ emit_insn (gen_adddi3 (tmp, tmp, vcall_offset_rtx));
 }
- emit_insn (gen_adddi3 (tmp, tmp, vcall_offset_rtx));
 if (TARGET_ILP32)
 emit_move_insn (gen_rtx_REG (ptr_mode, 2),
@@ -8498,6 +8932,7 @@ ia64_output_mi_thunk (file, thunk, delta, vcall_offset, function)
 /* Code generation for calls relies on splitting. */
 reload_completed = 1;
+ epilogue_completed = 1;
 try_split (PATTERN (insn), insn, 0);
 emit_barrier ();
@@ -8507,15 +8942,28 @@ ia64_output_mi_thunk (file, thunk, delta, vcall_offset, function)
 instruction scheduling worth while. Note that use_thunk calls
 assemble_start_function and assemble_end_function. */
+ insn_locators_initialize ();
+ emit_all_insn_group_barriers (NULL);
 insn = get_insns ();
- emit_all_insn_group_barriers (NULL, insn);
 shorten_branches (insn);
 final_start_function (insn, file, 1);
 final (insn, file, 1, 0);
 final_end_function ();
 reload_completed = 0;
+ epilogue_completed = 0;
 no_new_pseudos = 0;
 }
+/* Worker function for TARGET_STRUCT_VALUE_RTX. */
+
+static rtx
+ia64_struct_value_rtx (tree fntype,
+ int incoming ATTRIBUTE_UNUSED)
+{
+ if (fntype && ia64_struct_retval_addr_is_first_parm_p (fntype))
+ return NULL_RTX;
+ return gen_rtx_REG (Pmode, GR_REG (8));
+}
+
 #include "gt-ia64.h"
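The thunk code above boils down to pointer arithmetic on the incoming `this' argument: add DELTA, and, when VCALL_OFFSET is nonzero, also add the adjustment stored at *(*this + vcall_offset). A self-contained C model of that adjustment; adjust_this and the fake vtable layout are invented for illustration, not part of the patch:

#include <stdio.h>
#include <stddef.h>

/* Model of the thunk adjustment: THIS += DELTA, then optionally
   THIS += *(*(char **) THIS + VCALL_OFFSET).  */
static char *
adjust_this (char *this_ptr, ptrdiff_t delta, ptrdiff_t vcall_offset)
{
  this_ptr += delta;
  if (vcall_offset != 0)
    {
      char *vtable = *(char **) this_ptr;  /* first word: vtable pointer */
      this_ptr += *(ptrdiff_t *) (vtable + vcall_offset);
    }
  return this_ptr;
}

int
main (void)
{
  static ptrdiff_t vtbl[2] = { 0, -8 };  /* slot 1 holds an extra adjustment */
  struct { char *vptr; char payload[24]; } obj = { (char *) vtbl, "" };
  char *p = adjust_this ((char *) &obj, 0, sizeof (ptrdiff_t));

  printf ("this adjusted by %td bytes\n", p - (char *) &obj);
  return 0;
}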