genattrtab.c (expand_units): For large nr opclasses, expand function_units_used with ORX to prevent blowups.

* genattrtab.c (expand_units): For large nr opclasses, expand function_units_used with ORX to prevent blowups. Tag with FFS. (num_unit_opclasses): New variable. (gen_unit): Update it. (enum operator): Add ORX_OP. (operate_exp): Treat ORX as or, except don't expand across an if. Reuse number rtx's after operating on them. (check_attr_value): Accept IOR, AND, & FFS. (write_test_expr): Transmute `in_comparison' to `flags'. Allow for attribute value caching. Handle CONST_STRING, IF_THEN_ELSE. (write_expr_attr_cache, write_toplevel_expr): New functions. (write_attr_get): Handle FFS-tagged expressions. (make_canonical): Don't expand const attributes. (convert_const_symbol_ref): Dike out. (evaluate_eq_attr): Handle SYMBOL_REF. (main): Don't emit get_attr_foo for const attributes. * alpha.c (override_options): Reinstate PROCESSOR_EV6. (alpha_adjust_cost): Add EV6 tuning; streamline EV5 tests. * alpha.h (REGISTER_MOVE_COST): Increase ftoi/itof cost slightly. * alpha.md: Redo all of the scheduling, adding EV6 support, and combining function units where possible. (attr "type"): Split loads, stores, cmov into int/fp. Combine multiplies and divides. Add EV6 sqrt, ftoi, itof. (attr "opsize"): New attribute. (sqrtsf2-1, sqrtdf2-1): Provide proper TP_INSN patterns. (movsf2-[12], movdf2-[12]): Provide CIX variants; don't allow CIX to control register allocation. (movsi2-1, movdi2-1): Likewise. From-SVN: r17212
1997-12-22 21:34:27 -08:00 · 1997-12-22 21:34:27 -08:00 · 71d9b4930e
commit 71d9b4930e
parent b0435cf4cf
5 changed files with 704 additions and 268 deletions
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@ -1,3 +1,35 @@
+Tue Dec 23 05:17:28 1997  Richard Henderson  <rth@cygnus.com>
+
+	* genattrtab.c (expand_units): For large nr opclasses, expand
+	function_units_used with ORX to prevent blowups.  Tag with FFS.
+	(num_unit_opclasses): New variable.
+	(gen_unit): Update it.
+	(enum operator): Add ORX_OP.
+	(operate_exp): Treat ORX as or, except don't expand across an if.
+	Reuse number rtx's after operating on them.
+	(check_attr_value): Accept IOR, AND, & FFS.
+	(write_test_expr): Transmute `in_comparison' to `flags'.  Allow
+	for attribute value caching.  Handle CONST_STRING, IF_THEN_ELSE.
+	(write_expr_attr_cache, write_toplevel_expr): New functions.
+	(write_attr_get): Handle FFS-tagged expressions.
+	(make_canonical): Don't expand const attributes.
+	(convert_const_symbol_ref): Dike out.
+	(evaluate_eq_attr): Handle SYMBOL_REF.
+	(main): Don't emit get_attr_foo for const attributes.
+
+	* alpha.c (override_options): Reinstate PROCESSOR_EV6.
+	(alpha_adjust_cost): Add EV6 tuning; streamline EV5 tests.
+	* alpha.h (REGISTER_MOVE_COST): Increase ftoi/itof cost slightly.
+	* alpha.md: Redo all of the scheduling, adding EV6 support, and 
+	combining function units where possible.  
+	(attr "type"): Split loads, stores, cmov into int/fp.  Combine
+	multiplies and divides.  Add EV6 sqrt, ftoi, itof.
+	(attr "opsize"): New attribute.
+	(sqrtsf2-1, sqrtdf2-1): Provide proper TP_INSN patterns.
+	(movsf2-[12], movdf2-[12]): Provide CIX variants; don't allow CIX
+	to control register allocation.
+	(movsi2-1, movdi2-1): Likewise.
+
 Tue Dec 23 03:53:21 1997  Richard Henderson  <rth@cygnus.com>
 	
 	* alpha.h (CPP_PREDEFINES, LIB_SPEC, LINK_SPEC, STARTFILE_SPEC,
--- a/gcc/config/alpha/alpha.c
+++ b/gcc/config/alpha/alpha.c
@ -134,7 +134,7 @@ override_options ()
  /* 971208 -- EV6 scheduling parameters are still secret, so don't even
     pretend and just schedule for an EV5 for now.  -- r~  */
  alpha_cpu
-    = TARGET_CPU_DEFAULT & MASK_CPU_EV6 ? PROCESSOR_EV5
+    = TARGET_CPU_DEFAULT & MASK_CPU_EV6 ? PROCESSOR_EV6
      : (TARGET_CPU_DEFAULT & MASK_CPU_EV5 ? PROCESSOR_EV5 : PROCESSOR_EV4);

  if (alpha_cpu_string)
@ -169,7 +169,7 @@ override_options ()
      else if (! strcmp (alpha_cpu_string, "ev6")
 	       || ! strcmp (alpha_cpu_string, "21264"))
 	{
-	  alpha_cpu = PROCESSOR_EV5;
+	  alpha_cpu = PROCESSOR_EV6;
 	  target_flags |= MASK_BWX | MASK_CIX | MASK_MAX;
 	}
      else
@ -274,7 +274,7 @@ override_options ()
 	{
 	  { 3, 30, -1 },	/* ev4 -- Bcache is a guess */
 	  { 2, 12, 38 },	/* ev5 -- Bcache from PC164 LMbench numbers */
-	  { 3, 12, -1 },	/* ev6 -- Ho hum, doesn't exist yet */
+	  { 3, 13, -1 },	/* ev6 -- Ho hum, doesn't exist yet */
 	};

 	lat = alpha_mlat_string[1] - '0';
@ -1291,80 +1291,30 @@ alpha_adjust_cost (insn, link, dep_insn, cost)
  dep_insn_type = get_attr_type (dep_insn);

  /* Bring in the user-defined memory latency.  */
-  if (dep_insn_type == TYPE_LD || dep_insn_type == TYPE_LDSYM)
+  if (dep_insn_type == TYPE_ILD
+      || dep_insn_type == TYPE_FLD
+      || dep_insn_type == TYPE_LDSYM)
    cost += alpha_memory_latency-1;

-  if (alpha_cpu == PROCESSOR_EV5)
-    {
-      /* And the lord DEC saith:  "A special bypass provides an effective
-	 latency of 0 cycles for an ICMP or ILOG insn producing the test
-	 operand of an IBR or CMOV insn." */
-      if ((dep_insn_type == TYPE_ICMP
-	   || dep_insn_type == TYPE_ILOG)
-	  && (insn_type == TYPE_IBR
-	      || (insn_type == TYPE_CMOV
-		  && !((set = single_set (dep_insn)) != 0
-		       && GET_CODE (PATTERN (insn)) == SET
-		       && (set_src = SET_SRC (PATTERN (insn)),
-			   GET_CODE (set_src) == IF_THEN_ELSE)
-		       && (set = SET_DEST (set),
-			   rtx_equal_p (set, XEXP (set_src, 1))
-			   || rtx_equal_p (set, XEXP (set_src, 2)))))))
-	return 0;
-
-      /* "The multiplier is unable to receive data from IEU bypass paths.
-	 The instruction issues at the expected time, but its latency is
-	 increased by the time it takes for the input data to become
-	 available to the multiplier" -- which happens in pipeline stage
-	 six, when results are comitted to the register file.  */
-
-      if ((insn_type == TYPE_IMULL
-	   || insn_type == TYPE_IMULQ
-	   || insn_type == TYPE_IMULH)
-	  && (set = single_set (dep_insn)) != 0
-	  && GET_CODE (PATTERN (insn)) == SET
-	  && (set_src = SET_SRC (PATTERN (insn)),
-	      GET_CODE (set_src) == MULT)
-	  && (set = SET_DEST (set),
-	      rtx_equal_p (set, XEXP (set_src, 0))
-	      || rtx_equal_p (set, XEXP (set_src, 1))))
-	{
-	  switch (dep_insn_type)
-	    {
-	    /* These insns produce their results in pipeline stage five.  */
-	    case TYPE_LD:
-	    case TYPE_CMOV:
-	    case TYPE_IMULL:
-	    case TYPE_IMULQ:
-	    case TYPE_IMULH:
-	    case TYPE_MVI:
-	      return cost + 1;
-
-	    /* Other integer insns produce results in pipeline stage four.  */
-	    default:
-	      return cost + 2;
-	    }
-	}
-    }
-  else
+  switch (alpha_cpu)
    {
+    case PROCESSOR_EV4:
      /* On EV4, if INSN is a store insn and DEP_INSN is setting the data
 	 being stored, we can sometimes lower the cost.  */

-      if (insn_type == TYPE_ST
+      if ((insn_type == TYPE_IST || insn_type == TYPE_FST)
 	  && (set = single_set (dep_insn)) != 0
 	  && GET_CODE (PATTERN (insn)) == SET
 	  && rtx_equal_p (SET_DEST (set), SET_SRC (PATTERN (insn))))
 	{
 	  switch (dep_insn_type)
 	    {
-	    case TYPE_LD:
+	    case TYPE_ILD:
+	    case TYPE_FLD:
 	      /* No savings here.  */
 	      return cost;

-	    case TYPE_IMULL:
-	    case TYPE_IMULQ:
-	    case TYPE_IMULH:
+	    case TYPE_IMUL:
 	      /* In these cases, we save one cycle.  */
 	      return cost - 1;

@ -1377,14 +1327,17 @@ alpha_adjust_cost (insn, link, dep_insn, cost)
      /* Another case that needs adjustment is an arithmetic or logical
 	 operation.  It's cost is usually one cycle, but we default it to
 	 two in the MD file.  The only case that it is actually two is
-	 for the address in loads and stores.  */
+	 for the address in loads, stores, and jumps.  */

      if (dep_insn_type == TYPE_IADD || dep_insn_type == TYPE_ILOG)
 	{
 	  switch (insn_type)
 	    {
-	    case TYPE_LD:
-	    case TYPE_ST:
+	    case TYPE_ILD:
+	    case TYPE_IST:
+	    case TYPE_FLD:
+	    case TYPE_FST:
+	    case TYPE_JSR:
 	      return cost;
 	    default:
 	      return 1;
@ -1396,6 +1349,62 @@ alpha_adjust_cost (insn, link, dep_insn, cost)

      if (dep_insn_type == TYPE_ICMP && insn_type == TYPE_IBR)
 	return 1;
+      break;
+
+    case PROCESSOR_EV5:
+      /* And the lord DEC saith:  "A special bypass provides an effective
+	 latency of 0 cycles for an ICMP or ILOG insn producing the test
+	 operand of an IBR or ICMOV insn." */
+
+      if ((dep_insn_type == TYPE_ICMP || dep_insn_type == TYPE_ILOG)
+	  && (set = single_set (dep_insn)) != 0)
+	{
+	  /* A branch only has one input.  This must be it.  */
+	  if (insn_type == TYPE_IBR)
+	    return 0;
+	  /* A conditional move has three, make sure it is the test.  */
+	  if (insn_type == TYPE_ICMOV
+	      && GET_CODE (set_src = PATTERN (insn)) == SET
+	      && GET_CODE (set_src = SET_SRC (set_src)) == IF_THEN_ELSE
+	      && rtx_equal_p (SET_DEST (set), XEXP (set_src, 0)))
+	    return 0;
+	}
+
+      /* "The multiplier is unable to receive data from IEU bypass paths.
+	 The instruction issues at the expected time, but its latency is
+	 increased by the time it takes for the input data to become
+	 available to the multiplier" -- which happens in pipeline stage
+	 six, when results are committed to the register file.  */
+
+      if (insn_type == TYPE_IMUL)
+	{
+	  switch (dep_insn_type)
+	    {
+	    /* These insns produce their results in pipeline stage five.  */
+	    case TYPE_ILD:
+	    case TYPE_ICMOV:
+	    case TYPE_IMUL:
+	    case TYPE_MVI:
+	      return cost + 1;
+
+	    /* Other integer insns produce results in pipeline stage four.  */
+	    default:
+	      return cost + 2;
+	    }
+	}
+      break;
+
+    case PROCESSOR_EV6:
+      /* There is additional latency to move the result of (most) FP 
+         operations anywhere but the FP register file.  */
+
+      if ((insn_type == TYPE_FST || insn_type == TYPE_FTOI)
+	  && (dep_insn_type == TYPE_FADD ||
+	      dep_insn_type == TYPE_FMUL ||
+	      dep_insn_type == TYPE_FCMOV))
+        return cost + 2;
+
+      break;
    }

  /* Otherwise, return the default cost. */
--- a/gcc/config/alpha/alpha.h
+++ b/gcc/config/alpha/alpha.h
@ -769,9 +769,10 @@ enum reg_class { NO_REGS, GENERAL_REGS, FLOAT_REGS, ALL_REGS,
   reduce the impact of not being able to allocate a pseudo to a
   hard register.  */

-#define REGISTER_MOVE_COST(CLASS1, CLASS2)				\
-  (TARGET_CIX || ((CLASS1) == FLOAT_REGS) == ((CLASS2) == FLOAT_REGS)	\
-   ? 2 : 4+2*alpha_memory_latency)
+#define REGISTER_MOVE_COST(CLASS1, CLASS2)		\
+  (((CLASS1) == FLOAT_REGS) == ((CLASS2) == FLOAT_REGS)	\
+   ? 2							\
+   : TARGET_CIX ? 3 : 4+2*alpha_memory_latency)

 /* A C expressions returning the cost of moving data of MODE from a register to
   or from memory.
@ -1596,7 +1597,7 @@ extern void alpha_init_expanders ();
   our own exit function.  */
 #define HAVE_ATEXIT

-/* The EV4 is dual issue; EV5 is quad issue.  */
+/* The EV4 is dual issue; EV5/EV6 are quad issue.  */
 #define ISSUE_RATE  (alpha_cpu == PROCESSOR_EV4 ? 2 : 4)

 /* Compute the cost of computing a constant rtl expression RTX
--- a/gcc/config/alpha/alpha.md
+++ b/gcc/config/alpha/alpha.md
@ -24,7 +24,7 @@
 ;; Processor type -- this attribute must exactly match the processor_type
 ;; enumeration in alpha.h.

-(define_attr "cpu" "ev4,ev5"
+(define_attr "cpu" "ev4,ev5,ev6"
  (const (symbol_ref "alpha_cpu")))

 ;; Define an insn type attribute.  This is used in function unit delay
@ -33,194 +33,319 @@
 ;; separately.

 (define_attr "type"
-  "ld,st,ibr,fbr,jsr,iadd,ilog,shift,cmov,icmp,imull,imulq,imulh,fadd,fmul,fcpys,fdivs,fdivt,ldsym,misc,mvi"
+  "ild,fld,ldsym,ist,fst,ibr,fbr,jsr,iadd,ilog,shift,icmov,fcmov,icmp,imul,fadd,fmul,fcpys,fdiv,fsqrt,misc,mvi,ftoi,itof"
  (const_string "iadd"))

+(define_attr "opsize" "si,di,udi" (const_string "di"))
+
 ;; The TRAP_TYPE attribute marks instructions that may generate traps
 ;; (which are imprecise and may need a trapb if software completion
 ;; is desired).
 (define_attr "trap" "no,yes" (const_string "no"))

-;; For the EV4 we include four function units: ABOX, which computes
-;; the address, BBOX, used for branches, EBOX, used for integer
-;; operations, and FBOX, used for FP operations.
+
+;; On EV4 there are two classes of resources to consider: resources needed
+;; to issue, and resources needed to execute.  IBUS[01] are in the first
+;; category.  ABOX, BBOX, EBOX, FBOX, IMUL & FDIV make up the second.
+;; (There are a few other register-like resources, but ...)

-;; Memory delivers its result in three cycles.  Actually return one and
-;; take care of this in adjust_cost, since we want to handle user-defined
-;; memory latencies.
-(define_function_unit "ev4_abox" 1 0
+; First, describe all of the issue constraints with single cycle delays.
+; All insns need a bus, but all except loads require one or the other.
+(define_function_unit "ev4_ibus0" 1 0
  (and (eq_attr "cpu" "ev4")
-       (eq_attr "type" "ld,ldsym,st"))
+       (eq_attr "type" "fst,fbr,iadd,imul,ilog,shift,icmov,icmp"))
  1 1)

-;; Branches have no delay cost, but do tie up the unit for two cycles.
+(define_function_unit "ev4_ibus1" 1 0
+  (and (eq_attr "cpu" "ev4")
+       (eq_attr "type" "ist,ibr,jsr,fadd,fcmov,fcpys,fmul,fdiv,misc"))
+  1 1)
+
+; Memory delivers its result in three cycles.  Actually return one and
+; take care of this in adjust_cost, since we want to handle user-defined
+; memory latencies.
+(define_function_unit "ev4_abox" 1 0
+  (and (eq_attr "cpu" "ev4")
+       (eq_attr "type" "ild,fld,ldsym,ist,fst"))
+  1 1)
+
+; Branches have no delay cost, but do tie up the unit for two cycles.
 (define_function_unit "ev4_bbox" 1 1
  (and (eq_attr "cpu" "ev4")
       (eq_attr "type" "ibr,fbr,jsr"))
  2 2)

-;; Arithmetic insns are normally have their results available after
-;; two cycles.  There are a number of exceptions.  They are encoded in
-;; ADJUST_COST.  Some of the other insns have similar exceptions.
-
+; Arithmetic insns normally have their results available after
+; two cycles.  There are a number of exceptions.  They are encoded in
+; ADJUST_COST.  Some of the other insns have similar exceptions.
 (define_function_unit "ev4_ebox" 1 0
  (and (eq_attr "cpu" "ev4")
-       (eq_attr "type" "iadd,ilog,shift,cmov,icmp"))
+       (eq_attr "type" "iadd,ilog,shift,icmov,icmp,misc"))
  2 1)

-;; These really don't take up the integer pipeline, but they do occupy
-;; IBOX1; we approximate here.
-
-(define_function_unit "ev4_ebox" 1 0
+(define_function_unit "imul" 1 0
  (and (eq_attr "cpu" "ev4")
-       (eq_attr "type" "imull"))
-  21 1)
-
-(define_function_unit "ev4_ebox" 1 0
-  (and (eq_attr "cpu" "ev4")
-       (eq_attr "type" "imulq,imulh"))
-  23 1)
-
-(define_function_unit "ev4_imult" 1 0
-  (and (eq_attr "cpu" "ev4")
-       (eq_attr "type" "imull"))
+       (and (eq_attr "type" "imul")
+	    (eq_attr "opsize" "si")))
  21 19)

-(define_function_unit "ev4_imult" 1 0
+(define_function_unit "imul" 1 0
  (and (eq_attr "cpu" "ev4")
-       (eq_attr "type" "imulq,imulh"))
+       (and (eq_attr "type" "imul")
+	    (eq_attr "opsize" "!si")))
  23 21)

 (define_function_unit "ev4_fbox" 1 0
  (and (eq_attr "cpu" "ev4")
-       (eq_attr "type" "fadd,fmul,fcpys"))
+       (eq_attr "type" "fadd,fmul,fcpys,fcmov"))
  6 1)

-(define_function_unit "ev4_fbox" 1 0
+(define_function_unit "fdiv" 1 0
  (and (eq_attr "cpu" "ev4")
-       (eq_attr "type" "fdivs"))
-  34 0)
-
-(define_function_unit "ev4_fbox" 1 0
-  (and (eq_attr "cpu" "ev4")
-       (eq_attr "type" "fdivt"))
-  63 0)
-
-(define_function_unit "ev4_divider" 1 0
-  (and (eq_attr "cpu" "ev4")
-       (eq_attr "type" "fdivs"))
+       (and (eq_attr "type" "fdiv")
+	    (eq_attr "opsize" "si")))
  34 30)

-(define_function_unit "ev4_divider" 1 0
+(define_function_unit "fdiv" 1 0
  (and (eq_attr "cpu" "ev4")
-       (eq_attr "type" "fdivt"))
-  64 59)
+       (and (eq_attr "type" "fdiv")
+	    (eq_attr "opsize" "di")))
+  63 59)

 ;; EV5 scheduling.  EV5 can issue 4 insns per clock.
-;; We consider the EV6 and EV5 for now.
-
-;; EV5 has two asymetric integer units.  Model this with ebox,e0,e1.
-;; Everything uses ebox, and those that require particular pipes grab
-;; those as well.
+;;
+;; EV5 has two asymmetric integer units.  Model this with E0 & E1 along
+;; with the combined resource EBOX.

 (define_function_unit "ev5_ebox" 2 0
  (and (eq_attr "cpu" "ev5")
-       (eq_attr "type" "iadd,ilog,icmp,st,shift,imull,imulq,imulh,mvi"))
+       (eq_attr "type" "!fbr,fcmov,fadd,fmul,fcpys,fdiv"))
  1 1)

-;; Memory takes at least 2 clocks, and load cannot dual issue with stores.
-;; Return one from here and fix up with user-defined latencies in adjust_cost.
+; Memory takes at least 2 clocks.  Return one from here and fix up with
+; user-defined latencies in adjust_cost.
+; ??? How to: "An instruction of class LD cannot be issued in the _second_
+; cycle after an instruction of class ST is issued."
 (define_function_unit "ev5_ebox" 2 0
  (and (eq_attr "cpu" "ev5")
-       (eq_attr "type" "ld,ldsym"))
+       (eq_attr "type" "ild,fld,ldsym"))
  1 1)

+; Stores, shifts, multiplies can only issue to E0
 (define_function_unit "ev5_e0" 1 0
  (and (eq_attr "cpu" "ev5")
-       (eq_attr "type" "ld,ldsym"))
-  0 1
-  [(eq_attr "type" "st")])
-
-;; Conditional moves always take 2 ticks.
-(define_function_unit "ev5_ebox" 2 0
-  (and (eq_attr "cpu" "ev5")
-       (eq_attr "type" "cmov"))
-  2 1)
-
-;; Stores, shifts, multiplies can only issue to E0
-(define_function_unit "ev5_e0" 1 0
-  (and (eq_attr "cpu" "ev5")
-       (eq_attr "type" "st"))
+       (eq_attr "type" "ist,fst,shift,imul"))
  1 1)

-;; Motion video insns also issue only to E0, and take two ticks.
+; Motion video insns also issue only to E0, and take two ticks.
 (define_function_unit "ev5_e0" 1 0
  (and (eq_attr "cpu" "ev5")
       (eq_attr "type" "mvi"))
  2 1)

-;; But shifts and multiplies don't conflict with loads.
-(define_function_unit "ev5_e0" 1 0
+; Conditional moves always take 2 ticks.
+(define_function_unit "ev5_ebox" 2 0
  (and (eq_attr "cpu" "ev5")
-       (eq_attr "type" "shift,imull,imulq,imulh,mvi"))
-  1 1
-  [(eq_attr "type" "st,shift,imull,imulq,imulh,mvi")])
+       (eq_attr "type" "icmov"))
+  2 1)

-;; Branches can only issue to E1
+; Branches can only issue to E1
 (define_function_unit "ev5_e1" 1 0
  (and (eq_attr "cpu" "ev5")
       (eq_attr "type" "ibr,jsr"))
  1 1)

-;; Multiplies also use the integer multiplier.
-(define_function_unit "ev5_imult" 1 0
+; Multiplies also use the integer multiplier.
+; ??? How to: "No instruction can be issued to pipe E0 exactly two
+; cycles before an integer multiplication completes."
+(define_function_unit "imul" 1 0
  (and (eq_attr "cpu" "ev5")
-       (eq_attr "type" "imull"))
+       (and (eq_attr "type" "imul")
+	    (eq_attr "opsize" "si")))
  8 4)

-(define_function_unit "ev5_imult" 1 0
+(define_function_unit "imul" 1 0
  (and (eq_attr "cpu" "ev5")
-       (eq_attr "type" "imulq"))
+       (and (eq_attr "type" "imul")
+	    (eq_attr "opsize" "di")))
  12 8)

-(define_function_unit "ev5_imult" 1 0
+(define_function_unit "imul" 1 0
  (and (eq_attr "cpu" "ev5")
-       (eq_attr "type" "imulh"))
+       (and (eq_attr "type" "imul")
+	    (eq_attr "opsize" "udi")))
  14 8)

 ;; Similarly for the FPU we have two asymetric units.  But fcpys can issue
 ;; on either so we have to play the game again.

-(define_function_unit "ev5_fpu" 2 0
+(define_function_unit "ev5_fbox" 2 0
  (and (eq_attr "cpu" "ev5")
-       (eq_attr "type" "fadd,fmul,fcpys,fbr,fdivs,fdivt"))
+       (eq_attr "type" "fadd,fcmov,fmul,fcpys,fbr,fdiv"))
  4 1)
  
-;; Multiplies (resp. adds) also use the fmul (resp. fadd) units.
 (define_function_unit "ev5_fm" 1 0
  (and (eq_attr "cpu" "ev5")
       (eq_attr "type" "fmul"))
  4 1)

+; Add and cmov as you would expect; fbr never produces a result;
+; fdiv issues through fa to the divider.
 (define_function_unit "ev5_fa" 1 0
  (and (eq_attr "cpu" "ev5")
-       (eq_attr "type" "fadd"))
+       (eq_attr "type" "fadd,fcmov,fbr,fdiv"))
  4 1)

-(define_function_unit "ev5_fa" 1 0
+; ??? How to: "No instruction can be issued to pipe FA exactly five
+; cycles before a floating point divide completes."
+(define_function_unit "fdiv" 1 0
  (and (eq_attr "cpu" "ev5")
-       (eq_attr "type" "fbr"))
+       (and (eq_attr "type" "fdiv")
+	    (eq_attr "opsize" "si")))
+  15 15)				; 15 to 31 data dependent
+
+(define_function_unit "fdiv" 1 0
+  (and (eq_attr "cpu" "ev5")
+       (and (eq_attr "type" "fdiv")
+	    (eq_attr "opsize" "di")))
+  22 22)				; 22 to 60 data dependent
+
+;; EV6 scheduling.  EV6 can issue 4 insns per clock.
+;;
+;; EV6 has two symmetric pairs ("clusters") of two asymmetric integer units
+;; ("upper" and "lower"), yielding pipe names U0, U1, L0, L1.
+
+;; Conditional moves decompose into two independent primitives, each 
+;; taking one cycle.  Since ev6 is out-of-order, we can't see anything
+;; but two cycles.
+(define_function_unit "ev6_ebox" 4 0
+  (and (eq_attr "cpu" "ev6")
+       (eq_attr "type" "icmov"))
+  2 1)
+
+(define_function_unit "ev6_ebox" 4 0
+  (and (eq_attr "cpu" "ev6")
+       (eq_attr "type" "!fbr,fcmov,fadd,fmul,fcpys,fdiv,fsqrt"))
  1 1)

-(define_function_unit "ev5_fa" 1 0
-  (and (eq_attr "cpu" "ev5")
-       (eq_attr "type" "fdivs"))
-  15 1)
+;; Integer loads take at least 3 clocks, and only issue to lower units.
+;; Return one from here and fix up with user-defined latencies in adjust_cost.
+(define_function_unit "ev6_l" 2 0
+  (and (eq_attr "cpu" "ev6")
+       (eq_attr "type" "ild,ldsym,ist,fst"))
+  1 1)
+
+;; FP loads take at least 4 clocks.  Return two from here...
+(define_function_unit "ev6_l" 2 0
+  (and (eq_attr "cpu" "ev6")
+       (eq_attr "type" "fld"))
+  2 1)
+
+;; Motion video insns also issue only to U0, and take three ticks.
+(define_function_unit "ev6_u0" 1 0
+  (and (eq_attr "cpu" "ev6")
+       (eq_attr "type" "mvi"))
+  3 1)
+
+(define_function_unit "ev6_u" 2 0
+  (and (eq_attr "cpu" "ev6")
+       (eq_attr "type" "mvi"))
+  3 1)
+
+;; Shifts issue to either upper pipe.
+(define_function_unit "ev6_u" 2 0
+  (and (eq_attr "cpu" "ev6")
+       (eq_attr "type" "shift"))
+  1 1)
+
+;; Multiplies issue only to U1, and all take 7 ticks.
+;; Rather than create a new function unit just for U1, reuse IMUL
+(define_function_unit "imul" 1 0
+  (and (eq_attr "cpu" "ev6")
+       (eq_attr "type" "imul"))
+  7 1)
+
+(define_function_unit "ev6_u" 2 0
+  (and (eq_attr "cpu" "ev6")
+       (eq_attr "type" "imul"))
+  7 1)
+
+;; Branches issue to either upper pipe
+(define_function_unit "ev6_u" 2 0
+  (and (eq_attr "cpu" "ev6")
+       (eq_attr "type" "ibr"))
+  3 1)
+
+;; Calls only issue to L0.
+(define_function_unit "ev6_l0" 1 0
+  (and (eq_attr "cpu" "ev6")
+       (eq_attr "type" "jsr"))
+  1 1)
+
+(define_function_unit "ev6_l" 2 0
+  (and (eq_attr "cpu" "ev6")
+       (eq_attr "type" "jsr"))
+  1 1)
+
+;; Ftoi/itof only issue to lower pipes
+(define_function_unit "ev6_l" 2 0
+  (and (eq_attr "cpu" "ev6")
+       (eq_attr "type" "ftoi"))
+  3 1)
+
+(define_function_unit "ev6_l" 2 0
+  (and (eq_attr "cpu" "ev6")
+       (eq_attr "type" "itof"))
+  4 1)
+
+;; For the FPU we are very similar to EV5, except there's no insn that
+;; can issue to fm & fa, so we get to leave that out.
+  
+(define_function_unit "ev6_fm" 1 0
+  (and (eq_attr "cpu" "ev6")
+       (eq_attr "type" "fmul"))
+  4 1)
+
+(define_function_unit "ev6_fa" 1 0
+  (and (eq_attr "cpu" "ev6")
+       (eq_attr "type" "fadd,fcpys,fbr,fdiv,fsqrt"))
+  4 1)
+
+(define_function_unit "ev6_fa" 1 0
+  (and (eq_attr "cpu" "ev6")
+       (eq_attr "type" "fcmov"))
+  8 1)
+
+(define_function_unit "fdiv" 1 0
+  (and (eq_attr "cpu" "ev6")
+       (and (eq_attr "type" "fdiv")
+	    (eq_attr "opsize" "si")))
+  12 10)
+
+(define_function_unit "fdiv" 1 0
+  (and (eq_attr "cpu" "ev6")
+       (and (eq_attr "type" "fdiv")
+	    (eq_attr "opsize" "di")))
+  15 13)
+
+(define_function_unit "fsqrt" 1 0
+  (and (eq_attr "cpu" "ev6")
+       (and (eq_attr "type" "fsqrt")
+	    (eq_attr "opsize" "si")))
+  16 14)
+
+(define_function_unit "fsqrt" 1 0
+  (and (eq_attr "cpu" "ev6")
+       (and (eq_attr "type" "fsqrt")
+	    (eq_attr "opsize" "di")))
+  32 30)
+
+; ??? The FPU communicates with memory and the integer register file
+; via two fp store units.  We need a slot in the fst immediately, and
+; a slot in LOW after the operand data is ready.  At which point the
+; data may be moved either to the store queue or the integer register
+; file and the insn retired.

-(define_function_unit "ev5_fa" 1 0
-  (and (eq_attr "cpu" "ev5")
-       (eq_attr "type" "fdivt"))
-  22 1)

 ;; First define the arithmetic insns.  Note that the 32-bit forms also
 ;; sign-extend.
@ -238,7 +363,7 @@
   addl %1,$31,%0
   ldl %0,%1
   cvtql %1,%0\;cvtlq %0,%0"
-  [(set_attr "type" "iadd,ld,fadd")])
+  [(set_attr "type" "iadd,ild,fadd")])

 ;; Do addsi3 the way expand_binop would do if we didn't have one.  This
 ;; generates better code.  We have the anonymous addsi3 pattern below in
@ -595,7 +720,8 @@
 		 (match_operand:SI 2 "reg_or_0_operand" "rJ")))]
  ""
  "mull %r1,%r2,%0"
-  [(set_attr "type" "imull")])
+  [(set_attr "type" "imul")
+   (set_attr "opsize" "si")])

 (define_insn ""
  [(set (match_operand:DI 0 "register_operand" "=r")
@ -603,7 +729,8 @@
 				 (match_operand:SI 2 "reg_or_0_operand" "rJ"))))]
  ""
  "mull %r1,%r2,%0"
-  [(set_attr "type" "imull")])
+  [(set_attr "type" "imul")
+   (set_attr "opsize" "si")])

 (define_insn "muldi3"
  [(set (match_operand:DI 0 "register_operand" "=r")
@ -611,7 +738,7 @@
 		 (match_operand:DI 2 "reg_or_0_operand" "rJ")))]
  ""
  "mulq %r1,%r2,%0"
-  [(set_attr "type" "imulq")])
+  [(set_attr "type" "imul")])

 (define_insn "umuldi3_highpart"
  [(set (match_operand:DI 0 "register_operand" "=r")
@ -622,7 +749,8 @@
 	  (const_int 64))))]
  ""
  "umulh %1,%2,%0"
-  [(set_attr "type" "imulh")])
+  [(set_attr "type" "imul")
+   (set_attr "opsize" "udi")])

 (define_insn ""
  [(set (match_operand:DI 0 "register_operand" "=r")
@ -633,7 +761,8 @@
 	  (const_int 64))))]
  ""
  "umulh %1,%2,%0"
-  [(set_attr "type" "imulh")])
+  [(set_attr "type" "imul")
+   (set_attr "opsize" "udi")])

 ;; The divide and remainder operations always take their inputs from
 ;; r24 and r25, put their output in r27, and clobber r23 and r28.
@ -824,7 +953,7 @@
  "@
   and %1,255,%0
   ldbu %0,%1"
-  [(set_attr "type" "ilog,ld")])
+  [(set_attr "type" "ilog,ild")])

 (define_insn ""
  [(set (match_operand:SI 0 "register_operand" "=r")
@ -846,7 +975,7 @@
  "@
   and %1,255,%0
   ldbu %0,%1"
-  [(set_attr "type" "ilog,ld")])
+  [(set_attr "type" "ilog,ild")])

 (define_insn ""
  [(set (match_operand:DI 0 "register_operand" "=r")
@ -868,7 +997,7 @@
  "@
   zapnot %1,3,%0
   ldwu %0,%1"
-  [(set_attr "type" "shift,ld")])
+  [(set_attr "type" "shift,ild")])

 (define_insn ""
  [(set (match_operand:SI 0 "register_operand" "=r")
@ -890,7 +1019,7 @@
  "@
   zapnot %1,3,%0
   ldwu %0,%1"
-  [(set_attr "type" "shift,ld")])
+  [(set_attr "type" "shift,ild")])

 (define_insn ""
  [(set (match_operand:DI 0 "register_operand" "=r")
@ -985,7 +1114,9 @@
 	(unspec [(match_operand:DI 1 "register_operand" "r")] 1))]
  "TARGET_CIX"
  "cttz %1,%0"
-  [(set_attr "type" "shift")])
+  ; ev6 calls all mvi and cttz/ctlz/popc class imisc, so just 
+  ; reuse the existing type name.
+  [(set_attr "type" "mvi")])

 ;; Next come the shifts and the various extract and insert operations.

@ -1619,7 +1750,7 @@
   cpys %1,%1,%0
   ld%, %0,%1
   st%- %1,%0"
-  [(set_attr "type" "fcpys,ld,st")
+  [(set_attr "type" "fcpys,fld,fst")
   (set_attr "trap" "yes")])

 (define_insn ""
@ -1644,7 +1775,8 @@
 		(match_operand:SF 2 "reg_or_fp0_operand" "fG")))]
  "TARGET_FP && alpha_tp == ALPHA_TP_INSN"
  "div%,%)%& %R1,%R2,%0"
-  [(set_attr "type" "fdivs")
+  [(set_attr "type" "fdiv")
+   (set_attr "opsize" "si")
   (set_attr "trap" "yes")])

 (define_insn "divsf3"
@ -1653,7 +1785,8 @@
 		(match_operand:SF 2 "reg_or_fp0_operand" "fG")))]
  "TARGET_FP"
  "div%,%)%& %R1,%R2,%0"
-  [(set_attr "type" "fdivs")
+  [(set_attr "type" "fdiv")
+   (set_attr "opsize" "si")
   (set_attr "trap" "yes")])

 (define_insn ""
@ -1662,7 +1795,7 @@
 		(match_operand:DF 2 "reg_or_fp0_operand" "fG")))]
  "TARGET_FP && alpha_tp == ALPHA_TP_INSN"
  "div%-%)%& %R1,%R2,%0"
-  [(set_attr "type" "fdivt")
+  [(set_attr "type" "fdiv")
   (set_attr "trap" "yes")])

 (define_insn "divdf3"
@ -1671,7 +1804,7 @@
 		(match_operand:DF 2 "reg_or_fp0_operand" "fG")))]
  "TARGET_FP"
  "div%-%)%& %R1,%R2,%0"
-  [(set_attr "type" "fdivt")
+  [(set_attr "type" "fdiv")
   (set_attr "trap" "yes")])

 (define_insn ""
@ -1680,7 +1813,7 @@
 		(match_operand:DF 2 "reg_or_fp0_operand" "fG")))]
  "TARGET_FP && alpha_tp != ALPHA_TP_INSN"
  "div%-%)%& %R1,%R2,%0"
-  [(set_attr "type" "fdivt")
+  [(set_attr "type" "fdiv")
   (set_attr "trap" "yes")])

 (define_insn ""
@ -1690,7 +1823,7 @@
 		 (match_operand:SF 2 "reg_or_fp0_operand" "fG"))))]
  "TARGET_FP && alpha_tp != ALPHA_TP_INSN"
  "div%-%)%& %R1,%R2,%0"
-  [(set_attr "type" "fdivt")
+  [(set_attr "type" "fdiv")
   (set_attr "trap" "yes")])

 (define_insn ""
@ -1699,7 +1832,7 @@
 		(float_extend:DF (match_operand:SF 2 "reg_or_fp0_operand" "fG"))))]
  "TARGET_FP && alpha_tp != ALPHA_TP_INSN"
  "div%-%)%& %R1,%R2,%0"
-  [(set_attr "type" "fdivt")
+  [(set_attr "type" "fdiv")
   (set_attr "trap" "yes")])

 (define_insn ""
@ -1826,29 +1959,38 @@
  [(set_attr "type" "fadd")
   (set_attr "trap" "yes")])

+(define_insn ""
+  [(set (match_operand:SF 0 "register_operand" "=&f")
+	(sqrt:SF (match_operand:SF 1 "reg_or_fp0_operand" "fG")))]
+  "TARGET_FP && TARGET_CIX && alpha_tp == ALPHA_TP_INSN"
+  "sqrt%,%)%& %R1,%0"
+  [(set_attr "type" "fsqrt")
+   (set_attr "opsize" "si")
+   (set_attr "trap" "yes")])
+
 (define_insn "sqrtsf2"
  [(set (match_operand:SF 0 "register_operand" "=f")
 	(sqrt:SF (match_operand:SF 1 "reg_or_fp0_operand" "fG")))]
  "TARGET_FP && TARGET_CIX"
-  "sqrt%, %1,%0"
-  [(set_attr "type" "fdivs")
+  "sqrt%,%)%& %R1,%0"
+  [(set_attr "type" "fsqrt")
+   (set_attr "opsize" "si")
+   (set_attr "trap" "yes")])
+
+(define_insn ""
+  [(set (match_operand:DF 0 "register_operand" "=&f")
+	(sqrt:DF (match_operand:DF 1 "reg_or_fp0_operand" "fG")))]
+  "TARGET_FP && TARGET_CIX && alpha_tp == ALPHA_TP_INSN"
+  "sqrt%-%)%& %R1,%0"
+  [(set_attr "type" "fsqrt")
   (set_attr "trap" "yes")])

 (define_insn "sqrtdf2"
  [(set (match_operand:DF 0 "register_operand" "=f")
 	(sqrt:DF (match_operand:DF 1 "reg_or_fp0_operand" "fG")))]
  "TARGET_FP && TARGET_CIX"
-  "sqrt%- %1,%0"
-  [(set_attr "type" "fdivt")
-   (set_attr "trap" "yes")])
-
-(define_insn ""
-  [(set (match_operand:DF 0 "register_operand" "=f")
-	(sqrt:DF (float_extend:DF
-		  (match_operand:SF 1 "reg_or_fp0_operand" "fG"))))]
-  "TARGET_FP && TARGET_CIX&& alpha_tp != ALPHA_TP_INSN"
-  "sqrt%- %1,%0"
-  [(set_attr "type" "fdivt")
+  "sqrt%-%)%& %1,%0"
+  [(set_attr "type" "fsqrt")
   (set_attr "trap" "yes")])

 ;; Next are all the integer comparisons, and conditional moves and branches
@ -1889,7 +2031,7 @@
   cmov%D2 %r3,%5,%0
   cmov%c2 %r4,%1,%0
   cmov%d2 %r4,%5,%0"
-  [(set_attr "type" "cmov")])
+  [(set_attr "type" "icmov")])

 (define_insn ""
  [(set (match_operand:DI 0 "register_operand" "=r,r,r,r")
@ -1905,7 +2047,7 @@
   cmov%D2 %r3,%5,%0
   cmov%c2 %r4,%1,%0
   cmov%d2 %r4,%5,%0"
-  [(set_attr "type" "cmov")])
+  [(set_attr "type" "icmov")])

 (define_insn ""
  [(set (match_operand:DI 0 "register_operand" "=r,r")
@ -1920,7 +2062,7 @@
  "@
   cmovlbc %r2,%1,%0
   cmovlbs %r2,%3,%0"
-  [(set_attr "type" "cmov")])
+  [(set_attr "type" "icmov")])

 (define_insn ""
  [(set (match_operand:DI 0 "register_operand" "=r,r")
@ -1935,7 +2077,7 @@
  "@
   cmovlbs %r2,%1,%0
   cmovlbc %r2,%3,%0"
-  [(set_attr "type" "cmov")])
+  [(set_attr "type" "icmov")])

 ;; This form is added since combine thinks that an IF_THEN_ELSE with both
 ;; arms constant is a single insn, so it won't try to form it if combine
@ -1954,7 +2096,7 @@
   (clobber (match_scratch:DI 4 "=&r"))]
  ""
  "addq %0,%1,%4\;cmov%C2 %r3,%4,%0"
-  [(set_attr "type" "cmov")])
+  [(set_attr "type" "icmov")])

 (define_split
  [(set (match_operand:DI 0 "register_operand" "")
@ -2165,7 +2307,7 @@
 		 (const_int 0)))]
  ""
  "cmovlt %0,0,%0"
-  [(set_attr "type" "cmov")])
+  [(set_attr "type" "icmov")])

 (define_expand "smindi3"
  [(set (match_dup 3)
@ -2196,7 +2338,7 @@
 		 (const_int 0)))]
  ""
  "cmovgt %0,0,%0"
-  [(set_attr "type" "cmov")])
+  [(set_attr "type" "icmov")])

 (define_expand "umaxdi3"
  [(set (match_dup 3) 
@ -2389,7 +2531,7 @@
  "@
   fcmov%C3 %R4,%R1,%0
   fcmov%D3 %R4,%R5,%0"
-  [(set_attr "type" "fadd")])
+  [(set_attr "type" "fcmov")])

 (define_insn ""
  [(set (match_operand:DF 0 "register_operand" "=f,f")
@ -2403,7 +2545,7 @@
  "@
   fcmov%C3 %R4,%R1,%0
   fcmov%D3 %R4,%R5,%0"
-  [(set_attr "type" "fadd")])
+  [(set_attr "type" "fcmov")])

 (define_insn ""
  [(set (match_operand:SF 0 "register_operand" "=&f,f")
@ -2417,7 +2559,7 @@
  "@
   fcmov%C3 %R4,%R1,%0
   fcmov%D3 %R4,%R5,%0"
-  [(set_attr "type" "fadd")])
+  [(set_attr "type" "fcmov")])

 (define_insn ""
  [(set (match_operand:SF 0 "register_operand" "=f,f")
@ -2431,7 +2573,7 @@
  "@
   fcmov%C3 %R4,%R1,%0
   fcmov%D3 %R4,%R5,%0"
-  [(set_attr "type" "fadd")])
+  [(set_attr "type" "fcmov")])

 (define_insn ""
  [(set (match_operand:DF 0 "register_operand" "=f,f")
@ -2445,7 +2587,7 @@
  "@
   fcmov%C3 %R4,%R1,%0
   fcmov%D3 %R4,%R5,%0"
-  [(set_attr "type" "fadd")])
+  [(set_attr "type" "fcmov")])

 (define_insn ""
  [(set (match_operand:DF 0 "register_operand" "=f,f")
@ -2460,7 +2602,7 @@
  "@
   fcmov%C3 %R4,%R1,%0
   fcmov%D3 %R4,%R5,%0"
-  [(set_attr "type" "fadd")])
+  [(set_attr "type" "fcmov")])

 (define_insn ""
  [(set (match_operand:SF 0 "register_operand" "=f,f")
@ -2475,7 +2617,7 @@
  "@
   fcmov%C3 %R4,%R1,%0
   fcmov%D3 %R4,%R5,%0"
-  [(set_attr "type" "fadd")])
+  [(set_attr "type" "fcmov")])

 (define_insn ""
  [(set (match_operand:DF 0 "register_operand" "=f,f")
@ -2490,7 +2632,7 @@
  "@
   fcmov%C3 %R4,%R1,%0
   fcmov%D3 %R4,%R5,%0"
-  [(set_attr "type" "fadd")])
+  [(set_attr "type" "fcmov")])

 (define_expand "maxdf3"
  [(set (match_dup 3)
@ -3687,7 +3829,7 @@
 ;; want to have to include pal.h in our .s file.
 ;;
 ;; Technically the type for call_pal is jsr, but we use that for determining
-;; if we need a GP.  Use ibr instead since it has the same scheduling 
+;; if we need a GP.  Use ibr instead since it has the same EV5 scheduling
 ;; characteristics.
 (define_insn ""
  [(unspec_volatile [(const_int 0)] 0)]
@ -3702,8 +3844,9 @@
 (define_insn ""
  [(set (match_operand:SF 0 "nonimmediate_operand" "=r,r,m,f,f,f,m")
 	(match_operand:SF 1 "input_operand" "rG,m,rG,f,G,m,fG"))]
-  "register_operand (operands[0], SFmode)
-   || reg_or_fp0_operand (operands[1], SFmode)"
+  "! TARGET_CIX
+   && (register_operand (operands[0], SFmode)
+       || reg_or_fp0_operand (operands[1], SFmode))"
  "@
   bis %r1,%r1,%0
   ldl %0,%1
@ -3712,13 +3855,32 @@
   cpys $f31,$f31,%0
   ld%, %0,%1
   st%, %R1,%0"
-  [(set_attr "type" "ilog,ld,st,fcpys,fcpys,ld,st")])
+  [(set_attr "type" "ilog,ild,ist,fcpys,fcpys,fld,fst")])
+
+(define_insn ""	; SFmode move used only when TARGET_CIX (see condition below)
+  [(set (match_operand:SF 0 "nonimmediate_operand" "=r,r,m,f,f,f,m,f,*r")
+	(match_operand:SF 1 "input_operand" "rG,m,rG,f,G,m,fG,r,*f"))]
+  "TARGET_CIX
+   && (register_operand (operands[0], SFmode)
+       || reg_or_fp0_operand (operands[1], SFmode))"
+  "@
+   bis %r1,%r1,%0
+   ldl %0,%1
+   stl %r1,%0
+   cpys %1,%1,%0
+   cpys $f31,$f31,%0
+   ld%, %0,%1
+   st%, %R1,%0
+   itofs %1,%0
+   ftois %1,%0"
+  [(set_attr "type" "ilog,ild,ist,fcpys,fcpys,fld,fst,itof,ftoi")])	; one type per alternative, in order; last two are the itofs/ftois register moves

 (define_insn ""
  [(set (match_operand:DF 0 "nonimmediate_operand" "=r,r,m,f,f,f,m")
 	(match_operand:DF 1 "input_operand" "rG,m,rG,f,G,m,fG"))]
-  "register_operand (operands[0], DFmode)
-   || reg_or_fp0_operand (operands[1], DFmode)"
+  "! TARGET_CIX
+   && (register_operand (operands[0], DFmode)
+       || reg_or_fp0_operand (operands[1], DFmode))"
  "@
   bis %r1,%r1,%0
   ldq %0,%1
@ -3727,7 +3889,25 @@
   cpys $f31,$f31,%0
   ld%- %0,%1
   st%- %R1,%0"
-  [(set_attr "type" "ilog,ld,st,fcpys,fcpys,ld,st")])
+  [(set_attr "type" "ilog,ild,ist,fcpys,fcpys,fld,fst")])
+
+(define_insn ""	; DFmode move used only when TARGET_CIX (see condition below)
+  [(set (match_operand:DF 0 "nonimmediate_operand" "=r,r,m,f,f,f,m,f,*r")
+	(match_operand:DF 1 "input_operand" "rG,m,rG,f,G,m,fG,r,*f"))]
+  "TARGET_CIX
+   && (register_operand (operands[0], DFmode)
+       || reg_or_fp0_operand (operands[1], DFmode))"
+  "@
+   bis %r1,%r1,%0
+   ldq %0,%1
+   stq %r1,%0
+   cpys %1,%1,%0
+   cpys $f31,$f31,%0
+   ld%- %0,%1
+   st%- %R1,%0
+   itoft %1,%0
+   ftoit %1,%0"
+  [(set_attr "type" "ilog,ild,ist,fcpys,fcpys,fld,fst,itof,ftoi")])	; one type per alternative, in order; last two are the itoft/ftoit register moves

 (define_expand "movsf"
  [(set (match_operand:SF 0 "nonimmediate_operand" "")
@ -3769,11 +3949,11 @@
   cpys $f31,$f31,%0
   ld%, %0,%1
   st%, %R1,%0"
-  [(set_attr "type" "ilog,ilog,ilog,iadd,iadd,ld,st,fcpys,fcpys,ld,st")])
+  [(set_attr "type" "ilog,ilog,ilog,iadd,iadd,ild,ist,fcpys,fcpys,fld,fst")])

 (define_insn ""
-  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r,r,r,r,r,m,f,f,f,m,r,f")
-	(match_operand:SI 1 "input_operand" "r,J,I,K,L,m,rJ,f,J,m,fG,f,r"))]
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r,r,r,r,r,m,f,f,f,m,r,*f")
+	(match_operand:SI 1 "input_operand" "r,J,I,K,L,m,rJ,f,J,m,fG,f,*r"))]
  "! TARGET_WINDOWS_NT && ! TARGET_OPEN_VMS && TARGET_CIX
   && (register_operand (operands[0], SImode)
       || reg_or_0_operand (operands[1], SImode))"
@ -3790,8 +3970,8 @@
   ld%, %0,%1
   st%, %R1,%0
   ftois %1,%0
-   itof%, %1,%0"
-  [(set_attr "type" "ilog,ilog,ilog,iadd,iadd,ld,st,fcpys,fcpys,ld,st,ld,st")])
+   itofs %1,%0"
+  [(set_attr "type" "ilog,ilog,ilog,iadd,iadd,ild,ist,fcpys,fcpys,fld,fst,ftoi,itof")])

 (define_insn ""
  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r,r,r,r,r,r,m,f,f,f,m")
@ -3812,7 +3992,7 @@
   cpys $f31,$f31,%0
   ld%, %0,%1
   st%, %R1,%0"
-  [(set_attr "type" "ilog,ilog,ilog,iadd,iadd,ldsym,ld,st,fcpys,fcpys,ld,st")])
+  [(set_attr "type" "ilog,ilog,ilog,iadd,iadd,ldsym,ild,ist,fcpys,fcpys,fld,fst")])

 (define_insn ""
  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,r,f,f")
@ -3844,7 +4024,7 @@
   stw %r1,%0
   cpys %1,%1,%0
   cpys $f31,$f31,%0"
-  [(set_attr "type" "ilog,ilog,ilog,iadd,ld,st,fcpys,fcpys")])
+  [(set_attr "type" "ilog,ilog,ilog,iadd,ild,ist,fcpys,fcpys")])

 (define_insn ""
  [(set (match_operand:QI 0 "nonimmediate_operand" "=r,r,r,r,f,f")
@ -3876,7 +4056,7 @@
   stb %r1,%0
   cpys %1,%1,%0
   cpys $f31,$f31,%0"
-  [(set_attr "type" "ilog,ilog,ilog,iadd,ld,st,fcpys,fcpys")])
+  [(set_attr "type" "ilog,ilog,ilog,iadd,ild,ist,fcpys,fcpys")])

 ;; We do two major things here: handle mem->mem and construct long
 ;; constants.
@ -3940,11 +4120,11 @@
   cpys $f31,$f31,%0
   ldt %0,%1
   stt %R1,%0"
-  [(set_attr "type" "ilog,ilog,ilog,iadd,iadd,ldsym,ld,st,fcpys,fcpys,ld,st")])
+  [(set_attr "type" "ilog,ilog,ilog,iadd,iadd,ldsym,ild,ist,fcpys,fcpys,fld,fst")])

 (define_insn ""
-  [(set (match_operand:DI 0 "general_operand" "=r,r,r,r,r,r,r,m,f,f,f,Q,r,f")
-	(match_operand:DI 1 "input_operand" "r,J,I,K,L,s,m,rJ,f,J,Q,fG,f,r"))]
+  [(set (match_operand:DI 0 "general_operand" "=r,r,r,r,r,r,r,m,f,f,f,Q,r,*f")
+	(match_operand:DI 1 "input_operand" "r,J,I,K,L,s,m,rJ,f,J,Q,fG,f,*r"))]
  "TARGET_CIX
   && (register_operand (operands[0], DImode)
       || reg_or_0_operand (operands[1], DImode))"
@ -3963,7 +4143,7 @@
   stt %R1,%0
   ftoit %1,%0
   itoft %1,%0"
-  [(set_attr "type" "ilog,ilog,ilog,iadd,iadd,ldsym,ld,st,fcpys,fcpys,ld,st,ld,st")])
+  [(set_attr "type" "ilog,ilog,ilog,iadd,iadd,ldsym,ild,ist,fcpys,fcpys,fld,fst,ftoi,itof")])

 ;; We do three major things here: handle mem->mem, put 64-bit constants in
 ;; memory, and construct long 32-bit constants.
--- a/gcc/genattrtab.c
+++ b/gcc/genattrtab.c
@ -318,12 +318,12 @@ static int address_used;
 static int length_used;
 static int num_delays;
 static int have_annul_true, have_annul_false;
-static int num_units;
+static int num_units, num_unit_opclasses;
 static int num_insn_ents;

 /* Used as operand to `operate_exp':  */

-enum operator {PLUS_OP, MINUS_OP, POS_MINUS_OP, EQ_OP, OR_OP, MAX_OP, MIN_OP, RANGE_OP};
+enum operator {PLUS_OP, MINUS_OP, POS_MINUS_OP, EQ_OP, OR_OP, ORX_OP, MAX_OP, MIN_OP, RANGE_OP};

 /* Stores, for each insn code, the number of constraint alternatives.  */

@ -381,7 +381,9 @@ static rtx check_attr_value	PROTO((rtx, struct attr_desc *));
 static rtx convert_set_attr_alternative PROTO((rtx, int, int, int));
 static rtx convert_set_attr	PROTO((rtx, int, int, int));
 static void check_defs		PROTO((void));
+#if 0
 static rtx convert_const_symbol_ref PROTO((rtx, struct attr_desc *));
+#endif
 static rtx make_canonical	PROTO((struct attr_desc *, rtx));
 static struct attr_value *get_attr_value PROTO((rtx, struct attr_desc *, int));
 static rtx copy_rtx_unchanging	PROTO((rtx));
@ -447,6 +449,8 @@ static void write_eligible_delay PROTO((char *));
 static void write_function_unit_info PROTO((void));
 static void write_complex_function PROTO((struct function_unit *, char *,
 					  char *));
+static int write_expr_attr_cache PROTO((rtx, struct attr_desc *));
+static void write_toplevel_expr	PROTO((rtx));
 static int n_comma_elts		PROTO((char *));
 static char *next_comma_elt	PROTO((char **));
 static struct attr_desc *find_attr PROTO((char *, int));
@ -1052,7 +1056,7 @@ check_attr_test (exp, is_const)
      /* These cases can't be simplified.  */
      RTX_UNCHANGING_P (exp) = 1;
      break;
-
+ 
    case LE:  case LT:  case GT:  case GE:
    case LEU: case LTU: case GTU: case GEU:
    case NE:  case EQ:
@ -1144,6 +1148,16 @@ check_attr_value (exp, attr)
      XEXP (exp, 2) = check_attr_value (XEXP (exp, 2), attr);
      break;

+    case IOR:
+    case AND:
+      XEXP (exp, 0) = check_attr_value (XEXP (exp, 0), attr);
+      XEXP (exp, 1) = check_attr_value (XEXP (exp, 1), attr);
+      break;
+
+    case FFS:
+      XEXP (exp, 0) = check_attr_value (XEXP (exp, 0), attr);
+      break;
+
    case COND:
      if (XVECLEN (exp, 0) % 2 != 0)
 	fatal ("First operand of COND must have even length");
@ -1303,6 +1317,7 @@ check_defs ()
    }
 }

+#if 0
 /* Given a constant SYMBOL_REF expression, convert to a COND that
   explicitly tests each enumerated value.  */

@ -1353,6 +1368,7 @@ convert_const_symbol_ref (exp, attr)

  return condexp;
 }
+#endif

 /* Given a valid expression for an attribute value, remove any IF_THEN_ELSE
   expressions by converting them into a COND.  This removes cases from this
@ -1390,6 +1406,10 @@ make_canonical (attr, exp)
 	 This makes the COND something that won't be considered an arbitrary
 	 expression by walk_attr_value.  */
      RTX_UNCHANGING_P (exp) = 1;
+#if 0
+      /* ??? Why do we do this?  With attribute values { A B C D E }, this
+         tends to generate (!(x==A) && !(x==B) && !(x==C) && !(x==D)) rather
+	 than (x==E). */
      exp = convert_const_symbol_ref (exp, attr);
      RTX_UNCHANGING_P (exp) = 1;
      exp = check_attr_value (exp, attr);
@ -1397,6 +1417,10 @@ make_canonical (attr, exp)
         new expression is rescanned, all symbol_ref notes are marked as
 	 unchanging.  */
      goto cond;
+#else
+      exp = check_attr_value (exp, attr);
+      break;
+#endif

    case IF_THEN_ELSE:
      newexp = rtx_alloc (COND);
@ -1634,6 +1658,7 @@ operate_exp (op, left, right)
 	      break;

 	    case OR_OP:
+	    case ORX_OP:
 	      i = left_value | right_value;
 	      break;

@ -1663,6 +1688,10 @@ operate_exp (op, left, right)
 	      abort ();
 	    }

+	  if (i == left_value)
+	    return left;
+	  if (i == right_value)
+	    return right;
 	  return make_numeric_value (i);
 	}
      else if (GET_CODE (right) == IF_THEN_ELSE)
@ -1715,6 +1744,13 @@ operate_exp (op, left, right)
 	fatal ("Badly formed attribute value");
    }

+  /* A hack to prevent expand_units from completely blowing up: ORX_OP does
+     not associate through IF_THEN_ELSE.  */
+  else if (op == ORX_OP && GET_CODE (right) == IF_THEN_ELSE)
+    {
+      return attr_rtx (IOR, left, right);
+    }
+
  /* Otherwise, do recursion the other way.  */
  else if (GET_CODE (left) == IF_THEN_ELSE)
    {
@ -1857,18 +1893,48 @@ expand_units ()
  newexp = rtx_alloc (IF_THEN_ELSE);
  XEXP (newexp, 2) = make_numeric_value (0);

-  /* Merge each function unit into the unit mask attributes.  */
-  for (unit = units; unit; unit = unit->next)
+  /* If we have just a few units, we may be all right expanding the whole
+     thing.  But the expansion is 2**N in space on the number of opclasses,
+     so we can't do this for very long -- Alpha and MIPS in particular have
+     problems with this.  So in that situation, we fall back on an alternate
+     implementation method.  */
+#define NUM_UNITOP_CUTOFF 20
+
+  if (num_unit_opclasses < NUM_UNITOP_CUTOFF)
    {
-      XEXP (newexp, 0) = unit->condexp;
-      XEXP (newexp, 1) = make_numeric_value (1 << unit->num);
-      unitsmask = operate_exp (OR_OP, unitsmask, newexp);
+      /* Merge each function unit into the unit mask attributes.  */
+      for (unit = units; unit; unit = unit->next)
+        {
+          XEXP (newexp, 0) = unit->condexp;
+          XEXP (newexp, 1) = make_numeric_value (1 << unit->num);
+          unitsmask = operate_exp (OR_OP, unitsmask, newexp);
+        }
+    }
+  else
+    {
+      /* Merge each function unit into the unit mask attributes.  */
+      for (unit = units; unit; unit = unit->next)
+        {
+          XEXP (newexp, 0) = unit->condexp;
+          XEXP (newexp, 1) = make_numeric_value (1 << unit->num);
+          unitsmask = operate_exp (ORX_OP, unitsmask, attr_copy_rtx (newexp));
+        }
    }

  /* Simplify the unit mask expression, encode it, and make an attribute
     for the function_units_used function.  */
  unitsmask = simplify_by_exploding (unitsmask);
-  unitsmask = encode_units_mask (unitsmask);
+
+  if (num_unit_opclasses < NUM_UNITOP_CUTOFF)
+    unitsmask = encode_units_mask (unitsmask);
+  else
+    {
+      /* We can no longer encode unitsmask at compile time, so emit code to
+         calculate it at runtime.  Rather, put a marker for where we'd do
+	 the code, and actually output it in write_attr_get().  */
+      unitsmask = attr_rtx (FFS, unitsmask);
+    }
+
  make_internal_attr ("*function_units_used", unitsmask, 2);

  /* Create an array of ops for each unit.  Add an extra unit for the
@ -2737,6 +2803,26 @@ evaluate_eq_attr (exp, value, insn_code, insn_index)
      else
 	newexp = false_rtx;
    }
+  else if (GET_CODE (value) == SYMBOL_REF)
+    {
+      char *p, *string;
+
+      if (GET_CODE (exp) != EQ_ATTR)
+	abort();
+
+      string = (char *) alloca (2 + strlen (XSTR (exp, 0))
+				+ strlen (XSTR (exp, 1)));
+      strcpy (string, XSTR (exp, 0));
+      strcat (string, "_");
+      strcat (string, XSTR (exp, 1));
+      for (p = string; *p ; p++)
+	if (*p >= 'a' && *p <= 'z')
+	  *p -= 'a' - 'A';
+      
+      newexp = attr_rtx (EQ, value,
+			 attr_rtx (SYMBOL_REF,
+				   attr_string(string, strlen(string))));
+    }
  else if (GET_CODE (value) == COND)
    {
      /* We construct an IOR of all the cases for which the requested attribute
@ -3694,7 +3780,7 @@ add_values_to_cover (dim)
    abort ();
  else if (nalt == dim->num_values)
    ; /* Ok.  */
-  else if (nalt * 2 < dim->num_values * 3)
+  else if (nalt * 2 >= dim->num_values)
    {
      /* Most all the values of the attribute are used, so add all the unused
 	 values.  */
@ -4292,6 +4378,7 @@ gen_unit (def)
  op->issue_delay = issue_delay;
  op->next = unit->ops;
  unit->ops = op;
+  num_unit_opclasses++;

  /* Set our issue expression based on whether or not an optional conflict
     vector was specified.  */
@ -4319,14 +4406,18 @@ gen_unit (def)
 }

 /* Given a piece of RTX, print a C expression to test it's truth value.
+
   We use AND and IOR both for logical and bit-wise operations, so 
   interpret them as logical unless they are inside a comparison expression.
-   The second operand of this function will be non-zero in that case.  */
+   The first bit of FLAGS will be non-zero in that case.
+
+   Set the second bit of FLAGS to make references to attribute values use
+   a cached local variable instead of calling a function.  */

 static void
-write_test_expr (exp, in_comparison)
+write_test_expr (exp, flags)
     rtx exp;
-     int in_comparison;
+     int flags;
 {
  int comparison_operator = 0;
  RTX_CODE code;
@ -4348,7 +4439,7 @@ write_test_expr (exp, in_comparison)
    case PLUS:   case MINUS:  case MULT:     case DIV:      case MOD:
    case AND:    case IOR:    case XOR:
    case ASHIFT: case LSHIFTRT: case ASHIFTRT:
-      write_test_expr (XEXP (exp, 0), in_comparison || comparison_operator);
+      write_test_expr (XEXP (exp, 0), flags | comparison_operator);
      switch (code)
        {
 	case EQ:
@ -4397,13 +4488,13 @@ write_test_expr (exp, in_comparison)
 	  printf (" %% ");
 	  break;
 	case AND:
-	  if (in_comparison)
+	  if (flags & 1)
 	    printf (" & ");
 	  else
 	    printf (" && ");
 	  break;
 	case IOR:
-	  if (in_comparison)
+	  if (flags & 1)
 	    printf (" | ");
 	  else
 	    printf (" || ");
@ -4422,12 +4513,12 @@ write_test_expr (exp, in_comparison)
 	  abort ();
        }

-      write_test_expr (XEXP (exp, 1), in_comparison || comparison_operator);
+      write_test_expr (XEXP (exp, 1), flags | comparison_operator);
      break;

    case NOT:
      /* Special-case (not (eq_attrq "alternative" "x")) */
-      if (! in_comparison && GET_CODE (XEXP (exp, 0)) == EQ_ATTR
+      if (! (flags & 1) && GET_CODE (XEXP (exp, 0)) == EQ_ATTR
 	  && XSTR (XEXP (exp, 0), 0) == alternative_name)
 	{
 	  printf ("which_alternative != %s", XSTR (XEXP (exp, 0), 1));
@ -4441,7 +4532,7 @@ write_test_expr (exp, in_comparison)
      switch (code)
 	{
 	case NOT:
-	  if (in_comparison)
+	  if (flags & 1)
 	    printf ("~ ");
 	  else
 	    printf ("! ");
@ -4456,14 +4547,14 @@ write_test_expr (exp, in_comparison)
 	  abort ();
 	}

-      write_test_expr (XEXP (exp, 0), in_comparison);
+      write_test_expr (XEXP (exp, 0), flags);
      break;

    /* Comparison test of an attribute with a value.  Most of these will
       have been removed by optimization.   Handle "alternative"
       specially and give error if EQ_ATTR present inside a comparison.  */
    case EQ_ATTR:
-      if (in_comparison)
+      if (flags & 1)
 	fatal ("EQ_ATTR not valid inside comparison");

      if (XSTR (exp, 0) == alternative_name)
@ -4480,18 +4571,22 @@ write_test_expr (exp, in_comparison)
 	{
 	  write_test_expr (evaluate_eq_attr (exp, attr->default_val->value,
 					     -2, -2),
-			   in_comparison);
+			   flags);
 	}
      else
 	{
-	  printf ("get_attr_%s (insn) == ", attr->name);
-	  write_attr_valueq (attr, XSTR (exp, 1)); 
+	  if (flags & 2)
+	    printf ("attr_%s", attr->name);
+	  else
+	    printf ("get_attr_%s (insn)", attr->name);
+	  printf (" == ");
+	  write_attr_valueq (attr, XSTR (exp, 1));
 	}
      break;

    /* Comparison test of flags for define_delays.  */
    case ATTR_FLAG:
-      if (in_comparison)
+      if (flags & 1)
 	fatal ("ATTR_FLAG not valid inside comparison");
      printf ("(flags & ATTR_FLAG_%s) != 0", XSTR (exp, 0));
      break;
@ -4541,6 +4636,18 @@ write_test_expr (exp, in_comparison)
      printf ("insn_current_address");
      break;

+    case CONST_STRING:
+      printf ("%s", XSTR (exp, 0));
+      break;
+
+    case IF_THEN_ELSE:
+      write_test_expr (XEXP (exp, 0), flags & 2);
+      printf (" ? ");
+      write_test_expr (XEXP (exp, 1), flags | 1);
+      printf (" : ");
+      write_test_expr (XEXP (exp, 2), flags | 1);
+      break;
+
    default:
      fatal ("bad RTX code `%s' in attribute calculation\n",
 	     GET_RTX_NAME (code));
@ -4707,17 +4814,40 @@ write_attr_get (attr)
      printf ("}\n\n");
      return;
    }
+
  printf ("     rtx insn;\n");
  printf ("{\n");
-  printf ("  switch (recog_memoized (insn))\n");
-  printf ("    {\n");

-  for (av = attr->first_value; av; av = av->next)
-    if (av != common_av)
-      write_attr_case (attr, av, 1, "return", ";", 4, true_rtx);
+  if (GET_CODE (common_av->value) == FFS)
+    {
+      rtx p = XEXP (common_av->value, 0);

-  write_attr_case (attr, common_av, 0, "return", ";", 4, true_rtx);
-  printf ("    }\n}\n\n");
+      /* No need to emit code to abort if the insn is unrecognized; the 
+         other get_attr_foo functions will do that when we call them.  */
+
+      write_toplevel_expr (p);
+
+      printf ("\n  if (accum && accum == (accum & -accum))\n");
+      printf ("    {\n");
+      printf ("      int i;\n");
+      printf ("      for (i = 0; accum >>= 1; ++i) continue;\n");
+      printf ("      accum = i;\n");
+      printf ("    }\n  else\n");
+      printf ("    accum = ~accum;\n");
+      printf ("  return accum;\n}\n\n");
+    }
+  else
+    {
+      printf ("  switch (recog_memoized (insn))\n");
+      printf ("    {\n");
+
+      for (av = attr->first_value; av; av = av->next)
+	if (av != common_av)
+	  write_attr_case (attr, av, 1, "return", ";", 4, true_rtx);
+
+      write_attr_case (attr, common_av, 0, "return", ";", 4, true_rtx);
+      printf ("    }\n}\n\n");
+    }
 }

 /* Given an AND tree of known true terms (because we are inside an `if' with
@ -4927,6 +5057,90 @@ write_attr_case (attr, av, write_case_lines, prefix, suffix, indent,
  printf ("\n");
 }

+/* If P contains an EQ_ATTR of ATTR, emit a local caching it and return 1.  */
+
+static int
+write_expr_attr_cache (p, attr)
+     rtx p;
+     struct attr_desc *attr;
+{
+  char *fmt;
+  int i, ie, j, je;
+
+  if (GET_CODE (p) == EQ_ATTR)
+    {
+      if (XSTR (p, 0) != attr->name)	/* Pointer comparison of the names.  */
+	return 0;	/* A test of some other attribute.  */
+
+      if (!attr->is_numeric)	/* Choose the C type of the cached local.  */
+	printf ("  register enum attr_%s ", attr->name);
+      else if (attr->unsigned_p)
+	printf ("  register unsigned int ");
+      else
+	printf ("  register int ");
+
+      printf ("attr_%s = get_attr_%s (insn);\n", attr->name, attr->name);
+      return 1;	/* Declaration written; callers stop searching.  */
+    }
+
+  fmt = GET_RTX_FORMAT (GET_CODE (p));	/* Not an EQ_ATTR: scan P's operands.  */
+  ie = GET_RTX_LENGTH (GET_CODE (p));
+  for (i = 0; i < ie; i++)
+    {
+      switch (*fmt++)
+	{
+	case 'e':	/* Operand I is a single sub-expression.  */
+	  if (write_expr_attr_cache (XEXP (p, i), attr))
+	    return 1;	/* First hit wins, so ATTR is declared only once.  */
+	  break;
+
+	case 'E':	/* Operand I is a vector of sub-expressions.  */
+	  je = XVECLEN (p, i);
+	  for (j = 0; j < je; ++j)
+	    if (write_expr_attr_cache (XVECEXP (p, i, j), attr))
+	      return 1;
+	  break;
+	}
+    }
+
+  return 0;	/* ATTR is not tested anywhere inside P.  */
+}
+
+/* Emit statements that evaluate P into a local `accum'.  A front end to
+   write_test_expr: first cache each non-const attribute P tests in a local,
+   then emit one `accum |= ...;' per IOR operand to cater to older compilers.  */
+
+static void
+write_toplevel_expr (p)
+     rtx p;
+{
+  struct attr_desc *attr;
+  int i;
+
+  for (i = 0; i < MAX_ATTRS_INDEX; ++i)	/* Declarations of cached locals.  */
+    for (attr = attrs[i]; attr ; attr = attr->next)
+      if (!attr->is_const)
+	write_expr_attr_cache (p, attr);
+
+  printf("  register unsigned long accum = 0;\n\n");
+
+  while (GET_CODE (p) == IOR)	/* Peel one operand off the IOR chain per pass.  */
+    {
+      rtx e;
+      if (GET_CODE (XEXP (p, 0)) == IOR)	/* Keep descending into the IOR side.  */
+	e = XEXP (p, 1), p = XEXP (p, 0);
+      else
+	e = XEXP (p, 0), p = XEXP (p, 1);
+
+      printf ("  accum |= ");
+      write_test_expr (e, 3);	/* 3: in-comparison (bitwise) + cached attrs.  */
+      printf (";\n");
+    }
+  printf ("  accum |= ");	/* Whatever non-IOR expression is left over.  */
+  write_test_expr (p, 3);
+  printf (";\n");
+}
+
 /* Utilities to write names in various forms.  */

 static void
@ -5735,7 +5949,7 @@ from the machine description file `md'.  */\n\n");
  for (i = 0; i < MAX_ATTRS_INDEX; i++)
    for (attr = attrs[i]; attr; attr = attr->next)
      {
-	if (! attr->is_special)
+	if (! attr->is_special && ! attr->is_const)
 	  write_attr_get (attr);
      }