expmed.c (choose_mult_variant, [...]): New, split from...

	* expmed.c (choose_mult_variant, expand_mult_const): New, split from...
	(expand_mult): ...here.
	(extract_high_half): New, split out from expand_mult_highpart.
	(expand_highpart_optab): Likewise.  Don't clobber target prematurely.
	(expand_highpart): Evaluate the cost of a shift/add sequence,
	then see if any of the specialized optabs are cheaper.

From-SVN: r79673
commit 8efc8980df
parent d36d56001a
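Editorial note (not part of the commit): the message above is about weighing a real multiply instruction against a synthesized shift/add sequence. As a rough stand-alone illustration of that trade-off, multiplying by a constant such as 10 can be done with two shifts and one add, which is exactly the kind of sequence whose cost synth_mult/choose_mult_variant compare against the cost of a MULT rtx. The helper name mult_by_10_shift_add below is invented for this sketch.

/* Minimal sketch: multiply by 10 without a multiply instruction,
   i.e. x * 10 = ((x << 2) + x) << 1.  */
#include <stdio.h>

static long mult_by_10_shift_add (long x)
{
  long t = (x << 2) + x;   /* t = x * 5: one shift, one add.  */
  return t << 1;           /* * 2: one more shift.  */
}

int main (void)
{
  for (long x = -3; x <= 3; x++)
    printf ("%ld * 10 = %ld\n", x, mult_by_10_shift_add (x));
  return 0;
}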
gcc/ChangeLog

@@ -1,3 +1,12 @@
+2004-03-19  Richard Sandiford  <rsandifo@redhat.com>
+
+	* expmed.c (choose_mult_variant, expand_mult_const): New, split from...
+	(expand_mult): ...here.
+	(extract_high_half): New, split out from expand_mult_highpart.
+	(expand_highpart_optab): Likewise.  Don't clobber target prematurely.
+	(expand_highpart): Evaluate the cost of a shift/add sequence,
+	then see if any of the specialized optabs are cheaper.
+
 2004-03-18  Ian Lance Taylor  <ian@wasabisystems.com>
 
 	* mklibgcc.in: Remove obsolete MAYBE_USE_COLLECT2.
gcc/expmed.c

@@ -2149,11 +2149,24 @@ struct algorithm
   char log[MAX_BITS_PER_WORD];
 };
 
+/* Indicates the type of fixup needed after a constant multiplication.
+   BASIC_VARIANT means no fixup is needed, NEGATE_VARIANT means that
+   the result should be negated, and ADD_VARIANT means that the
+   multiplicand should be added to the result.  */
+enum mult_variant {basic_variant, negate_variant, add_variant};
+
 static void synth_mult (struct algorithm *, unsigned HOST_WIDE_INT, int);
+static bool choose_mult_variant (enum machine_mode, HOST_WIDE_INT,
+				 struct algorithm *, enum mult_variant *);
+static rtx expand_mult_const (enum machine_mode, rtx, HOST_WIDE_INT, rtx,
+			      const struct algorithm *, enum mult_variant);
 static unsigned HOST_WIDE_INT choose_multiplier (unsigned HOST_WIDE_INT, int,
 						 int, unsigned HOST_WIDE_INT *,
 						 int *, int *);
 static unsigned HOST_WIDE_INT invert_mod2n (unsigned HOST_WIDE_INT, int);
+static rtx extract_high_half (enum machine_mode, rtx);
+static rtx expand_mult_highpart_optab (enum machine_mode, rtx, rtx, rtx,
+				       int, int);
 
 /* Compute and return the best algorithm for multiplying by T.
    The algorithm must cost less than cost_limit
    If retval.cost >= COST_LIMIT, no algorithm was found and all
@@ -2396,6 +2409,198 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
 	  alg_out->ops * sizeof *alg_out->log);
 }
 
+/* Find the cheapeast way of multiplying a value of mode MODE by VAL.
+   Try three variations:
+
+       - a shift/add sequence based on VAL itself
+       - a shift/add sequence based on -VAL, followed by a negation
+       - a shift/add sequence based on VAL - 1, followed by an addition.
+
+   Return true if the cheapest of these is better than register
+   multiplication, describing the algorithm in *ALG and final
+   fixup in *VARIANT.  */
+
+static bool
+choose_mult_variant (enum machine_mode mode, HOST_WIDE_INT val,
+		     struct algorithm *alg, enum mult_variant *variant)
+{
+  int mult_cost;
+  struct algorithm alg2;
+  rtx reg;
+
+  reg = gen_rtx_REG (mode, FIRST_PSEUDO_REGISTER);
+  mult_cost = rtx_cost (gen_rtx_MULT (mode, reg, GEN_INT (val)), SET);
+  mult_cost = MIN (12 * add_cost, mult_cost);
+
+  *variant = basic_variant;
+  synth_mult (alg, val, mult_cost);
+
+  /* This works only if the inverted value actually fits in an
+     `unsigned int' */
+  if (HOST_BITS_PER_INT >= GET_MODE_BITSIZE (mode))
+    {
+      synth_mult (&alg2, -val, MIN (alg->cost, mult_cost) - negate_cost);
+      alg2.cost += negate_cost;
+      if (alg2.cost < alg->cost)
+	*alg = alg2, *variant = negate_variant;
+    }
+
+  /* This proves very useful for division-by-constant.  */
+  synth_mult (&alg2, val - 1, MIN (alg->cost, mult_cost) - add_cost);
+  alg2.cost += add_cost;
+  if (alg2.cost < alg->cost)
+    *alg = alg2, *variant = add_variant;
+
+  return alg->cost < mult_cost;
+}
+
+/* A subroutine of expand_mult, used for constant multiplications.
+   Multiply OP0 by VAL in mode MODE, storing the result in TARGET if
+   convenient.  Use the shift/add sequence described by ALG and apply
+   the final fixup specified by VARIANT.  */
+
+static rtx
+expand_mult_const (enum machine_mode mode, rtx op0, HOST_WIDE_INT val,
+		   rtx target, const struct algorithm *alg,
+		   enum mult_variant variant)
+{
+  HOST_WIDE_INT val_so_far;
+  rtx insn, accum, tem;
+  int opno;
+  enum machine_mode nmode;
+
+  /* op0 must be register to make mult_cost match the precomputed
+     shiftadd_cost array.  */
+  op0 = protect_from_queue (op0, 0);
+
+  /* Avoid referencing memory over and over.
+     For speed, but also for correctness when mem is volatile.  */
+  if (GET_CODE (op0) == MEM)
+    op0 = force_reg (mode, op0);
+
+  /* ACCUM starts out either as OP0 or as a zero, depending on
+     the first operation.  */
+
+  if (alg->op[0] == alg_zero)
+    {
+      accum = copy_to_mode_reg (mode, const0_rtx);
+      val_so_far = 0;
+    }
+  else if (alg->op[0] == alg_m)
+    {
+      accum = copy_to_mode_reg (mode, op0);
+      val_so_far = 1;
+    }
+  else
+    abort ();
+
+  for (opno = 1; opno < alg->ops; opno++)
+    {
+      int log = alg->log[opno];
+      int preserve = preserve_subexpressions_p ();
+      rtx shift_subtarget = preserve ? 0 : accum;
+      rtx add_target
+	= (opno == alg->ops - 1 && target != 0 && variant != add_variant
+	   && ! preserve)
+	  ? target : 0;
+      rtx accum_target = preserve ? 0 : accum;
+
+      switch (alg->op[opno])
+	{
+	case alg_shift:
+	  accum = expand_shift (LSHIFT_EXPR, mode, accum,
+				build_int_2 (log, 0), NULL_RTX, 0);
+	  val_so_far <<= log;
+	  break;
+
+	case alg_add_t_m2:
+	  tem = expand_shift (LSHIFT_EXPR, mode, op0,
+			      build_int_2 (log, 0), NULL_RTX, 0);
+	  accum = force_operand (gen_rtx_PLUS (mode, accum, tem),
+				 add_target ? add_target : accum_target);
+	  val_so_far += (HOST_WIDE_INT) 1 << log;
+	  break;
+
+	case alg_sub_t_m2:
+	  tem = expand_shift (LSHIFT_EXPR, mode, op0,
+			      build_int_2 (log, 0), NULL_RTX, 0);
+	  accum = force_operand (gen_rtx_MINUS (mode, accum, tem),
+				 add_target ? add_target : accum_target);
+	  val_so_far -= (HOST_WIDE_INT) 1 << log;
+	  break;
+
+	case alg_add_t2_m:
+	  accum = expand_shift (LSHIFT_EXPR, mode, accum,
+				build_int_2 (log, 0), shift_subtarget,
+				0);
+	  accum = force_operand (gen_rtx_PLUS (mode, accum, op0),
+				 add_target ? add_target : accum_target);
+	  val_so_far = (val_so_far << log) + 1;
+	  break;
+
+	case alg_sub_t2_m:
+	  accum = expand_shift (LSHIFT_EXPR, mode, accum,
+				build_int_2 (log, 0), shift_subtarget, 0);
+	  accum = force_operand (gen_rtx_MINUS (mode, accum, op0),
+				 add_target ? add_target : accum_target);
+	  val_so_far = (val_so_far << log) - 1;
+	  break;
+
+	case alg_add_factor:
+	  tem = expand_shift (LSHIFT_EXPR, mode, accum,
+			      build_int_2 (log, 0), NULL_RTX, 0);
+	  accum = force_operand (gen_rtx_PLUS (mode, accum, tem),
+				 add_target ? add_target : accum_target);
+	  val_so_far += val_so_far << log;
+	  break;
+
+	case alg_sub_factor:
+	  tem = expand_shift (LSHIFT_EXPR, mode, accum,
+			      build_int_2 (log, 0), NULL_RTX, 0);
+	  accum = force_operand (gen_rtx_MINUS (mode, tem, accum),
+				 (add_target ? add_target
+				  : preserve ? 0 : tem));
+	  val_so_far = (val_so_far << log) - val_so_far;
+	  break;
+
+	default:
+	  abort ();
+	}
+
+      /* Write a REG_EQUAL note on the last insn so that we can cse
+	 multiplication sequences.  Note that if ACCUM is a SUBREG,
+	 we've set the inner register and must properly indicate
+	 that.  */
+
+      tem = op0, nmode = mode;
+      if (GET_CODE (accum) == SUBREG)
+	{
+	  nmode = GET_MODE (SUBREG_REG (accum));
+	  tem = gen_lowpart (nmode, op0);
+	}
+
+      insn = get_last_insn ();
+      set_unique_reg_note (insn, REG_EQUAL,
+			   gen_rtx_MULT (nmode, tem, GEN_INT (val_so_far)));
+    }
+
+  if (variant == negate_variant)
+    {
+      val_so_far = -val_so_far;
+      accum = expand_unop (mode, neg_optab, accum, target, 0);
+    }
+  else if (variant == add_variant)
+    {
+      val_so_far = val_so_far + 1;
+      accum = force_operand (gen_rtx_PLUS (mode, accum, op0), target);
+    }
+
+  if (val != val_so_far)
+    abort ();
+
+  return accum;
+}
+
 /* Perform a multiplication and return an rtx for the result.
    MODE is mode of value; OP0 and OP1 are what to multiply (rtx's);
    TARGET is a suggestion for where to store the result (an rtx).
@@ -2409,6 +2614,8 @@ expand_mult (enum machine_mode mode, rtx op0, rtx op1, rtx target,
 	     int unsignedp)
 {
   rtx const_op1 = op1;
+  enum mult_variant variant;
+  struct algorithm algorithm;
 
   /* synth_mult does an `unsigned int' multiply.  As long as the mode is
      less than or equal in size to `unsigned int' this doesn't matter.
@@ -2435,190 +2642,10 @@ expand_mult (enum machine_mode mode, rtx op0, rtx op1, rtx target,
      that it seems better to use synth_mult always.  */
 
   if (const_op1 && GET_CODE (const_op1) == CONST_INT
-      && (unsignedp || ! flag_trapv))
-    {
-      struct algorithm alg;
-      struct algorithm alg2;
-      HOST_WIDE_INT val = INTVAL (op1);
-      HOST_WIDE_INT val_so_far;
-      rtx insn;
-      int mult_cost;
-      enum {basic_variant, negate_variant, add_variant} variant = basic_variant;
-
-      /* op0 must be register to make mult_cost match the precomputed
-	 shiftadd_cost array.  */
-      op0 = force_reg (mode, op0);
-
-      /* Try to do the computation three ways: multiply by the negative of OP1
-	 and then negate, do the multiplication directly, or do multiplication
-	 by OP1 - 1.  */
-
-      mult_cost = rtx_cost (gen_rtx_MULT (mode, op0, op1), SET);
-      mult_cost = MIN (12 * add_cost, mult_cost);
-
-      synth_mult (&alg, val, mult_cost);
-
-      /* This works only if the inverted value actually fits in an
-	 `unsigned int' */
-      if (HOST_BITS_PER_INT >= GET_MODE_BITSIZE (mode))
-	{
-	  synth_mult (&alg2, - val,
-		      (alg.cost < mult_cost ? alg.cost : mult_cost) - negate_cost);
-	  if (alg2.cost + negate_cost < alg.cost)
-	    alg = alg2, variant = negate_variant;
-	}
-
-      /* This proves very useful for division-by-constant.  */
-      synth_mult (&alg2, val - 1,
-		  (alg.cost < mult_cost ? alg.cost : mult_cost) - add_cost);
-      if (alg2.cost + add_cost < alg.cost)
-	alg = alg2, variant = add_variant;
-
-      if (alg.cost < mult_cost)
-	{
-	  /* We found something cheaper than a multiply insn.  */
-	  int opno;
-	  rtx accum, tem;
-	  enum machine_mode nmode;
-
-	  op0 = protect_from_queue (op0, 0);
-
-	  /* Avoid referencing memory over and over.
-	     For speed, but also for correctness when mem is volatile.  */
-	  if (GET_CODE (op0) == MEM)
-	    op0 = force_reg (mode, op0);
-
-	  /* ACCUM starts out either as OP0 or as a zero, depending on
-	     the first operation.  */
-
-	  if (alg.op[0] == alg_zero)
-	    {
-	      accum = copy_to_mode_reg (mode, const0_rtx);
-	      val_so_far = 0;
-	    }
-	  else if (alg.op[0] == alg_m)
-	    {
-	      accum = copy_to_mode_reg (mode, op0);
-	      val_so_far = 1;
-	    }
-	  else
-	    abort ();
-
-	  for (opno = 1; opno < alg.ops; opno++)
-	    {
-	      int log = alg.log[opno];
-	      int preserve = preserve_subexpressions_p ();
-	      rtx shift_subtarget = preserve ? 0 : accum;
-	      rtx add_target
-		= (opno == alg.ops - 1 && target != 0 && variant != add_variant
-		   && ! preserve)
-		  ? target : 0;
-	      rtx accum_target = preserve ? 0 : accum;
-
-	      switch (alg.op[opno])
-		{
-		case alg_shift:
-		  accum = expand_shift (LSHIFT_EXPR, mode, accum,
-					build_int_2 (log, 0), NULL_RTX, 0);
-		  val_so_far <<= log;
-		  break;
-
-		case alg_add_t_m2:
-		  tem = expand_shift (LSHIFT_EXPR, mode, op0,
-				      build_int_2 (log, 0), NULL_RTX, 0);
-		  accum = force_operand (gen_rtx_PLUS (mode, accum, tem),
-					 add_target
-					 ? add_target : accum_target);
-		  val_so_far += (HOST_WIDE_INT) 1 << log;
-		  break;
-
-		case alg_sub_t_m2:
-		  tem = expand_shift (LSHIFT_EXPR, mode, op0,
-				      build_int_2 (log, 0), NULL_RTX, 0);
-		  accum = force_operand (gen_rtx_MINUS (mode, accum, tem),
-					 add_target
-					 ? add_target : accum_target);
-		  val_so_far -= (HOST_WIDE_INT) 1 << log;
-		  break;
-
-		case alg_add_t2_m:
-		  accum = expand_shift (LSHIFT_EXPR, mode, accum,
-					build_int_2 (log, 0), shift_subtarget,
-					0);
-		  accum = force_operand (gen_rtx_PLUS (mode, accum, op0),
-					 add_target
-					 ? add_target : accum_target);
-		  val_so_far = (val_so_far << log) + 1;
-		  break;
-
-		case alg_sub_t2_m:
-		  accum = expand_shift (LSHIFT_EXPR, mode, accum,
-					build_int_2 (log, 0), shift_subtarget,
-					0);
-		  accum = force_operand (gen_rtx_MINUS (mode, accum, op0),
-					 add_target
-					 ? add_target : accum_target);
-		  val_so_far = (val_so_far << log) - 1;
-		  break;
-
-		case alg_add_factor:
-		  tem = expand_shift (LSHIFT_EXPR, mode, accum,
-				      build_int_2 (log, 0), NULL_RTX, 0);
-		  accum = force_operand (gen_rtx_PLUS (mode, accum, tem),
-					 add_target
-					 ? add_target : accum_target);
-		  val_so_far += val_so_far << log;
-		  break;
-
-		case alg_sub_factor:
-		  tem = expand_shift (LSHIFT_EXPR, mode, accum,
-				      build_int_2 (log, 0), NULL_RTX, 0);
-		  accum = force_operand (gen_rtx_MINUS (mode, tem, accum),
-					 (add_target ? add_target
-					  : preserve ? 0 : tem));
-		  val_so_far = (val_so_far << log) - val_so_far;
-		  break;
-
-		default:
-		  abort ();
-		}
-
-	      /* Write a REG_EQUAL note on the last insn so that we can cse
-		 multiplication sequences.  Note that if ACCUM is a SUBREG,
-		 we've set the inner register and must properly indicate
-		 that.  */
-
-	      tem = op0, nmode = mode;
-	      if (GET_CODE (accum) == SUBREG)
-		{
-		  nmode = GET_MODE (SUBREG_REG (accum));
-		  tem = gen_lowpart (nmode, op0);
-		}
-
-	      insn = get_last_insn ();
-	      set_unique_reg_note (insn,
-				   REG_EQUAL,
-				   gen_rtx_MULT (nmode, tem,
-						 GEN_INT (val_so_far)));
-	    }
-
-	  if (variant == negate_variant)
-	    {
-	      val_so_far = - val_so_far;
-	      accum = expand_unop (mode, neg_optab, accum, target, 0);
-	    }
-	  else if (variant == add_variant)
-	    {
-	      val_so_far = val_so_far + 1;
-	      accum = force_operand (gen_rtx_PLUS (mode, accum, op0), target);
-	    }
-
-	  if (val != val_so_far)
-	    abort ();
-
-	  return accum;
-	}
-    }
+      && (unsignedp || !flag_trapv)
+      && choose_mult_variant (mode, INTVAL (const_op1), &algorithm, &variant))
+    return expand_mult_const (mode, op0, INTVAL (const_op1), target,
+			      &algorithm, variant);
 
   if (GET_CODE (op0) == CONST_DOUBLE)
     {
@@ -2832,6 +2859,108 @@ expand_mult_highpart_adjust (enum machine_mode mode, rtx adj_operand, rtx op0,
   return target;
 }
 
+/* Subroutine of expand_mult_highpart.  Return the MODE high part of OP.  */
+
+static rtx
+extract_high_half (enum machine_mode mode, rtx op)
+{
+  enum machine_mode wider_mode;
+
+  if (mode == word_mode)
+    return gen_highpart (mode, op);
+
+  wider_mode = GET_MODE_WIDER_MODE (mode);
+  op = expand_shift (RSHIFT_EXPR, wider_mode, op,
+		     build_int_2 (GET_MODE_BITSIZE (mode), 0), 0, 1);
+  return convert_modes (mode, wider_mode, op, 0);
+}
+
+/* Like expand_mult_highpart, but only consider using a multiplication
+   optab.  OP1 is an rtx for the constant operand.  */
+
+static rtx
+expand_mult_highpart_optab (enum machine_mode mode, rtx op0, rtx op1,
+			    rtx target, int unsignedp, int max_cost)
+{
+  enum machine_mode wider_mode;
+  optab moptab;
+  rtx tem;
+  int size;
+
+  wider_mode = GET_MODE_WIDER_MODE (mode);
+  size = GET_MODE_BITSIZE (mode);
+
+  /* Firstly, try using a multiplication insn that only generates the needed
+     high part of the product, and in the sign flavor of unsignedp.  */
+  if (mul_highpart_cost[(int) mode] < max_cost)
+    {
+      moptab = unsignedp ? umul_highpart_optab : smul_highpart_optab;
+      tem = expand_binop (mode, moptab, op0, op1, target,
+			  unsignedp, OPTAB_DIRECT);
+      if (tem)
+	return tem;
+    }
+
+  /* Secondly, same as above, but use sign flavor opposite of unsignedp.
+     Need to adjust the result after the multiplication.  */
+  if (size - 1 < BITS_PER_WORD
+      && (mul_highpart_cost[(int) mode] + 2 * shift_cost[size-1] + 4 * add_cost
+	  < max_cost))
+    {
+      moptab = unsignedp ? smul_highpart_optab : umul_highpart_optab;
+      tem = expand_binop (mode, moptab, op0, op1, target,
+			  unsignedp, OPTAB_DIRECT);
+      if (tem)
+	/* We used the wrong signedness.  Adjust the result.  */
+	return expand_mult_highpart_adjust (mode, tem, op0, op1,
+					    tem, unsignedp);
+    }
+
+  /* Try widening multiplication.  */
+  moptab = unsignedp ? umul_widen_optab : smul_widen_optab;
+  if (moptab->handlers[(int) wider_mode].insn_code != CODE_FOR_nothing
+      && mul_widen_cost[(int) wider_mode] < max_cost)
+    {
+      tem = expand_binop (wider_mode, moptab, op0, op1, 0,
+			  unsignedp, OPTAB_WIDEN);
+      if (tem)
+	return extract_high_half (mode, tem);
+    }
+
+  /* Try widening the mode and perform a non-widening multiplication.  */
+  moptab = smul_optab;
+  if (smul_optab->handlers[(int) wider_mode].insn_code != CODE_FOR_nothing
+      && size - 1 < BITS_PER_WORD
+      && mul_cost[(int) wider_mode] + shift_cost[size-1] < max_cost)
+    {
+      tem = expand_binop (wider_mode, moptab, op0, op1, 0,
+			  unsignedp, OPTAB_WIDEN);
+      if (tem)
+	return extract_high_half (mode, tem);
+    }
+
+  /* Try widening multiplication of opposite signedness, and adjust.  */
+  moptab = unsignedp ? smul_widen_optab : umul_widen_optab;
+  if (moptab->handlers[(int) wider_mode].insn_code != CODE_FOR_nothing
+      && size - 1 < BITS_PER_WORD
+      && (mul_widen_cost[(int) wider_mode]
+	  + 2 * shift_cost[size-1] + 4 * add_cost < max_cost))
+    {
+      rtx regop1 = force_reg (mode, op1);
+      tem = expand_binop (wider_mode, moptab, op0, regop1,
+			  NULL_RTX, ! unsignedp, OPTAB_WIDEN);
+      if (tem != 0)
+	{
+	  tem = extract_high_half (mode, tem);
+	  /* We used the wrong signedness.  Adjust the result.  */
+	  return expand_mult_highpart_adjust (mode, tem, op0, op1,
+					      target, unsignedp);
+	}
+    }
+
+  return 0;
+}
+
 /* Emit code to multiply OP0 and CNST1, putting the high half of the result
    in TARGET if that is convenient, and return where the result is.  If the
    operation can not be performed, 0 is returned.
@@ -2847,133 +2976,35 @@ expand_mult_highpart (enum machine_mode mode, rtx op0,
 		      unsigned HOST_WIDE_INT cnst1, rtx target,
 		      int unsignedp, int max_cost)
 {
-  enum machine_mode wider_mode = GET_MODE_WIDER_MODE (mode);
-  optab mul_highpart_optab;
-  optab moptab;
-  rtx tem;
-  int size = GET_MODE_BITSIZE (mode);
-  rtx op1, wide_op1;
+  enum machine_mode wider_mode;
+  enum mult_variant variant;
+  struct algorithm alg;
+  rtx op1, tem;
 
   /* We can't support modes wider than HOST_BITS_PER_INT.  */
-  if (size > HOST_BITS_PER_WIDE_INT)
+  if (GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT)
     abort ();
 
   op1 = gen_int_mode (cnst1, mode);
 
-  wide_op1
-    = immed_double_const (cnst1,
-			  (unsignedp
-			   ? (HOST_WIDE_INT) 0
-			   : -(cnst1 >> (HOST_BITS_PER_WIDE_INT - 1))),
-			  wider_mode);
-
-  /* expand_mult handles constant multiplication of word_mode
-     or narrower.  It does a poor job for large modes.  */
-  if (size < BITS_PER_WORD
-      && mul_cost[(int) wider_mode] + shift_cost[size-1] < max_cost)
+  /* See whether shift/add multiplication is cheap enough.  */
+  if (choose_mult_variant (mode, cnst1, &alg, &variant)
+      && (alg.cost += shift_cost[GET_MODE_BITSIZE (mode) - 1]) < max_cost)
     {
-      /* We have to do this, since expand_binop doesn't do conversion for
-	 multiply.  Maybe change expand_binop to handle widening multiply?  */
+      /* See whether the specialized multiplication optabs are
+	 cheaper than the shift/add version.  */
+      tem = expand_mult_highpart_optab (mode, op0, op1, target,
+					unsignedp, alg.cost);
+      if (tem)
+	return tem;
+
+      wider_mode = GET_MODE_WIDER_MODE (mode);
       op0 = convert_to_mode (wider_mode, op0, unsignedp);
-
-      /* We know that this can't have signed overflow, so pretend this is
-         an unsigned multiply.  */
-      tem = expand_mult (wider_mode, op0, wide_op1, NULL_RTX, 0);
-      tem = expand_shift (RSHIFT_EXPR, wider_mode, tem,
-			  build_int_2 (size, 0), NULL_RTX, 1);
-      return convert_modes (mode, wider_mode, tem, unsignedp);
+      tem = expand_mult_const (wider_mode, op0, cnst1, 0, &alg, variant);
+      return extract_high_half (mode, tem);
     }
-
-  if (target == 0)
-    target = gen_reg_rtx (mode);
-
-  /* Firstly, try using a multiplication insn that only generates the needed
-     high part of the product, and in the sign flavor of unsignedp.  */
-  if (mul_highpart_cost[(int) mode] < max_cost)
-    {
-      mul_highpart_optab = unsignedp ? umul_highpart_optab : smul_highpart_optab;
-      target = expand_binop (mode, mul_highpart_optab,
-			     op0, op1, target, unsignedp, OPTAB_DIRECT);
-      if (target)
-	return target;
-    }
-
-  /* Secondly, same as above, but use sign flavor opposite of unsignedp.
-     Need to adjust the result after the multiplication.  */
-  if (size - 1 < BITS_PER_WORD
-      && (mul_highpart_cost[(int) mode] + 2 * shift_cost[size-1] + 4 * add_cost
-	  < max_cost))
-    {
-      mul_highpart_optab = unsignedp ? smul_highpart_optab : umul_highpart_optab;
-      target = expand_binop (mode, mul_highpart_optab,
-			     op0, op1, target, unsignedp, OPTAB_DIRECT);
-      if (target)
-	/* We used the wrong signedness.  Adjust the result.  */
-	return expand_mult_highpart_adjust (mode, target, op0,
-					    op1, target, unsignedp);
-    }
-
-  /* Try widening multiplication.  */
-  moptab = unsignedp ? umul_widen_optab : smul_widen_optab;
-  if (moptab->handlers[(int) wider_mode].insn_code != CODE_FOR_nothing
-      && mul_widen_cost[(int) wider_mode] < max_cost)
-    {
-      op1 = force_reg (mode, op1);
-      goto try;
-    }
-
-  /* Try widening the mode and perform a non-widening multiplication.  */
-  moptab = smul_optab;
-  if (smul_optab->handlers[(int) wider_mode].insn_code != CODE_FOR_nothing
-      && size - 1 < BITS_PER_WORD
-      && mul_cost[(int) wider_mode] + shift_cost[size-1] < max_cost)
-    {
-      op1 = wide_op1;
-      goto try;
-    }
-
-  /* Try widening multiplication of opposite signedness, and adjust.  */
-  moptab = unsignedp ? smul_widen_optab : umul_widen_optab;
-  if (moptab->handlers[(int) wider_mode].insn_code != CODE_FOR_nothing
-      && size - 1 < BITS_PER_WORD
-      && (mul_widen_cost[(int) wider_mode]
-	  + 2 * shift_cost[size-1] + 4 * add_cost < max_cost))
-    {
-      rtx regop1 = force_reg (mode, op1);
-      tem = expand_binop (wider_mode, moptab, op0, regop1,
-			  NULL_RTX, ! unsignedp, OPTAB_WIDEN);
-      if (tem != 0)
-	{
-	  /* Extract the high half of the just generated product.  */
-	  tem = expand_shift (RSHIFT_EXPR, wider_mode, tem,
-			      build_int_2 (size, 0), NULL_RTX, 1);
-	  tem = convert_modes (mode, wider_mode, tem, unsignedp);
-	  /* We used the wrong signedness.  Adjust the result.  */
-	  return expand_mult_highpart_adjust (mode, tem, op0, op1,
-					      target, unsignedp);
-	}
-    }
-
-  return 0;
-
- try:
-  /* Pass NULL_RTX as target since TARGET has wrong mode.  */
-  tem = expand_binop (wider_mode, moptab, op0, op1,
-		      NULL_RTX, unsignedp, OPTAB_WIDEN);
-  if (tem == 0)
-    return 0;
-
-  /* Extract the high half of the just generated product.  */
-  if (mode == word_mode)
-    {
-      return gen_highpart (mode, tem);
-    }
-  else
-    {
-      tem = expand_shift (RSHIFT_EXPR, wider_mode, tem,
-			  build_int_2 (size, 0), NULL_RTX, 1);
-      return convert_modes (mode, wider_mode, tem, unsignedp);
-    }
+  return expand_mult_highpart_optab (mode, op0, op1, target,
+				     unsignedp, max_cost);
 }
 
 /* Emit the code to divide OP0 by OP1, putting the result in TARGET