diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 8d776d5ecd1..83328632961 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,14 @@
+2006-10-29  Richard Guenther  <rguenther@suse.de>
+
+	* config/i386/i386-protos.h (ix86_expand_round): Declare.
+	(ix86_expand_rounddf_32): Likewise.
+	* config/i386/i386.c (ix86_expand_round): New function expanding
+	round inline for SSE math with -fno-trapping-math and
+	-fno-rounding-math, if not optimizing for size.
+	(ix86_expand_rounddf_32): Same for DFmode on 32bit archs.
+	* config/i386/i386.md (rounddf2, roundsf2): New patterns expanding
+	round via ix86_expand_round and ix86_expand_rounddf_32.
+
 2006-10-29  Richard Guenther  <rguenther@suse.de>
 
 	* config/i386/i386-protos.h (ix86_expand_floorceil): Declare.
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index a6f760cc686..6393f94a703 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -162,6 +162,8 @@ extern void ix86_expand_lfloorceil (rtx, rtx, bool);
 extern void ix86_expand_rint (rtx, rtx);
 extern void ix86_expand_floorceil (rtx, rtx, bool);
 extern void ix86_expand_floorceildf_32 (rtx, rtx, bool);
+extern void ix86_expand_round (rtx, rtx);
+extern void ix86_expand_rounddf_32 (rtx, rtx);
 
 #ifdef TREE_CODE
 extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 6a125da2525..0a36e602661 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19563,4 +19563,129 @@ ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
   emit_move_insn (operand0, res);
 }
 
+/* Expand SSE sequence for computing round from OPERAND1 storing
+   into OPERAND0.  This sequence avoids the DImode truncation via
+   cvttsd2siq, which is only available on 64bit targets.  */
+void
+ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
+{
+  /* C code for the stuff we expand below.
+        double xa = fabs (x), xa2, dxa, x2;
+        if (!isless (xa, TWO52))
+          return x;
+     Using the absolute value and copying back sign makes
+     -0.0 -> -0.0 correct.
+        xa2 = xa + TWO52 - TWO52;
+     Compensate.
+        dxa = xa2 - xa;
+        if (dxa <= -0.5)
+          xa2 += 1;
+        else if (dxa > 0.5)
+          xa2 -= 1;
+        x2 = copysign (xa2, x);
+        return x2;
+   */
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
+
+  TWO52 = ix86_gen_TWO52 (mode);
+
+  /* Temporary for holding the result, initialized to the input
+     operand to ease control flow.  */
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+
+  /* xa = abs (operand1) */
+  xa = ix86_expand_sse_fabs (res, &mask);
+
+  /* if (!isless (xa, TWO52)) goto label; */
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  /* The target passed to expand_simple_binop is only a suggestion, so
+     always continue with the rtx it returns.  */
+
+  /* xa2 = xa + TWO52 - TWO52; */
+  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
+  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
+
+  /* dxa = xa2 - xa; */
+  dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* generate 0.5, 1.0 and -0.5 */
+  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
+  one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
+  mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
+                               0, OPTAB_DIRECT);
+
+  /* Compensate.  */
+  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
+  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
+  emit_insn (gen_rtx_SET (VOIDmode, tmp,
+                          gen_rtx_AND (mode, one, tmp)));
+  xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, xa2, 0, OPTAB_DIRECT);
+  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
+  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
+  emit_insn (gen_rtx_SET (VOIDmode, tmp,
+                          gen_rtx_AND (mode, one, tmp)));
+  xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, xa2, 0, OPTAB_DIRECT);
+
+  /* res = copysign (xa2, operand1) */
+  ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
+
+  /* Target of the !isless jump; res was preset to the input for it.  */
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
+
+/* Expand SSE sequence for computing round from OPERAND1 storing
+   into OPERAND0, using a float -> integer -> float round trip.  */
+void
+ix86_expand_round (rtx operand0, rtx operand1)
+{
+  /* C code for the stuff we're doing below:
+        double xa = fabs (x);
+        if (!isless (xa, TWO52))
+          return x;
+        xa = (double)(long)(xa + nextafter (0.5, 0.0));
+        return copysign (xa, x);
+   */
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx res, TWO52, xa, label, xi, half, mask;
+  const struct real_format *fmt;
+  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
+
+  /* Temporary for holding the result, initialized to the input
+     operand to ease control flow.  */
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+
+  TWO52 = ix86_gen_TWO52 (mode);
+  xa = ix86_expand_sse_fabs (res, &mask);
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  /* load nextafter (0.5, 0.0), computed as 0.5 - 2**(-p - 1) */
+  fmt = REAL_MODE_FORMAT (mode);
+  real_2expN (&half_minus_pred_half, -(fmt->p) - 1);
+  REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
+
+  /* xa = xa + nextafter (0.5, 0.0); the target is only a suggestion.  */
+  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
+  xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* xa = (double)(long)xa, via DImode for DFmode and SImode otherwise */
+  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
+  expand_fix (xi, xa, 0);
+  expand_float (xa, xi, 0);
+
+  /* res = copysign (xa, operand1) */
+  ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
+
 #include "gt-i386.h"
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 99ab0db3f8f..a274597b96c 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -17217,6 +17217,31 @@
   DONE;
 })
 
+(define_expand "roundsf2"
+  [(match_operand:SF 0 "register_operand" "")
+   (match_operand:SF 1 "nonimmediate_operand" "")]
+  "SSE_FLOAT_MODE_P (SFmode) && TARGET_SSE_MATH
+   && !flag_trapping_math && !flag_rounding_math
+   && !optimize_size"
+{
+  ix86_expand_round (operand0, operand1);
+  DONE;
+})
+
+(define_expand "rounddf2"
+  [(match_operand:DF 0 "register_operand" "")
+   (match_operand:DF 1 "nonimmediate_operand" "")]
+  "SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH
+   && !flag_trapping_math && !flag_rounding_math
+   && !optimize_size"
+{
+  if (TARGET_64BIT)
+    ix86_expand_round (operand0, operand1);
+  else
+    ix86_expand_rounddf_32 (operand0, operand1);
+  DONE;
+})
+
 (define_insn_and_split "*fistdi2_1"
   [(set (match_operand:DI 0 "nonimmediate_operand" "=m,?r")
 	(unspec:DI [(match_operand:XF 1 "register_operand" "f,f")]
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 55d7f956939..6231f078fb2 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2006-10-29  Richard Guenther  <rguenther@suse.de>
+
+	* gcc.target/i386/math-torture/round.c: New testcase.
+
 2006-10-29  Richard Guenther  <rguenther@suse.de>
 
 	* gcc.target/i386/math-torture/ceil.c: New testcase.
diff --git a/gcc/testsuite/gcc.target/i386/math-torture/round.c b/gcc/testsuite/gcc.target/i386/math-torture/round.c
new file mode 100644
index 00000000000..fddac7abbe5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/math-torture/round.c
@@ -0,0 +1,15 @@
+/* { dg-do assemble } */
+
+float testlf (float x)
+{
+  return __builtin_roundf (x);
+}
+double testl (double x)
+{
+  return __builtin_round (x);
+}
+long double testll (long double x)
+{
+  return __builtin_roundl (x);
+}
+