From 7395928b957ebb35afb696c3278d14122aa97b51 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 5 Jun 2017 07:41:14 -0700
Subject: [PATCH] x86_64: Remove redundant REX bytes from memrchr.S

By the x86-64 specification, a write to a 32-bit destination register is
zero-extended to 64 bits.  There is no need to use 64-bit registers when
only the lower 32 bits are non-zero.  Also, the 2 marked instructions in:

	mov	%rdi, %rcx
	and	$15, %rcx
	jz	L(length_less16_offset0)

	mov	%rdi, %rcx		<<< redundant
	and	$15, %rcx		<<< redundant

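are redundant.  In the length_less16 paths, the scratch copy of the
alignment offset is kept in %esi/%rsi instead of %r8, so the register
moves no longer need a REX prefix.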

	* sysdeps/x86_64/memrchr.S (__memrchr): Use 32-bit registers for
	the lower 32 bits.  Remove redundant instructions.
---
 ChangeLog                |  5 +++++
 sysdeps/x86_64/memrchr.S | 36 +++++++++++++++++-------------------
 2 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 1cbcf564e0..1549eb6422 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2017-06-05  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/x86_64/memrchr.S (__memrchr): Use 32-bit registers for
+	the lower 32 bits.  Remove redundant instructions.
+
 2017-06-05  H.J. Lu  <hongjiu.lu@intel.com>
 
 	* sysdeps/unix/sysv/linux/x86_64/sysdep.h (LO_HI_LONG): Pass
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index aab1a4a0c4..5fa0fe9c1c 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -22,7 +22,7 @@
 
 	.text
 ENTRY (__memrchr)
-	movd	%rsi, %xmm1
+	movd	%esi, %xmm1
 
 	sub	$16, %rdx
 	jbe	L(length_less16)
@@ -42,8 +42,8 @@ ENTRY (__memrchr)
 	jnz	L(matches0)
 
 	sub	$64, %rdi
-	mov	%rdi, %rcx
-	and	$15, %rcx
+	mov	%edi, %ecx
+	and	$15, %ecx
 	jz	L(loop_prolog)
 
 	add	$16, %rdi
@@ -108,8 +108,8 @@ L(loop_prolog):
 	test	%eax, %eax
 	jnz	L(matches0)
 
-	mov	%rdi, %rcx
-	and	$63, %rcx
+	mov	%edi, %ecx
+	and	$63, %ecx
 	jz	L(align64_loop)
 
 	add	$64, %rdi
@@ -166,8 +166,8 @@ L(align64_loop):
 
 	.p2align 4
 L(exit_loop):
-	add	$64, %rdx
-	cmp	$32, %rdx
+	add	$64, %edx
+	cmp	$32, %edx
 	jbe	L(exit_loop_32)
 
 	movdqa	48(%rdi), %xmm0
@@ -187,7 +187,7 @@ L(exit_loop):
 	pmovmskb	%xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches16_1)
-	cmp	$48, %rdx
+	cmp	$48, %edx
 	jbe	L(return_null)
 
 	pcmpeqb	(%rdi), %xmm1
@@ -204,7 +204,7 @@ L(exit_loop_32):
 	pmovmskb	%xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches48_1)
-	cmp	$16, %rdx
+	cmp	$16, %edx
 	jbe	L(return_null)
 
 	pcmpeqb	32(%rdi), %xmm1
@@ -276,7 +276,7 @@ L(matches48_1):
 
 	.p2align 4
 L(return_null):
-	xor	%rax, %rax
+	xor	%eax, %eax
 	ret
 
 	.p2align 4
@@ -306,18 +306,16 @@ L(length_less16):
 	punpcklbw	%xmm1, %xmm1
 	punpcklbw	%xmm1, %xmm1
 
-	add	$16, %rdx
+	add	$16, %edx
 
 	pshufd	$0, %xmm1, %xmm1
 
-	mov	%rdi, %rcx
-	and	$15, %rcx
+	mov	%edi, %ecx
+	and	$15, %ecx
 	jz	L(length_less16_offset0)
 
-	mov	%rdi, %rcx
-	and	$15, %rcx
 	mov	%cl, %dh
-	mov	%rcx, %r8
+	mov	%ecx, %esi
 	add	%dl, %dh
 	and	$-16, %rdi
 
@@ -340,7 +338,7 @@ L(length_less16):
 
 	bsr	%eax, %eax
 	add	%rdi, %rax
-	add	%r8, %rax
+	add	%rsi, %rax
 	ret
 
 	.p2align 4
@@ -362,14 +360,14 @@ L(length_less16_part2):
 	pcmpeqb	(%rdi), %xmm1
 	pmovmskb	%xmm1, %eax
 
-	mov	%r8, %rcx
+	mov	%esi, %ecx
 	sar	%cl, %eax
 	test	%eax, %eax
 	jz	L(return_null)
 
 	bsr	%eax, %eax
 	add	%rdi, %rax
-	add	%r8, %rax
+	add	%rsi, %rax
 	ret
 
 	.p2align 4