x86_64: Remove redundant REX bytes from memrchr.S

By the x86-64 specification, writes to 32-bit destination registers are
zero-extended to 64 bits, so there is no need for 64-bit instructions,
and the extra REX prefix byte they carry, when only the lower 32 bits
are significant.  Also 2 instructions in:

	mov	%rdi, %rcx
	and	$15, %rcx
	jz	L(length_less16_offset0)

	mov	%rdi, %rcx		<<< redundant
	and	$15, %rcx		<<< redundant

are redundant: when the jz above falls through, %rcx already holds
%rdi & 15.
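
For illustration (this example is not part of the original change
description), the REX.W prefix is exactly the byte being saved;
encodings per the Intel SDM:

	48 89 f9	mov	%rdi, %rcx	# REX.W prefix (0x48) + opcode
	89 f9		mov	%edi, %ecx	# zero-extends into %rcx; 1 byte shorter
	48 83 e1 0f	and	$15, %rcx
	83 e1 0f	and	$15, %ecx	# same result, bits 63:32 already zero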

	* sysdeps/x86_64/memrchr.S (__memrchr): Use 32-bit registers for
	the lower 32 bits.  Remove redundant instructions.

Commit 7395928b95 (parent d8a7d10324)
Author: H.J. Lu <hongjiu.lu@intel.com>
Date:   2017-06-05 07:41:14 -07:00

2 changed files with 22 additions and 19 deletions

ChangeLog

@@ -1,3 +1,8 @@
+2017-06-05  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/x86_64/memrchr.S (__memrchr): Use 32-bit registers for
+	the lower 32 bits.  Remove redundant instructions.
+
 2017-06-05  H.J. Lu  <hongjiu.lu@intel.com>
 
 	* sysdeps/unix/sysv/linux/x86_64/sysdep.h (LO_HI_LONG): Pass

sysdeps/x86_64/memrchr.S

@@ -22,7 +22,7 @@
 	.text
 ENTRY (__memrchr)
-	movd	%rsi, %xmm1
+	movd	%esi, %xmm1
 	sub	$16, %rdx
 	jbe	L(length_less16)
@@ -42,8 +42,8 @@ ENTRY (__memrchr)
 	jnz	L(matches0)
 	sub	$64, %rdi
-	mov	%rdi, %rcx
-	and	$15, %rcx
+	mov	%edi, %ecx
+	and	$15, %ecx
 	jz	L(loop_prolog)
 	add	$16, %rdi
@@ -108,8 +108,8 @@ L(loop_prolog):
 	test	%eax, %eax
 	jnz	L(matches0)
-	mov	%rdi, %rcx
-	and	$63, %rcx
+	mov	%edi, %ecx
+	and	$63, %ecx
 	jz	L(align64_loop)
 	add	$64, %rdi
@@ -166,8 +166,8 @@ L(align64_loop):
 	.p2align 4
 L(exit_loop):
-	add	$64, %rdx
-	cmp	$32, %rdx
+	add	$64, %edx
+	cmp	$32, %edx
 	jbe	L(exit_loop_32)
 	movdqa	48(%rdi), %xmm0
@@ -187,7 +187,7 @@ L(exit_loop):
 	pmovmskb	%xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches16_1)
-	cmp	$48, %rdx
+	cmp	$48, %edx
 	jbe	L(return_null)
 	pcmpeqb	(%rdi), %xmm1
@@ -204,7 +204,7 @@ L(exit_loop_32):
 	pmovmskb	%xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches48_1)
-	cmp	$16, %rdx
+	cmp	$16, %edx
 	jbe	L(return_null)
 	pcmpeqb	32(%rdi), %xmm1
@@ -276,7 +276,7 @@ L(matches48_1):
 	.p2align 4
 L(return_null):
-	xor	%rax, %rax
+	xor	%eax, %eax
 	ret
 	.p2align 4
@@ -306,18 +306,16 @@ L(length_less16):
 	punpcklbw	%xmm1, %xmm1
 	punpcklbw	%xmm1, %xmm1
-	add	$16, %rdx
+	add	$16, %edx
 	pshufd	$0, %xmm1, %xmm1
-	mov	%rdi, %rcx
-	and	$15, %rcx
+	mov	%edi, %ecx
+	and	$15, %ecx
 	jz	L(length_less16_offset0)
-	mov	%rdi, %rcx
-	and	$15, %rcx
 	mov	%cl, %dh
-	mov	%rcx, %r8
+	mov	%ecx, %esi
 	add	%dl, %dh
 	and	$-16, %rdi
@@ -340,7 +338,7 @@ L(length_less16):
 	bsr	%eax, %eax
 	add	%rdi, %rax
-	add	%r8, %rax
+	add	%rsi, %rax
 	ret
 	.p2align 4
@@ -362,14 +360,14 @@ L(length_less16_part2):
 	pcmpeqb	(%rdi), %xmm1
 	pmovmskb	%xmm1, %eax
-	mov	%r8, %rcx
+	mov	%esi, %ecx
 	sar	%cl, %eax
 	test	%eax, %eax
 	jz	L(return_null)
 	bsr	%eax, %eax
 	add	%rdi, %rax
-	add	%r8, %rax
+	add	%rsi, %rax
 	ret
 	.p2align 4
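
Illustrative sketch (not part of the commit): the rewrite is safe because
a 32-bit register write always clears bits 63:32 of the full register, so
the REX-free forms compute the same values:

	mov	$-1, %rcx		# %rcx = 0xffffffffffffffff
	mov	%edi, %ecx		# 32-bit write: bits 63:32 of %rcx become 0
	and	$15, %ecx		# same result as the old and $15, %rcx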