From 7395928b957ebb35afb696c3278d14122aa97b51 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 5 Jun 2017 07:41:14 -0700 Subject: [PATCH] x86_64: Remove redundant REX bytes from memrchr.S Per the x86-64 specification, a write to a 32-bit destination register is zero-extended to 64 bits, so there is no need to use 64-bit registers when only the lower 32 bits are non-zero. Also, the last 2 instructions in: mov %rdi, %rcx and $15, %rcx jz L(length_less16_offset0) mov %rdi, %rcx <<< redundant and $15, %rcx <<< redundant are redundant, since they recompute the value that is already in %rcx. * sysdeps/x86_64/memrchr.S (__memrchr): Use 32-bit registers for the lower 32 bits. Remove redundant instructions. --- ChangeLog | 5 +++++ sysdeps/x86_64/memrchr.S | 36 +++++++++++++++++------------------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/ChangeLog b/ChangeLog index 1cbcf564e0..1549eb6422 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2017-06-05 H.J. Lu + + * sysdeps/x86_64/memrchr.S (__memrchr): Use 32-bit registers for + the lower 32 bits. Remove redundant instructions. + 2017-06-05 H.J. 
Lu * sysdeps/unix/sysv/linux/x86_64/sysdep.h (LO_HI_LONG): Pass diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S index aab1a4a0c4..5fa0fe9c1c 100644 --- a/sysdeps/x86_64/memrchr.S +++ b/sysdeps/x86_64/memrchr.S @@ -22,7 +22,7 @@ .text ENTRY (__memrchr) - movd %rsi, %xmm1 + movd %esi, %xmm1 sub $16, %rdx jbe L(length_less16) @@ -42,8 +42,8 @@ ENTRY (__memrchr) jnz L(matches0) sub $64, %rdi - mov %rdi, %rcx - and $15, %rcx + mov %edi, %ecx + and $15, %ecx jz L(loop_prolog) add $16, %rdi @@ -108,8 +108,8 @@ L(loop_prolog): test %eax, %eax jnz L(matches0) - mov %rdi, %rcx - and $63, %rcx + mov %edi, %ecx + and $63, %ecx jz L(align64_loop) add $64, %rdi @@ -166,8 +166,8 @@ L(align64_loop): .p2align 4 L(exit_loop): - add $64, %rdx - cmp $32, %rdx + add $64, %edx + cmp $32, %edx jbe L(exit_loop_32) movdqa 48(%rdi), %xmm0 @@ -187,7 +187,7 @@ L(exit_loop): pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches16_1) - cmp $48, %rdx + cmp $48, %edx jbe L(return_null) pcmpeqb (%rdi), %xmm1 @@ -204,7 +204,7 @@ L(exit_loop_32): pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches48_1) - cmp $16, %rdx + cmp $16, %edx jbe L(return_null) pcmpeqb 32(%rdi), %xmm1 @@ -276,7 +276,7 @@ L(matches48_1): .p2align 4 L(return_null): - xor %rax, %rax + xor %eax, %eax ret .p2align 4 @@ -306,18 +306,16 @@ L(length_less16): punpcklbw %xmm1, %xmm1 punpcklbw %xmm1, %xmm1 - add $16, %rdx + add $16, %edx pshufd $0, %xmm1, %xmm1 - mov %rdi, %rcx - and $15, %rcx + mov %edi, %ecx + and $15, %ecx jz L(length_less16_offset0) - mov %rdi, %rcx - and $15, %rcx mov %cl, %dh - mov %rcx, %r8 + mov %ecx, %esi add %dl, %dh and $-16, %rdi @@ -340,7 +338,7 @@ L(length_less16): bsr %eax, %eax add %rdi, %rax - add %r8, %rax + add %rsi, %rax ret .p2align 4 @@ -362,14 +360,14 @@ L(length_less16_part2): pcmpeqb (%rdi), %xmm1 pmovmskb %xmm1, %eax - mov %r8, %rcx + mov %esi, %ecx sar %cl, %eax test %eax, %eax jz L(return_null) bsr %eax, %eax add %rdi, %rax - add %r8, %rax + add %rsi, %rax ret .p2align 4