memcmp implementation for x86-64 using SSE2.
This commit is contained in:
parent
ca419225a3
commit
e26c9b8415
@ -1,3 +1,7 @@
|
||||
2009-07-15 H.J. Lu <hongjiu.lu@intel.com>
|
||||
|
||||
* sysdeps/x86_64/memcmp.S: New file.
|
||||
|
||||
2009-07-15 Ulrich Drepper <drepper@redhat.com>
|
||||
|
||||
* sysdeps/x86-64/dl-trampoline.h: Remove after integrating code into...
|
||||
|
359
sysdeps/x86_64/memcmp.S
Normal file
359
sysdeps/x86_64/memcmp.S
Normal file
@ -0,0 +1,359 @@
|
||||
/* memcmp with SSE2
|
||||
Copyright (C) 2009 Free Software Foundation, Inc.
|
||||
Contributed by Intel Corporation.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
||||
02111-1307 USA. */
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.text
|
||||
/* int memcmp (const void *s1, const void *s2, size_t n)

   In:	%rdi = s1, %rsi = s2, %rdx = n.
   Out:	%eax = 0 if the first n bytes are equal, otherwise the
	(zero-extended) byte of s1 minus the byte of s2 at the first
	position where they differ.
   Clobbers: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r10, %r11,
	%xmm0, %xmm1, flags.  No stack is used.

   Register roles once past the n <= 1 check:
	%rdi	current position in s1
	%rsi	s2 - s1, so (%rdi, %rsi) addresses the matching s2 byte
	%r10	bytes remaining (small path) or loop end address (big paths)
	%r11	s1 + n, i.e. one past the last s1 byte (big paths)

   n and all addresses are unsigned quantities, so every comparison on
   them uses the unsigned condition codes (jbe/jae).  Signed branches
   would treat a length with the top bit set as "<= 1" and compare only
   a single byte.

   16-byte compare idiom used throughout: pcmpeqb/pmovmskb produce a
   16-bit mask with a 1 for every equal byte.  Subtracting 0xffff yields
   zero iff all 16 bytes match; otherwise the lowest set bit of the
   result is the index of the first mismatching byte (used by bsf).  */
ENTRY (memcmp)
	test	%rdx, %rdx
	jz	L(finz)
	cmpq	$1, %rdx
	jbe	L(finr1b)		/* n == 1 (unsigned compare).  */
	subq	%rdi, %rsi		/* %rsi = s2 - s1 from here on.  */
	movq	%rdx, %r10
	cmpq	$32, %r10
	jae	L(gt32)			/* n >= 32 (unsigned compare).  */
	/* Handle small chunks and last block of less than 32 bytes.
	   %r10 is in [1, 31]; consume it one set bit at a time
	   (1, 2, 4, 8, then 16 bytes).  */
L(small):
	testq	$1, %r10
	jz	L(s2b)
	movzbl	(%rdi), %eax
	movzbl	(%rdi, %rsi), %edx
	subq	$1, %r10
	je	L(finz1)		/* That was the last byte.  */
	addq	$1, %rdi
	subl	%edx, %eax
	jnz	L(exit)
L(s2b):
	testq	$2, %r10
	jz	L(s4b)
	movzwl	(%rdi), %eax
	movzwl	(%rdi, %rsi), %edx
	subq	$2, %r10
	je	L(fin2_7)		/* Last chunk: fin2_7 handles equal too.  */
	addq	$2, %rdi
	cmpl	%edx, %eax
	jnz	L(fin2_7)
L(s4b):
	testq	$4, %r10
	jz	L(s8b)
	movl	(%rdi), %eax
	movl	(%rdi, %rsi), %edx
	subq	$4, %r10
	je	L(fin2_7)
	addq	$4, %rdi
	cmpl	%edx, %eax
	jnz	L(fin2_7)
L(s8b):
	testq	$8, %r10
	jz	L(s16b)
	movq	(%rdi), %rax
	movq	(%rdi, %rsi), %rdx
	subq	$8, %r10
	je	L(fin2_7)
	addq	$8, %rdi
	cmpq	%rdx, %rax
	jnz	L(fin2_7)
	/* Only reached with exactly 16 bytes left: any smaller remainder
	   has already left through one of the "je" exits above.  */
L(s16b):
	movdqu	(%rdi), %xmm1
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	xorl	%eax, %eax
	subl	$0xffff, %edx
	jz	L(finz)
	bsfl	%edx, %ecx		/* Index of first differing byte.  */
	leaq	(%rdi, %rcx), %rcx
	movzbl	(%rcx), %eax
	movzbl	(%rsi, %rcx), %edx
	jmp	L(finz1)

	.p2align 4,, 4
	/* n == 1.  %rsi is still the real s2 pointer here.  */
L(finr1b):
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %edx
L(finz1):
	subl	%edx, %eax
L(exit):
	ret

	.p2align 4,, 4
	/* %rax and %rdx hold zero-extended 2-, 4- or 8-byte words loaded
	   from s1 and s2.  The loads are little-endian, so the lowest
	   differing bit lies in the lowest-addressed differing byte:
	   round its bit index down to a multiple of 8, shift that byte
	   into the low byte of each word and subtract.  */
L(fin2_7):
	cmpq	%rdx, %rax
	jz	L(finz)
	movq	%rax, %r11
	subq	%rdx, %r11
	bsfq	%r11, %rcx
	sarq	$3, %rcx
	salq	$3, %rcx		/* %cl = 8 * (differing byte index).  */
	sarq	%cl, %rax
	movzbl	%al, %eax
	sarq	%cl, %rdx
	movzbl	%dl, %edx
	subl	%edx, %eax
	ret

	.p2align 4,, 4
L(finz):
	xorl	%eax, %eax
	ret

	/* For blocks of 32 bytes or more
	   1. Advance one of the addr pointers to be 16B aligned.
	   2. Treat the case of both addr pointers aligned to 16B
	      separately to avoid movdqu.
	   3. Handle any blocks of greater than 64 consecutive bytes with
	      unrolling to reduce branches.
	   4. At least one addr pointer is 16B aligned, use memory version
	      of pcmpeqb.
	*/
	.p2align 4,, 4
L(gt32):
	movq	%rdx, %r11
	addq	%rdi, %r11		/* %r11 = s1 + n (end of s1).  */
	movq	%rdi, %r8

	andq	$15, %r8		/* %r8 = s1 misalignment.  */
	jz	L(16am)
	/* Both pointers may be misaligned.  Compare the first 16 bytes
	   unaligned, then round %rdi up to the next 16B boundary; the
	   bytes in between get compared a second time, which is
	   harmless.  */
	movdqu	(%rdi), %xmm1
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	neg	%r8
	leaq	16(%rdi, %r8), %rdi
L(16am):
	/* Handle two 16B aligned pointers separately.  %rdi is aligned
	   here, so s2 is aligned exactly when the offset %rsi is.  */
	testq	$15, %rsi
	jz	L(ATR)
	testq	$16, %rdi
	jz	L(A32)
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
L(A32):
	movq	%r11, %r10
	andq	$-32, %r10		/* %r10 = end rounded down to 32.  */
	cmpq	%r10, %rdi
	jae	L(mt16)			/* Address compare: unsigned.  */
	/* Pre-unroll to be ready for unrolled 64B loop.  */
	testq	$32, %rdi
	jz	L(A64)
	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

L(A64):
	movq	%r11, %r10
	andq	$-64, %r10		/* %r10 = end rounded down to 64.  */
	cmpq	%r10, %rdi
	jae	L(mt32)			/* Address compare: unsigned.  */

	/* 64 bytes per iteration, s2 unaligned.  */
L(A64main):
	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%rdi, %r10
	jne	L(A64main)

L(mt32):
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jae	L(mt16)			/* Address compare: unsigned.  */

	/* 32 bytes per iteration, s2 unaligned.  */
L(A32main):
	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%rdi, %r10
	jne	L(A32main)
	/* Fewer than 32 bytes left: hand the tail to the small path.  */
L(mt16):
	subq	%rdi, %r11
	je	L(finz)
	movq	%r11, %r10
	jmp	L(small)

	.p2align 4,, 4
	/* A 16-byte block at %rdi differs; %edx is the adjusted mask.
	   Load the first differing byte from each buffer and return
	   their difference.  */
L(neq):
	bsfl	%edx, %ecx
	movzbl	(%rdi, %rcx), %eax
	addq	%rdi, %rsi		/* %rsi = current s2 position.  */
	movzbl	(%rsi,%rcx), %edx
	jmp	L(finz1)

	.p2align 4,, 4
	/* Both pointers are 16B aligned, so the s2 loads can use movdqa.  */
L(ATR):
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jae	L(mt16)			/* Address compare: unsigned.  */
	testq	$16, %rdi
	jz	L(ATR32)

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
	cmpq	%rdi, %r10
	je	L(mt16)

L(ATR32):
	movq	%r11, %r10
	andq	$-64, %r10
	testq	$32, %rdi
	jz	L(ATR64)

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

L(ATR64):
	cmpq	%rdi, %r10
	je	L(mt32)

	/* 64 bytes per iteration, both pointers aligned.  */
L(ATR64main):
	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
	cmpq	%rdi, %r10
	jne	L(ATR64main)

	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jae	L(mt16)			/* Address compare: unsigned.  */

	/* 32 bytes per iteration, both pointers aligned.  */
L(ATR32res):
	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%r10, %rdi
	jne	L(ATR32res)

	/* Fewer than 32 bytes left: hand the tail to the small path.  */
	subq	%rdi, %r11
	je	L(finz)
	movq	%r11, %r10
	jmp	L(small)
	/* Align to 16byte to improve instruction fetch.  */
	.p2align 4,, 4
END(memcmp)
|
||||
|
||||
/* Remove any preprocessor definition of bcmp before the name is used
   as a symbol below.  */
#undef bcmp
|
||||
/* NOTE(review): weak_alias is defined outside this file; presumably it
   makes bcmp a weak alias of memcmp -- confirm against its definition.  */
weak_alias (memcmp, bcmp)
|
||||
/* NOTE(review): libc_hidden_builtin_def is defined outside this file;
   presumably it sets up the library-internal hidden definition of
   memcmp -- confirm against its definition.  */
libc_hidden_builtin_def (memcmp)
|
Loading…
Reference in New Issue
Block a user