From 5a7af22fbb661e351473bfde639beeac16fc32f7 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Wed, 13 Jan 2010 07:51:48 -0800 Subject: [PATCH] Unroll the loop x86-64 SSE4.2 strlen. --- ChangeLog | 4 +++ sysdeps/x86_64/multiarch/strlen.S | 58 +++++++++++++++++++++++-------- 2 files changed, 48 insertions(+), 14 deletions(-) diff --git a/ChangeLog b/ChangeLog index 736bd5d778..9080030d1e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2010-01-12 H.J. Lu + + * sysdeps/x86_64/multiarch/strlen.S: Unroll the loop. + 2010-01-13 Ulrich Drepper * stdlib/stdlib.h: Be a bit more relaxed about obsoleted mktemp symbol. diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S index 509f9c9605..f9641131fa 100644 --- a/sysdeps/x86_64/multiarch/strlen.S +++ b/sysdeps/x86_64/multiarch/strlen.S @@ -46,28 +46,58 @@ END(strlen) __strlen_sse42: cfi_startproc CALL_MCOUNT - pxor %xmm2, %xmm2 - movq %rdi, %rcx + pxor %xmm1, %xmm1 + movl %edi, %ecx movq %rdi, %r8 andq $~15, %rdi - movdqa %xmm2, %xmm1 - pcmpeqb (%rdi), %xmm2 - orl $0xffffffff, %esi - subq %rdi, %rcx - shll %cl, %esi - pmovmskb %xmm2, %edx - andl %esi, %edx - jnz 1f + xor %edi, %ecx + pcmpeqb (%rdi), %xmm1 + pmovmskb %xmm1, %edx + shrl %cl, %edx + shll %cl, %edx + andl %edx, %edx + jnz L(less16bytes) + pxor %xmm1, %xmm1 -2: pcmpistri $0x08, 16(%rdi), %xmm1 - leaq 16(%rdi), %rdi - jnz 2b + .p2align 4 +L(more64bytes_loop): + pcmpistri $0x08, 16(%rdi), %xmm1 + jz L(more32bytes) + pcmpistri $0x08, 32(%rdi), %xmm1 + jz L(more48bytes) + + pcmpistri $0x08, 48(%rdi), %xmm1 + jz L(more64bytes) + + add $64, %rdi + pcmpistri $0x08, (%rdi), %xmm1 + jnz L(more64bytes_loop) leaq (%rdi,%rcx), %rax subq %r8, %rax ret -1: subq %r8, %rdi + .p2align 4 +L(more32bytes): + leaq 16(%rdi,%rcx, 1), %rax + subq %r8, %rax + ret + + .p2align 4 +L(more48bytes): + leaq 32(%rdi,%rcx, 1), %rax + subq %r8, %rax + ret + + .p2align 4 +L(more64bytes): + leaq 48(%rdi,%rcx, 1), %rax + subq %r8, %rax + ret + + .p2align 4 +L(less16bytes): + subq %r8, %rdi bsfl %edx, %eax addq %rdi, %rax ret