From 3ab2d57a4d00046f1c472abd128517e93e20e485 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 5 Jun 2009 11:32:00 -0700 Subject: [PATCH] Optimize x86-64 strlen for SSE4.2. The SSE4.2 implementation is used in the DSO only. The patch also adds some infrastructure to be used in similar code later on. --- ChangeLog | 6 ++ sysdeps/x86_64/multiarch/Makefile | 1 + sysdeps/x86_64/multiarch/ifunc-defines.sym | 15 ++++ sysdeps/x86_64/multiarch/init-arch.h | 2 +- sysdeps/x86_64/multiarch/strlen.S | 87 ++++++++++++++++++++++ 5 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 sysdeps/x86_64/multiarch/ifunc-defines.sym create mode 100644 sysdeps/x86_64/multiarch/strlen.S diff --git a/ChangeLog b/ChangeLog index cc03bb36d9..c6a3530c86 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,11 @@ 2009-06-05 Ulrich Drepper + * sysdeps/x86_64/multiarch/strlen.S: New file. + * sysdeps/x86_64/multiarch/ifunc-defines.sym: New file. + * sysdeps/x86_64/multiarch/Makefile: Add rule to build ifunc-defines.h. + * sysdeps/x86_64/multiarch/init-arch.h: Name structure with register + content. + * csu/elf-init.c: Only compile in IFUNC functionality if USE_MULTIARCH is defined. 
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 2a1e910e06..33d98c36e6 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -1,3 +1,4 @@ ifeq ($(subdir),csu) aux += init-arch +gen-as-const-headers += ifunc-defines.sym endif diff --git a/sysdeps/x86_64/multiarch/ifunc-defines.sym b/sysdeps/x86_64/multiarch/ifunc-defines.sym new file mode 100644 index 0000000000..48d1287246 --- /dev/null +++ b/sysdeps/x86_64/multiarch/ifunc-defines.sym @@ -0,0 +1,15 @@ +#include "init-arch.h" +#include <stddef.h> + +-- + +CPU_FEATURES_SIZE sizeof (struct cpu_features) +KIND_OFFSET offsetof (struct cpu_features, kind) +CPUID_OFFSET offsetof (struct cpu_features, cpuid) +CPUID_SIZE sizeof (struct cpuid_registers) +CPUID_EAX_OFFSET offsetof (struct cpuid_registers, eax) +CPUID_EBX_OFFSET offsetof (struct cpuid_registers, ebx) +CPUID_ECX_OFFSET offsetof (struct cpuid_registers, ecx) +CPUID_EDX_OFFSET offsetof (struct cpuid_registers, edx) + +COMMON_CPUID_INDEX_1 diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h index 40b804571d..f160ba2a94 100644 --- a/sysdeps/x86_64/multiarch/init-arch.h +++ b/sysdeps/x86_64/multiarch/init-arch.h @@ -35,7 +35,7 @@ extern struct cpu_features arch_kind_other } kind; int max_cpuid; - struct + struct cpuid_registers { unsigned int eax; unsigned int ebx; diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S new file mode 100644 index 0000000000..bf889c1ab6 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen.S @@ -0,0 +1,87 @@ +/* strlen(str) -- determine the length of the string STR. + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Ulrich Drepper . + This file is part of the GNU C Library. 
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA. */
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+
+/* Define multiple versions only for the definition in libc and for
+   the DSO. In static binaries we need strlen before the initialization
+   has happened. */
+#if defined SHARED && !defined NOT_IN_libc
+        .text
+ENTRY(strlen)                           /* Resolver: returns in %rax the address of the strlen variant to use. */
+        .type strlen, @gnu_indirect_function    /* strlen is an IFUNC symbol; this code selects the real target. */
+        cmpl $0, __cpu_features+KIND_OFFSET(%rip)       /* kind != 0: skip the init call (presumably 0 == not yet initialized). */
+        jne 1f
+        call __init_cpu_features        /* NOTE(review): %rsp is not adjusted before this call; confirm the callee tolerates the resulting stack alignment. */
+1:      leaq __strlen_sse2(%rip), %rax  /* Default answer: the SSE2 version. */
+        testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)     /* Bit 20 of the stored CPUID leaf 1 %ecx value: SSE4.2. */
+        jz 2f                           /* Bit clear: keep the SSE2 version. */
+        leaq __strlen_sse42(%rip), %rax /* Bit set: hand out the SSE4.2 version instead. */
+2:      ret
+END(strlen)
+
+
+        .type __strlen_sse42, @function /* size_t __strlen_sse42 (const char *s): in %rdi = s, out %rax = length; writes %rcx, %rdx, %rsi, %rdi, %r8, %xmm1, %xmm2, flags. */
+__strlen_sse42:
+        pxor %xmm2, %xmm2               /* %xmm2 = 0. */
+        movq %rdi, %rcx                 /* %rcx = s, turned into the misalignment below. */
+        movq %rdi, %r8                  /* %r8 = s, kept for the final subtraction. */
+        andq $~15, %rdi                 /* %rdi = s rounded down to a 16-byte boundary. */
+        movdqa %xmm2, %xmm1             /* %xmm1 = 0, the all-NUL operand of the loop compare. */
+        pcmpeqb (%rdi), %xmm2           /* Bytes of %xmm2 become 0xff where the aligned block holds a NUL. */
+        orl $0xffffffff, %esi           /* %esi = all ones. */
+        subq %rdi, %rcx                 /* %rcx = s - aligned base, in 0..15. */
+        shll %cl, %esi                  /* Clear the mask bits of the bytes that precede s. */
+        pmovmskb %xmm2, %edx            /* %edx bit i set <=> byte i of the block is NUL. */
+        andl %esi, %edx                 /* Ignore NUL bytes located before s. */
+        jnz 1f                          /* Terminator is in the first block. */
+
+2:      pcmpistri $0x08, 16(%rdi), %xmm1        /* Equal-each byte compare of the next block against the empty string: ZF = block has a NUL, %ecx = its index (16 if none). */
+        leaq 16(%rdi), %rdi             /* Advance to the block just tested; lea leaves the flags alone. */
+        jnz 2b                          /* ZF clear: no NUL in that block, keep scanning. */
+
+        leaq (%rdi,%rcx), %rax          /* Address of the terminating NUL. */
+        subq %r8, %rax                  /* Length = NUL address - s. */
+        ret
+
+1:      bsfl %edx, %eax                 /* Index of the first NUL at or after s within the first block. */
+        leaq (%rdi,%rax), %rax          /* Address of the terminating NUL. */
+        subq %r8, %rax                  /* Length = NUL address - s. */
+        ret
+        .size __strlen_sse42, .-__strlen_sse42
+
+
+# undef ENTRY                           /* From here on ENTRY/END emit the symbol __strlen_sse2 whatever name is passed. */
+# define ENTRY(name) \
+        .type __strlen_sse2, @function; __strlen_sse2:
+# undef END
+# define END(name) \
+        .size __strlen_sse2, .-__strlen_sse2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strlen calls through a PLT.
+   The speedup we get from using SSE4.2 instructions is likely eaten away
+   by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+        .globl __GI_strlen; __GI_strlen = __strlen_sse2
+#endif
+
+#include "../strlen.S"                  /* Included code; inside the #if case above it is assembled with the redefined macros, otherwise with the stock ones. */