SSE4.2 strstr/strcasestr for x86-64.
This patch implements SSE4.2 strstr/strcasestr, using Knuth-Morris-Pratt string searching algorithm.
This commit is contained in:
parent
63fbc91bec
commit
2b7a8664fa
17
ChangeLog
17
ChangeLog
@ -1,3 +1,20 @@
|
||||
2009-07-20 H.J. Lu <hongjiu.lu@intel.com>
|
||||
|
||||
* string/strcasestr.c (STRCASESTR): New macro.
|
||||
(__strcasestr): Renamed to ..
|
||||
(STRCASESTR): ...this.
|
||||
* string/strstr.c (STRSTR): New macro.
|
||||
(strstr): Renamed to ..
|
||||
(STRSTR): ...this.
|
||||
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
|
||||
strstr-c strcasestr-c
|
||||
(CFLAGS-strstr.c): New.
|
||||
(CFLAGS-strcasestr.c): Likewise.
|
||||
* sysdeps/x86_64/multiarch/strcasestr-c.c: New file.
|
||||
* sysdeps/x86_64/multiarch/strcasestr.c: New file.
|
||||
* sysdeps/x86_64/multiarch/strstr-c.c: New file.
|
||||
* sysdeps/x86_64/multiarch/strstr.c: New file.
|
||||
|
||||
2009-07-20 Ulrich Drepper <drepper@redhat.com>
|
||||
|
||||
* locale/localeinfo.h (LIMAGIC): Update value for LC_CTYPE.
|
||||
|
@ -1,5 +1,6 @@
|
||||
/* Return the offset of one string within another.
|
||||
Copyright (C) 1994, 1996-2000, 2004, 2008 Free Software Foundation, Inc.
|
||||
Copyright (C) 1994, 1996-2000, 2004, 2008, 2009
|
||||
Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
@ -52,11 +53,16 @@
|
||||
#undef strcasestr
|
||||
#undef __strcasestr
|
||||
|
||||
#ifndef STRCASESTR
|
||||
#define STRCASESTR __strcasestr
|
||||
#endif
|
||||
|
||||
|
||||
/* Find the first occurrence of NEEDLE in HAYSTACK, using
|
||||
case-insensitive comparison. This function gives unspecified
|
||||
results in multibyte locales. */
|
||||
char *
|
||||
__strcasestr (const char *haystack_start, const char *needle_start)
|
||||
STRCASESTR (const char *haystack_start, const char *needle_start)
|
||||
{
|
||||
const char *haystack = haystack_start;
|
||||
const char *needle = needle_start;
|
||||
|
@ -1,5 +1,6 @@
|
||||
/* Return the offset of one string within another.
|
||||
Copyright (C) 1994,1996,1997,2000,2001,2003,2008 Free Software Foundation, Inc.
|
||||
Copyright (C) 1994,1996,1997,2000,2001,2003,2008,2009
|
||||
Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
@ -40,11 +41,15 @@
|
||||
|
||||
#undef strstr
|
||||
|
||||
#ifndef STRSTR
|
||||
#define STRSTR strstr
|
||||
#endif
|
||||
|
||||
/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK
|
||||
if NEEDLE is empty, otherwise NULL if NEEDLE is not found in
|
||||
HAYSTACK. */
|
||||
char *
|
||||
strstr (const char *haystack_start, const char *needle_start)
|
||||
STRSTR (const char *haystack_start, const char *needle_start)
|
||||
{
|
||||
const char *haystack = haystack_start;
|
||||
const char *needle = needle_start;
|
||||
|
@ -6,9 +6,11 @@ endif
|
||||
ifeq ($(subdir),string)
|
||||
sysdep_routines += stpncpy-c strncpy-c strncmp-c
|
||||
ifeq (yes,$(config-cflags-sse4))
|
||||
sysdep_routines += strcspn-c strpbrk-c strspn-c
|
||||
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
|
||||
CFLAGS-strcspn-c.c += -msse4
|
||||
CFLAGS-strpbrk-c.c += -msse4
|
||||
CFLAGS-strspn-c.c += -msse4
|
||||
CFLAGS-strstr.c += -msse4
|
||||
CFLAGS-strcasestr.c += -msse4
|
||||
endif
|
||||
endif
|
||||
|
18
sysdeps/x86_64/multiarch/strcasestr-c.c
Normal file
18
sysdeps/x86_64/multiarch/strcasestr-c.c
Normal file
@ -0,0 +1,18 @@
|
||||
#include "init-arch.h"
|
||||
|
||||
#define STRCASESTR __strcasestr_sse2
|
||||
#undef libc_hidden_builtin_def
|
||||
#define libc_hidden_builtin_def(name) \
|
||||
__hidden_ver1 (__strcasestr_sse2, __GI_strcasestr, __strcasestr_sse2);
|
||||
|
||||
#include "string/strcasestr.c"
|
||||
|
||||
extern char *__strcasestr_sse42 (const char *, const char *);
|
||||
|
||||
#if 1
|
||||
libc_ifunc (__strcasestr,
|
||||
HAS_SSE4_2 ? __strcasestr_sse42 : __strcasestr_sse2);
|
||||
#else
|
||||
libc_ifunc (__strcasestr,
|
||||
0 ? __strcasestr_sse42 : __strcasestr_sse2);
|
||||
#endif
|
3
sysdeps/x86_64/multiarch/strcasestr.c
Normal file
3
sysdeps/x86_64/multiarch/strcasestr.c
Normal file
@ -0,0 +1,3 @@
|
||||
#define USE_AS_STRCASESTR
|
||||
#define STRSTR_SSE42 __strcasestr_sse42
|
||||
#include "strstr.c"
|
12
sysdeps/x86_64/multiarch/strstr-c.c
Normal file
12
sysdeps/x86_64/multiarch/strstr-c.c
Normal file
@ -0,0 +1,12 @@
|
||||
#include "init-arch.h"
|
||||
|
||||
#define STRSTR __strstr_sse2
|
||||
#undef libc_hidden_builtin_def
|
||||
#define libc_hidden_builtin_def(name) \
|
||||
__hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
|
||||
|
||||
#include "string/strstr.c"
|
||||
|
||||
extern char *__strstr_sse42 (const char *, const char *);
|
||||
|
||||
libc_ifunc (strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_sse2);
|
483
sysdeps/x86_64/multiarch/strstr.c
Normal file
483
sysdeps/x86_64/multiarch/strstr.c
Normal file
@ -0,0 +1,483 @@
|
||||
/* strstr with SSE4.2 intrinsics
|
||||
Copyright (C) 2009 Free Software Foundation, Inc.
|
||||
Contributed by Intel Corporation.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
||||
02111-1307 USA. */
|
||||
|
||||
#include <nmmintrin.h>
|
||||
|
||||
#ifndef STRSTR_SSE42
|
||||
# define STRSTR_SSE42 __strstr_sse42
|
||||
#endif
|
||||
|
||||
#ifdef USE_AS_STRCASESTR
|
||||
# include <ctype.h>
|
||||
# include <locale.h>
|
||||
# include <string.h>
|
||||
|
||||
# define LOADBYTE(C) tolower (C)
|
||||
# define CMPBYTE(C1, C2) \
|
||||
((C1) == (C2) || tolower (C1) == tolower (C2))
|
||||
#else
|
||||
# define LOADBYTE(C) (C)
|
||||
# define CMPBYTE(C1, C2) ((C1) == (C2))
|
||||
#endif
|
||||
|
||||
/* We use 0xe ordered-compare:
|
||||
_SIDD_SBYTE_OPS
|
||||
| _SIDD_CMP_EQUAL_ORDER
|
||||
| _SIDD_LEAST_SIGNIFICANT
|
||||
on pcmpistri to do the scanning and string comparsion requirements of
|
||||
sub-string match. In the scanning phase, we process Cflag and ECX
|
||||
index to locate the first fragment match; once the first fragment
|
||||
match position has been identified, we do comparison of subsequent
|
||||
string fragments until we can conclude false or true match; whe
|
||||
n concluding a false match, we may need to repeat scanning process
|
||||
from next relevant offset in the target string.
|
||||
|
||||
In the scanning phase we have 4 cases:
|
||||
case ECX CFlag ZFlag SFlag
|
||||
1 16 0 0 0
|
||||
2a 16 0 0 1
|
||||
2b 16 0 1 0
|
||||
2c 16 0 1 1
|
||||
|
||||
1. No ordered-comparison match, both 16B fragments are valid, so
|
||||
continue to next fragment.
|
||||
2. No ordered-comparison match, there is EOS in either fragment,
|
||||
2a. Zflg = 0, Sflg = 1, we continue
|
||||
2b. Zflg = 1, Sflg = 0, we conclude no match and return.
|
||||
2c. Zflg = 1, sflg = 1, lenth determine match or no match
|
||||
|
||||
In the string comparison phase, the 1st fragment match is fixed up
|
||||
to produce ECX = 0. Subsequent fragment compare of nonzero index
|
||||
and no match conclude a false match.
|
||||
|
||||
case ECX CFlag ZFlag SFlag
|
||||
3 X 1 0 0/1
|
||||
4a 0 1 0 0
|
||||
4b 0 1 0 1
|
||||
4c 0 < X 1 0 0/1
|
||||
5 16 0 1 0
|
||||
|
||||
3. An initial ordered-comparison fragment match, we fix up to do
|
||||
subsequent string comparison
|
||||
4a. Continuation of fragment comparison of a string compare.
|
||||
4b. EOS reached in the reference string, we conclude true match and
|
||||
return
|
||||
4c. String compare failed if index is nonzero, we need to go back to
|
||||
scanning
|
||||
5. failed string compare, go back to scanning
|
||||
*/
|
||||
|
||||
/* Fix-up of removal of unneeded data due to 16B aligned load
|
||||
parameters:
|
||||
value: 16B data loaded from 16B aligned address.
|
||||
offset: Offset of target data address relative to 16B aligned load
|
||||
address.
|
||||
*/
|
||||
|
||||
static __inline__ __m128i
|
||||
__m128i_shift_right (__m128i value, int offset)
|
||||
{
|
||||
switch (offset)
|
||||
{
|
||||
case 1:
|
||||
value = _mm_srli_si128 (value, 1);
|
||||
break;
|
||||
case 2:
|
||||
value = _mm_srli_si128 (value, 2);
|
||||
break;
|
||||
case 3:
|
||||
value = _mm_srli_si128 (value, 3);
|
||||
break;
|
||||
case 4:
|
||||
value = _mm_srli_si128 (value, 4);
|
||||
break;
|
||||
case 5:
|
||||
value = _mm_srli_si128 (value, 5);
|
||||
break;
|
||||
case 6:
|
||||
value = _mm_srli_si128 (value, 6);
|
||||
break;
|
||||
case 7:
|
||||
value = _mm_srli_si128 (value, 7);
|
||||
break;
|
||||
case 8:
|
||||
value = _mm_srli_si128 (value, 8);
|
||||
break;
|
||||
case 9:
|
||||
value = _mm_srli_si128 (value, 9);
|
||||
break;
|
||||
case 10:
|
||||
value = _mm_srli_si128 (value, 10);
|
||||
break;
|
||||
case 11:
|
||||
value = _mm_srli_si128 (value, 11);
|
||||
break;
|
||||
case 12:
|
||||
value = _mm_srli_si128 (value, 12);
|
||||
break;
|
||||
case 13:
|
||||
value = _mm_srli_si128 (value, 13);
|
||||
break;
|
||||
case 14:
|
||||
value = _mm_srli_si128 (value, 14);
|
||||
break;
|
||||
case 15:
|
||||
value = _mm_srli_si128 (value, 15);
|
||||
break;
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
/* Simple replacement of movdqu to address 4KB boundary cross issue.
|
||||
If EOS occurs within less than 16B before 4KB boundary, we don't
|
||||
cross to next page. */
|
||||
|
||||
static __m128i
|
||||
__attribute__ ((section (".text.sse4.2")))
|
||||
__m128i_strloadu (const unsigned char * p)
|
||||
{
|
||||
int offset = ((size_t) p & (16 - 1));
|
||||
|
||||
if (offset && (int) ((size_t) p & 0xfff) > 0xff0)
|
||||
{
|
||||
__m128i a = _mm_load_si128 ((__m128i *) (p - offset));
|
||||
__m128i zero = _mm_setzero_si128 ();
|
||||
int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
|
||||
if ((bmsk >> offset) != 0)
|
||||
return __m128i_shift_right (a, offset);
|
||||
}
|
||||
return _mm_loadu_si128 ((__m128i *) p);
|
||||
}
|
||||
|
||||
#ifdef USE_AS_STRCASESTR
|
||||
|
||||
/* Similar to __m128i_strloadu. Convert to lower case for POSIX/C
|
||||
locale. */
|
||||
|
||||
static __m128i
|
||||
__attribute__ ((section (".text.sse4.2")))
|
||||
__m128i_strloadu_tolower_posix (const unsigned char * p)
|
||||
{
|
||||
__m128i frag = __m128i_strloadu (p);
|
||||
|
||||
/* Convert frag to lower case for POSIX/C locale. */
|
||||
__m128i rangeuc = _mm_set_epi64x (0x0, 0x5a41);
|
||||
__m128i u2ldelta = _mm_set1_epi64x (0xe0e0e0e0e0e0e0e0);
|
||||
__m128i mask1 = _mm_cmpistrm (rangeuc, frag, 0x44);
|
||||
__m128i mask2 = _mm_blendv_epi8 (u2ldelta, frag, mask1);
|
||||
mask2 = _mm_sub_epi8 (mask2, u2ldelta);
|
||||
return _mm_blendv_epi8 (frag, mask2, mask1);
|
||||
}
|
||||
|
||||
/* Similar to __m128i_strloadu. Convert to lower case for none-POSIX/C
|
||||
locale. */
|
||||
|
||||
static __m128i
|
||||
__attribute__ ((section (".text.sse4.2")))
|
||||
__m128i_strloadu_tolower (const unsigned char * p)
|
||||
{
|
||||
union
|
||||
{
|
||||
char b[16];
|
||||
__m128i x;
|
||||
} u;
|
||||
|
||||
for (int i = 0; i < 16; i++)
|
||||
if (p[i] == 0)
|
||||
{
|
||||
u.b[i] = 0;
|
||||
break;
|
||||
}
|
||||
else
|
||||
u.b[i] = tolower (p[i]);
|
||||
|
||||
return u.x;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP
|
||||
algorithm) overlap for a fully populated 16B vector.
|
||||
Input parameter: 1st 16Byte loaded from the reference string of a
|
||||
strstr function.
|
||||
We don't use KMP algorithm if reference string is less than 16B.
|
||||
*/
|
||||
|
||||
static int
|
||||
__inline__ __attribute__ ((__always_inline__,))
|
||||
KMP16Bovrlap (__m128i s2)
|
||||
{
|
||||
__m128i a, b;
|
||||
int bmsk, k1;
|
||||
|
||||
b = _mm_unpacklo_epi8 (s2, s2);
|
||||
a = _mm_unpacklo_epi8 (b, b);
|
||||
a = _mm_shuffle_epi32 (a, 0);
|
||||
b = _mm_srli_si128 (s2, sizeof (char));
|
||||
bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a));
|
||||
|
||||
/* _BitScanForward(&k1, bmsk); */
|
||||
__asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk));
|
||||
if (!bmsk)
|
||||
return 16;
|
||||
else if (bmsk == 0x7fff)
|
||||
return 1;
|
||||
else if (!k1)
|
||||
{
|
||||
/* There are al least two ditinct char in s2. If byte 0 and 1 are
|
||||
idential and the distinct value lies farther down, we can deduce
|
||||
the next byte offset to restart full compare is least no earlier
|
||||
than byte 3. */
|
||||
return 3;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Byte 1 is not degenerated to byte 0. */
|
||||
return k1 + 1;
|
||||
}
|
||||
}
|
||||
|
||||
char *
|
||||
__attribute__ ((section (".text.sse4.2")))
|
||||
STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
|
||||
{
|
||||
int len, len1;
|
||||
const unsigned char *p1 = s1;
|
||||
const unsigned char *p2 = s2;
|
||||
__m128i frag1, frag2, zero;
|
||||
int cmp, cmp_c, cmp_z, cmp_s;
|
||||
int kmp_fwd, bmsk, bmsk1;
|
||||
const unsigned char *pt;
|
||||
|
||||
if (!p2[0])
|
||||
return (char *) p1;
|
||||
|
||||
if (!p1[0])
|
||||
return NULL;
|
||||
|
||||
/* Check if p1 length is 1 byte long. */
|
||||
if (!p1[1])
|
||||
return !p2[1] && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
|
||||
|
||||
#ifdef USE_AS_STRCASESTR
|
||||
__m128i (*strloadu) (const unsigned char *);
|
||||
const char *used_locale = setlocale (LC_CTYPE, NULL);
|
||||
|
||||
if (!used_locale
|
||||
|| (used_locale[0] == 'C' && used_locale[1] == '\0')
|
||||
|| strcmp (used_locale, "POSIX") == 0)
|
||||
strloadu = __m128i_strloadu_tolower_posix;
|
||||
else
|
||||
strloadu = __m128i_strloadu_tolower;
|
||||
#else
|
||||
# define strloadu __m128i_strloadu
|
||||
#endif
|
||||
|
||||
/* p1 > 1 byte long. Load up to 16 bytes of fragment. */
|
||||
frag1 = strloadu (p1);
|
||||
|
||||
if (p2[1])
|
||||
{
|
||||
/* p2 is > 1 byte long. */
|
||||
frag2 = strloadu (p2);
|
||||
}
|
||||
else
|
||||
{
|
||||
zero = _mm_setzero_si128 ();
|
||||
frag2 = _mm_insert_epi8 (zero, LOADBYTE(p2[0]), 0);
|
||||
}
|
||||
|
||||
/* Unsigned bytes, equal order, does frag2 has null? */
|
||||
cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
|
||||
cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
|
||||
cmp = _mm_cmpistri (frag2, frag1, 0x0c);
|
||||
cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
|
||||
if (cmp_s & cmp_c)
|
||||
{
|
||||
zero = _mm_setzero_si128 ();
|
||||
bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, zero));
|
||||
__asm ("bsfl %[bmsk], %[len]"
|
||||
: [len] "=r" (len) : [bmsk] "r" (bmsk));
|
||||
p1 += cmp;
|
||||
if ((len + cmp) <= 16)
|
||||
return (char *) p1;
|
||||
else
|
||||
{
|
||||
/* Load up to 16 bytes of fragment. */
|
||||
frag1 = strloadu (p1);
|
||||
cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
|
||||
cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
|
||||
cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
|
||||
cmp = _mm_cmpistri (frag2, frag1, 0x0c);
|
||||
if ((len + cmp) <= 16)
|
||||
return (char *) p1 + cmp;
|
||||
}
|
||||
}
|
||||
|
||||
if (cmp_s)
|
||||
{
|
||||
/* Adjust addr for 16B alginment in ensuing loop. */
|
||||
while (!cmp_z)
|
||||
{
|
||||
p1 += cmp;
|
||||
/* Load up to 16 bytes of fragment. */
|
||||
frag1 = strloadu (p1);
|
||||
cmp = _mm_cmpistri (frag2, frag1, 0x0c);
|
||||
cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
|
||||
cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
|
||||
/* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp
|
||||
once already, this time cmp will be zero and we can exit. */
|
||||
if ((!cmp) & cmp_c)
|
||||
break;
|
||||
}
|
||||
|
||||
if (!cmp_c)
|
||||
return NULL;
|
||||
else
|
||||
{
|
||||
/* Since s2 is less than 16 bytes, com_c is definitive
|
||||
determination of full match. */
|
||||
return (char *) p1 + cmp;
|
||||
}
|
||||
}
|
||||
|
||||
/* General case, s2 is at least 16 bytes or more.
|
||||
First, the common case of false-match at first byte of p2. */
|
||||
pt = NULL;
|
||||
kmp_fwd = 0;
|
||||
re_trace:
|
||||
while (!cmp_c)
|
||||
{
|
||||
/* frag1 has null. */
|
||||
if (cmp_z)
|
||||
return NULL;
|
||||
|
||||
/* frag 1 has no null, advance 16 bytes. */
|
||||
p1 += 16;
|
||||
/* Load up to 16 bytes of fragment. */
|
||||
frag1 = strloadu (p1);
|
||||
/* Unsigned bytes, equal order, is there a partial match? */
|
||||
cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
|
||||
cmp = _mm_cmpistri(frag2, frag1, 0x0c);
|
||||
cmp_z = _mm_cmpistrz(frag2, frag1, 0x0c);
|
||||
}
|
||||
|
||||
/* Next, handle inital positive match as first byte of p2. We have
|
||||
a partial fragment match, make full determination until we reached
|
||||
end of s2. */
|
||||
if (!cmp)
|
||||
{
|
||||
if (cmp_z)
|
||||
return (char *) p1;
|
||||
|
||||
pt = p1;
|
||||
p1 += 16;
|
||||
p2 += 16;
|
||||
/* Load up to 16 bytes of fragment. */
|
||||
frag2 = strloadu(p2);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Adjust 16B alignment. */
|
||||
p1 += cmp;
|
||||
pt = p1;
|
||||
}
|
||||
|
||||
/* Load up to 16 bytes of fragment. */
|
||||
frag1 = strloadu (p1);
|
||||
|
||||
/* Unsigned bytes, equal order, does frag2 has null? */
|
||||
cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
|
||||
cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
|
||||
cmp = _mm_cmpistri (frag2, frag1, 0x0c);
|
||||
cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
|
||||
while (!(cmp | cmp_z | cmp_s))
|
||||
{
|
||||
p1 += 16;
|
||||
p2 += 16;
|
||||
/* Load up to 16 bytes of fragment. */
|
||||
frag2 = strloadu (p2);
|
||||
/* Load up to 16 bytes of fragment. */
|
||||
frag1 = strloadu (p1);
|
||||
/* Unsigned bytes, equal order, does frag2 has null? */
|
||||
cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
|
||||
cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
|
||||
cmp = _mm_cmpistri (frag2, frag1, 0x0c);
|
||||
cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
|
||||
}
|
||||
|
||||
/* Full determination yielded false result, retrace s1 to next
|
||||
starting position.
|
||||
Zflg 1 0 1 0/1
|
||||
Sflg 0 1 1 0/1
|
||||
cmp na 0 0 >0
|
||||
action done done continue continue if s2 < s1
|
||||
false match retrace s1 else false
|
||||
*/
|
||||
|
||||
if(cmp_s & !cmp)
|
||||
return (char *) pt;
|
||||
else if (cmp_z)
|
||||
{
|
||||
if (!cmp_s)
|
||||
return NULL;
|
||||
|
||||
/* Handle both zero and sign flag set and s1 is shorter in
|
||||
length. */
|
||||
zero = _mm_setzero_si128 ();
|
||||
bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
|
||||
bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
|
||||
__asm ("bsfl %[bmsk], %[len]"
|
||||
: [len] "=r" (len) : [bmsk] "r" (bmsk));
|
||||
__asm ("bsfl %[bmsk1], %[len1]"
|
||||
: [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
|
||||
if (len >= len1)
|
||||
return NULL;
|
||||
}
|
||||
else if (!cmp)
|
||||
return (char *) pt;
|
||||
|
||||
/* Otherwise, we have to retrace and continue. Default of multiple
|
||||
paths that need to retrace from next byte in s1. */
|
||||
p2 = s2;
|
||||
frag2 = strloadu (p2);
|
||||
|
||||
if (!kmp_fwd)
|
||||
kmp_fwd = KMP16Bovrlap (frag2);
|
||||
|
||||
/* KMP algorithm predicted overlap needs to be corrected for
|
||||
partial fragment compare. */
|
||||
p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);
|
||||
|
||||
/* Since s2 is at least 16 bytes long, we're certain there is no
|
||||
match. */
|
||||
if (!p1[0])
|
||||
return NULL;
|
||||
else
|
||||
{
|
||||
/* Load up to 16 bytes of fragment. */
|
||||
frag1 = strloadu (p1);
|
||||
}
|
||||
|
||||
/* Unsigned bytes, equal order, is there a partial match? */
|
||||
cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
|
||||
cmp = _mm_cmpistri (frag2, frag1, 0x0c);
|
||||
cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
|
||||
goto re_trace;
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user