68b7efaadb
Also fixed the following whitespace nits to satisfy the push: sysdeps/alpha/alphaev6/memset.S:142: space before tab in indent. sysdeps/alpha/configure:1: new blank line at EOF. sysdeps/alpha/fpu/e_sqrt.c:126: space before tab in indent. sysdeps/alpha/preconfigure:1: new blank line at EOF. sysdeps/unix/sysv/linux/alpha/syscalls.list:1: new blank line at EOF.
175 lines
4.4 KiB
C
175 lines
4.4 KiB
C
/* Copyright (C) 2010-2014 Free Software Foundation, Inc.
|
|
This file is part of the GNU C Library.
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with the GNU C Library. If not, see
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
#include <string.h>
|
|
|
|
typedef unsigned long word;
|
|
|
|
static inline word
|
|
ldq_u(const void *s)
|
|
{
|
|
return *(const word *)((word)s & -8);
|
|
}
|
|
|
|
#define unlikely(X) __builtin_expect ((X), 0)
|
|
#define prefetch(X) __builtin_prefetch ((void *)(X), 0)
|
|
|
|
#define cmpbeq0(X) __builtin_alpha_cmpbge(0, (X))
|
|
#define find(X, Y) cmpbeq0 ((X) ^ (Y))
|
|
|
|
/* Search no more than N bytes of S for C. */
|
|
|
|
void *
|
|
__memchr (const void *s, int xc, size_t n)
|
|
{
|
|
const word *s_align;
|
|
word t, current, found, mask, offset;
|
|
|
|
if (unlikely (n == 0))
|
|
return 0;
|
|
|
|
current = ldq_u (s);
|
|
|
|
/* Replicate low byte of XC into all bytes of C. */
|
|
t = xc & 0xff; /* 0000000c */
|
|
t = (t << 8) | t; /* 000000cc */
|
|
t = (t << 16) | t; /* 0000cccc */
|
|
const word c = (t << 32) | t; /* cccccccc */
|
|
|
|
/* Align the source, and decrement the count by the number
|
|
of bytes searched in the first word. */
|
|
s_align = (const word *)((word)s & -8);
|
|
n += ((word)s & 7);
|
|
|
|
/* Deal with misalignment in the first word for the comparison. */
|
|
mask = (1ul << ((word)s & 7)) - 1;
|
|
|
|
/* If the entire string fits within one word, we may need masking
|
|
at both the front and the back of the string. */
|
|
if (unlikely (n <= 8))
|
|
{
|
|
mask |= -1ul << n;
|
|
goto last_quad;
|
|
}
|
|
|
|
found = find (current, c) & ~mask;
|
|
if (unlikely (found))
|
|
goto found_it;
|
|
|
|
s_align++;
|
|
n -= 8;
|
|
|
|
/* If the block is sufficiently large, align to cacheline and prefetch. */
|
|
if (unlikely (n >= 256))
|
|
{
|
|
/* Prefetch 3 cache lines beyond the one we're working on. */
|
|
prefetch (s_align + 8);
|
|
prefetch (s_align + 16);
|
|
prefetch (s_align + 24);
|
|
|
|
while ((word)s_align & 63)
|
|
{
|
|
current = *s_align;
|
|
found = find (current, c);
|
|
if (found)
|
|
goto found_it;
|
|
s_align++;
|
|
n -= 8;
|
|
}
|
|
|
|
/* Within each cacheline, advance the load for the next word
|
|
before the test for the previous word is complete. This
|
|
allows us to hide the 3 cycle L1 cache load latency. We
|
|
only perform this advance load within a cacheline to prevent
|
|
reading across page boundary. */
|
|
#define CACHELINE_LOOP \
|
|
do { \
|
|
word i, next = s_align[0]; \
|
|
for (i = 0; i < 7; ++i) \
|
|
{ \
|
|
current = next; \
|
|
next = s_align[1]; \
|
|
found = find (current, c); \
|
|
if (unlikely (found)) \
|
|
goto found_it; \
|
|
s_align++; \
|
|
} \
|
|
current = next; \
|
|
found = find (current, c); \
|
|
if (unlikely (found)) \
|
|
goto found_it; \
|
|
s_align++; \
|
|
n -= 64; \
|
|
} while (0)
|
|
|
|
/* While there's still lots more data to potentially be read,
|
|
continue issuing prefetches for the 4th cacheline out. */
|
|
while (n >= 256)
|
|
{
|
|
prefetch (s_align + 24);
|
|
CACHELINE_LOOP;
|
|
}
|
|
|
|
/* Up to 3 cache lines remaining. Continue issuing advanced
|
|
loads, but stop prefetching. */
|
|
while (n >= 64)
|
|
CACHELINE_LOOP;
|
|
|
|
/* We may have exhausted the buffer. */
|
|
if (n == 0)
|
|
return NULL;
|
|
}
|
|
|
|
/* Quadword aligned loop. */
|
|
current = *s_align;
|
|
while (n > 8)
|
|
{
|
|
found = find (current, c);
|
|
if (unlikely (found))
|
|
goto found_it;
|
|
current = *++s_align;
|
|
n -= 8;
|
|
}
|
|
|
|
/* The last word may need masking at the tail of the compare. */
|
|
mask = -1ul << n;
|
|
last_quad:
|
|
found = find (current, c) & ~mask;
|
|
if (found == 0)
|
|
return NULL;
|
|
|
|
found_it:
|
|
#ifdef __alpha_cix__
|
|
offset = __builtin_alpha_cttz (found);
|
|
#else
|
|
/* Extract LSB. */
|
|
found &= -found;
|
|
|
|
/* Binary search for the LSB. */
|
|
offset = (found & 0x0f ? 0 : 4);
|
|
offset += (found & 0x33 ? 0 : 2);
|
|
offset += (found & 0x55 ? 0 : 1);
|
|
#endif
|
|
|
|
return (void *)((word)s_align + offset);
|
|
}
|
|
|
|
#ifdef weak_alias
|
|
weak_alias (__memchr, memchr)
|
|
#endif
|
|
libc_hidden_builtin_def (memchr)
|