glibc/sysdeps/alpha/memchr.S

/* Copyright (C) 1996 Free Software Foundation, Inc.
   Contributed by David Mosberger (davidm@cs.arizona.edu).

This file is part of the GNU C Library.

The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.

The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Library General Public License for more details.

You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB.  If
not, write to the Free Software Foundation, Inc., 675 Mass Ave,
Cambridge, MA 02139, USA.  */

/* Finds characters in a memory area.  Optimized for the Alpha
architecture:

      - memory accessed as aligned quadwords only
      - uses cmpbge to compare 8 bytes in parallel
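      - does a binary search to find the matching byte (a 0 byte
        once the quadword has been XORed with the replicated
        search character) in the quadword that contains it
        (HAKMEM needed 12 instructions to do this instead of the
        9 instructions that binary search needs).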

For correctness consider that:

      - only minimum number of quadwords may be accessed
      - the third argument is an unsigned long
*/

#include <sysdep.h>
#ifdef __linux__
# include <alpha/regdef.h>
#else
#include <regdef.h>
#endif

        .set noreorder
        .set noat

/* void *memchr (const void *s, int c, size_t n)

   In:	a0 = s	start of the area to search (any alignment)
	a1 = c	byte to look for; only the low 8 bits are used
	a2 = n	number of bytes to search, treated as unsigned
   Out:	v0 = address of the first byte equal to c in [s, s+n),
	     or 0 when there is none (including n == 0)
   Writes: t0-t6, a1, a2, v0.  The stack is not touched.

   Memory is only read with ldq_u/ldq, i.e. as aligned quadwords.
   For n > 8 a quadword is loaded at most one step ahead of the one
   being compared, and never past the quadword holding byte s+n-1.
   For n <= 8 the (at most two) quadwords covering [s, s+n) are both
   loaded before the comparison.  */
ENTRY(memchr)
	beq	a2, not_found	# n == 0: nothing to search
	ldq_u	t0, 0(a0)	# load first quadword (a0 may be misaligned)
	addq	a0, a2, t4	# t4 = s + n, one past the last byte
	and	a1, 0xff, a1	# a1 = 00000000000000ch
	sll	a1,  8, t1	# t1 = 000000000000ch00
	cmpult	a2, 9, t3	# t3 = (n <= 8)
	or	t1, a1, a1	# a1 = 000000000000chch
	sll	a1, 16, t1	# t1 = 00000000chch0000
	lda	t2, -1(zero)	# t2 = ffffffffffffffff
	or	t1, a1, a1	# a1 = 00000000chchchch
	sll	a1, 32, t1	# t1 = chchchch00000000
	extql	t0, a0, t6	# t6 = bytes of t0 from s[0] upwards, moved down to byte 0
	or	t1, a1, a1	# a1 = chchchchchchchch

	/*
	 * A very large n (e.g. (size_t) -1, meaning "search until the
	 * byte is found") makes s + n wrap around.  The end address
	 * would then lie below s and the quadword scan would stop at
	 * once with a meaningless byte count.  Saturate the end address
	 * instead, so that the scan simply continues until a match.
	 */
	cmpult	t4, a0, t5	# t5 = 1 if s + n wrapped around
	cmovne	t5, t2, t4	# if so, t4 = ffffffffffffffff

	beq	t3, first_quad	# n > 8: scan quadword by quadword

	/*
	 * n <= 8: the bytes to search span at most two aligned
	 * quadwords.  The quadword holding the last byte is loaded
	 * only on this path; for n > 8 it must not be touched before
	 * all preceding quadwords have been compared.
	 */
	ldq_u	t5, -1(t4)	# t5 = aligned quadword containing s[n-1]
	extqh	t5, a0, t5	# move its low bytes up behind those in t6
	mov	a0, v0		# byte 0 of t0 will correspond to address a0
	or	t6, t5, t0	# t0 = quadword starting at a0

	#
	# Deal with the case where at most 8 bytes remain to be searched
	# in t0.  E.g.:
	#	a2 = 6
	#	t0 = ????c6c5c4c3c2c1
	#
	# On entry: t0 = quadword whose byte 0 is at address v0,
	#           a2 = number of valid bytes in t0 (1..8),
	#           t2 = ffffffffffffffff.
last_quad:
	negq	a2, t5		# shift counts are taken modulo 64: -a2 acts as 64 - a2
	srl	t2, t5, t5	# t5 = mask of a2 bits set
	xor	a1, t0, t0	# matching bytes become 00
	cmpbge	zero, t0, t1	# t1 bit i = 1 if byte i of t0 is 00
	and	t1, t5, t1	# ignore matches beyond the a2 valid bytes
	beq	t1, not_found

	#
	# On entry: t1 = non-zero byte-match mask from cmpbge,
	#           v0 = address corresponding to byte 0 of that quadword.
found_it:
	# now, determine which byte matched:
	negq	t1, t2
	and	t1, t2, t1	# keep only the lowest set bit (first match)

	and	t1, 0x0f, t0	# zero if the bit is in 4..7
	addq	v0, 4, t2
	cmoveq	t0, t2, v0	# then advance v0 by 4

	and	t1, 0x33, t0	# zero if the bit is in 2,3,6,7
	addq	v0, 2, t2
	cmoveq	t0, t2, v0	# then advance v0 by 2

	and	t1, 0x55, t0	# zero if the bit is in an odd position
	addq	v0, 1, t2
	cmoveq	t0, t2, v0	# then advance v0 by 1

done:	ret


	#
	# Deal with the case where a2 > 8 bytes remain to be
	# searched.  a0 may not be aligned.
	#
first_quad:
	andnot	a0, 0x7, v0	# v0 = address of the aligned first quadword
	insqh	t2, a0, t1	# t1 = 0000ffffffffffff (a0<0:2> ff bytes)
	xor	t0, a1, t0	# matching bytes become 00
	or	t0, t1, t0	# t0 = ====ffffffffffff (bytes below a0 can never match)
	cmpbge	zero, t0, t1
	bne	t1, found_it

	/* at least one byte left to process */

	ldq	t0, 8(v0)	# safe: n > 8, so s[n-1] lies beyond the first quadword
	addq	v0, 8, v0
	/*
	 * Make a2 point to last quad to be accessed (the
	 * last quad may or may not be partial).
	 */
	subq	t4, 1, a2
	andnot	a2, 0x7, a2
	cmpult	v0, a2, t1
	beq	t1, final	# v0 is already the last quad

	/* at least two quads remain to be accessed */

	subq	a2, v0, t3	# t3 <- number of quads to be processed in loop
	and	t3, 8, t3	# odd number of quads?
	bne	t3, odd_quad_count

	/* at least three quads remain to be accessed */

	mov	t0, t3		# move prefetched value into correct register

	/*
	 * Loop unrolled twice.  The two halves alternate between t3 and
	 * t0 as "quadword at v0" and "quadword prefetched from v0 + 8".
	 * Every prefetch address is <= a2, the last quad to be accessed.
	 */
	.align	3
unrolled_loop:
	ldq	t0, 8(v0)	# prefetch t0
	xor	a1, t3, t1
	cmpbge	zero, t1, t1
	bne	t1, found_it

	addq	v0, 8, v0
odd_quad_count:
	xor	a1, t0, t1
	ldq	t3, 8(v0)	# prefetch t3
	cmpbge	zero, t1, t1
	bne	t1, found_it

	addq	v0, 8, v0
	cmpult	v0, a2, t5
	bne	t5, unrolled_loop

	mov	t3, t0		# move prefetched value into t0
final:	subq	t4, v0, a2	# a2 <- number of bytes left to do
	bne	a2, last_quad

not_found:
	mov	zero, v0	# return a null pointer
	ret

	.end	memchr
Sat Mar 16 20:08:22 1996 David Mosberger-Tang <davidm@azstarnet.com> * sysdeps/alpha/memchr.S: new file. * sysdeps/alpha/memchr.c: obsolete file removed. Sat Mar 16 16:26:09 1996 Roland McGrath <roland@charlie-brown.gnu.ai.mit.edu> * misc/Makefile (headers): Add sysexits.h. * misc/sysexits.h: New file. Thu Mar 14 15:20:45 1996 Andreas Schwab <schwab@issan.informatik.uni-dortmund.de> * sysdeps/libm-ieee754/e_atan2.c (__ieee754_atan2): Change atan call to __atan. * sysdeps/libm-ieee754/e_atan2f.c (__ieee754_atan2f): Change atanf call to __atanf. * sysdeps/m68k/fpu/e_acos.c, sysdeps/m68k/fpu/e_acosf.c, sysdeps/m68k/fpu/e_asin.c, sysdeps/m68k/fpu/e_asinf.c, sysdeps/m68k/fpu/e_atanh.c, sysdeps/m68k/fpu/e_atanhf.c, sysdeps/m68k/fpu/e_cosh.c, sysdeps/m68k/fpu/e_coshf.c, sysdeps/m68k/fpu/e_exp.c, sysdeps/m68k/fpu/e_expf.c, sysdeps/m68k/fpu/e_fmod.c, sysdeps/m68k/fpu/e_fmodf.c, sysdeps/m68k/fpu/e_log.c, sysdeps/m68k/fpu/e_log10.c, sysdeps/m68k/fpu/e_log10f.c, sysdeps/m68k/fpu/e_logf.c, sysdeps/m68k/fpu/e_pow.c, sysdeps/m68k/fpu/e_powf.c, sysdeps/m68k/fpu/e_remainder.c, sysdeps/m68k/fpu/e_remainderf.c, sysdeps/m68k/fpu/e_scalb.c, sysdeps/m68k/fpu/e_scalbf.c, sysdeps/m68k/fpu/e_sinh.c, sysdeps/m68k/fpu/e_sinhf.c, sysdeps/m68k/fpu/e_sqrt.c, sysdeps/m68k/fpu/e_sqrtf.c, sysdeps/m68k/fpu/k_cos.c, sysdeps/m68k/fpu/k_cosf.c, sysdeps/m68k/fpu/k_sin.c, sysdeps/m68k/fpu/k_sinf.c, sysdeps/m68k/fpu/k_tan.c, sysdeps/m68k/fpu/k_tanf.c, sysdeps/m68k/fpu/s_atan.c, sysdeps/m68k/fpu/s_atanf.c, sysdeps/m68k/fpu/s_ceil.c, sysdeps/m68k/fpu/s_ceilf.c, sysdeps/m68k/fpu/s_cos.c, sysdeps/m68k/fpu/s_cosf.c, sysdeps/m68k/fpu/s_expm1.c, sysdeps/m68k/fpu/s_expm1f.c, sysdeps/m68k/fpu/s_fabs.c, sysdeps/m68k/fpu/s_fabsf.c, sysdeps/m68k/fpu/s_finite.c, sysdeps/m68k/fpu/s_finitef.c, sysdeps/m68k/fpu/s_floor.c, sysdeps/m68k/fpu/s_floorf.c, sysdeps/m68k/fpu/s_frexp.c, sysdeps/m68k/fpu/s_frexpf.c, sysdeps/m68k/fpu/s_ilogb.c, sysdeps/m68k/fpu/s_ilogbf.c, sysdeps/m68k/fpu/s_isinf.c, sysdeps/m68k/fpu/s_isinff.c, 
sysdeps/m68k/fpu/s_isnan.c, sysdeps/m68k/fpu/s_isnanf.c, sysdeps/m68k/fpu/s_ldexp.c, sysdeps/m68k/fpu/s_ldexpf.c, sysdeps/m68k/fpu/s_log1p.c, sysdeps/m68k/fpu/s_log1pf.c, sysdeps/m68k/fpu/s_logb.c, sysdeps/m68k/fpu/s_logbf.c, sysdeps/m68k/fpu/s_modf.c, sysdeps/m68k/fpu/s_modff.c, sysdeps/m68k/fpu/s_rint.c, sysdeps/m68k/fpu/s_rintf.c, sysdeps/m68k/fpu/s_scalbn.c, sysdeps/m68k/fpu/s_scalbnf.c, sysdeps/m68k/fpu/s_significand.c, sysdeps/m68k/fpu/s_significandf.c, sysdeps/m68k/fpu/s_sin.c, sysdeps/m68k/fpu/s_sinf.c, sysdeps/m68k/fpu/s_tan.c, sysdeps/m68k/fpu/s_tanf.c, sysdeps/m68k/fpu/s_tanh.c, sysdeps/m68k/fpu/s_tanhf.c: New files, for m68881 port of fdlibm. * sysdeps/m68k/fpu/__math.h: Rewritten for fdlibm. * sysdeps/m68k/fpu/isinfl.c: Rewritten to get argument type right. * sysdeps/m68k/fpu/isnanl.c: Likewise. Thu Mar 14 06:01:07 1996 Roland McGrath <roland@charlie-brown.gnu.ai.mit.edu> * posix/glob.c (glob): In GLOB_BRACE brace expansion, fix buffer size calculation to include trailing invariant portion. Don't use alloca; instead use a dynamic auto array for GCC, malloc for non-GCC. Handle nested braces properly. * elf/elf.h (Elf32_auxv_t): Specify prototype (void) for `a_un.a_fcn'. * libc-symbols.h (lint): New macro. Fri Mar 15 01:18:00 1996 Andreas Schwab <schwab@issan.informatik.uni-dortmund.de> * libio/iosetvbuf.c: Add weak alias setvbuf. 1996-03-17 02:58:17 +01:00			`/* Copyright (C) 1996 Free Software Foundation, Inc.`
			`Contributed by David Mosberger (davidm@cs.arizona.edu).`

			`This file is part of the GNU C Library.`

			`The GNU C Library is free software; you can redistribute it and/or`
			`modify it under the terms of the GNU Library General Public License as`
			`published by the Free Software Foundation; either version 2 of the`
			`License, or (at your option) any later version.`

			`The GNU C Library is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`Library General Public License for more details.`

			`You should have received a copy of the GNU Library General Public`
			`License along with the GNU C Library; see the file COPYING.LIB. If`
			`not, write to the Free Software Foundation, Inc., 675 Mass Ave,`
			`Cambridge, MA 02139, USA. */`

			`/* Finds characters in a memory area. Optimized for the Alpha`
			`architecture:`

			`- memory accessed as aligned quadwords only`
			`- uses cmpbge to compare 8 bytes in parallel`
			`- does binary search to find 0 byte in last`
			`quadword (HAKMEM needed 12 instructions to`
			`do this instead of the 9 instructions that`
			`binary search needs).`

			`For correctness consider that:`

			`- only minimum number of quadwords may be accessed`
			`- the third argument is an unsigned long`
			`*/`

			`#include <sysdep.h>`
			`#ifdef __linux__`
			`# include <alpha/regdef.h>`
			`#else`
			`#include <regdef.h>`
			`#endif`

			`.set noreorder`
			`.set noat`

			`ENTRY(memchr)`
			`beq a2, not_found`
			`ldq_u t0, 0(a0) # load first quadword (a0 may be misaligned)`
			`addq a0, a2, t4`
			`and a1, 0xff, a1 # a1 = 00000000000000ch`
			`ldq_u t5, -1(t4)`
			`sll a1, 8, t1 # t1 = 000000000000ch00`
			`cmpult a2, 9, t3`
			`or t1, a1, a1 # a1 = 000000000000chch`
			`sll a1, 16, t1 # t1 = 00000000chch0000`
			`lda t2, -1(zero)`
			`or t1, a1, a1 # a1 = 00000000chchchch`
			`sll a1, 32, t1 # t1 = chchchch00000000`
			`extql t0, a0, t6`
			`or t1, a1, a1 # a1 = chchchchchchchch`

			`beq t3, first_quad`

			`extqh t5, a0, t5`
			`mov a0, v0`
			`or t6, t5, t0 # t0 = quadword starting at a0`

			`#`
			`# Deal with the case where at most 8 bytes remain to be searched`
			`# in t0. E.g.:`
			`# a2 = 6`
			`# t0 = ????c6c5c4c3c2c1`
			`last_quad:`
			`negq a2, t5`
			`srl t2, t5, t5 # t5 = mask of a2 bits set`
			`xor a1, t0, t0`
			`cmpbge zero, t0, t1`
			`and t1, t5, t1`
			`beq t1, not_found`

			`found_it:`
			`# now, determine which byte matched:`
			`negq t1, t2`
			`and t1, t2, t1`

			`and t1, 0x0f, t0`
			`addq v0, 4, t2`
			`cmoveq t0, t2, v0`

			`and t1, 0x33, t0`
			`addq v0, 2, t2`
			`cmoveq t0, t2, v0`

			`and t1, 0x55, t0`
			`addq v0, 1, t2`
			`cmoveq t0, t2, v0`

			`done: ret`


			`#`
			`# Deal with the case where a2 > 8 bytes remain to be`
			`# searched. a0 may not be aligned.`
			`#`
			`first_quad:`
			`andnot a0, 0x7, v0`
			`insqh t2, a0, t1 # t1 = 0000ffffffffffff (a0<0:2> ff bytes)`
			`xor t0, a1, t0`
			`or t0, t1, t0 # t0 = ====ffffffffffff`
			`cmpbge zero, t0, t1`
			`bne t1, found_it`

			`/* at least one byte left to process */`

			`ldq t0, 8(v0)`
			`addq v0, 8, v0`
			`/*`
			`* Make a2 point to last quad to be accessed (the`
			`* last quad may or may not be partial).`
			`*/`
			`subq t4, 1, a2`
			`andnot a2, 0x7, a2`
			`cmpult v0, a2, t1`
			`beq t1, final`

			`/* at least two quads remain to be accessed */`

			`subq a2, v0, t3 # t3 <- number of quads to be processed in loop`
			`and t3, 8, t3 # odd number of quads?`
			`bne t3, odd_quad_count`

			`/* at least three quads remain to be accessed */`

			`mov t0, t3 # move prefetched value into correct register`

			`.align 3`
			`unrolled_loop:`
			`ldq t0, 8(v0) # prefetch t0`
			`xor a1, t3, t1`
			`cmpbge zero, t1, t1`
			`bne t1, found_it`

			`addq v0, 8, v0`
			`odd_quad_count:`
			`xor a1, t0, t1`
			`ldq t3, 8(v0) # prefetch t3`
			`cmpbge zero, t1, t1`
			`bne t1, found_it`

			`addq v0, 8, v0`
			`cmpult v0, a2, t5`
			`bne t5, unrolled_loop`

			`mov t3, t0 # move prefetched value into t0`
			`final: subq t4, v0, a2 # a2 <- number of bytes left to do`
			`bne a2, last_quad`

			`not_found:`
			`mov zero, v0`
			`ret`

			`.end memchr`