3552 lines
62 KiB
ArmAsm
3552 lines
62 KiB
ArmAsm
/* strcpy with SSSE3
|
|
Copyright (C) 2011 Free Software Foundation, Inc.
|
|
Contributed by Intel Corporation.
|
|
This file is part of the GNU C Library.
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with the GNU C Library; if not, see
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
#ifndef NOT_IN_libc
|
|
|
|
# ifndef USE_AS_STRCAT
|
|
# include <sysdep.h>
|
|
|
|
# ifndef STRCPY
|
|
# define STRCPY __strcpy_ssse3
|
|
# endif
|
|
|
|
.section .text.ssse3,"ax",@progbits
|
|
/* STRCPY entry point.
   Register roles established here and used by the code that follows:
     %rcx = source pointer (copied from %rsi)
     %rdx = destination pointer (copied from %rdi)
     %r8  = third argument (copied from %rdx), only when USE_AS_STRNCPY is
            defined; it is used as the byte limit.
   The first 16 source bytes are then probed one at a time for a NUL; a NUL
   at byte index i branches to L(Exit<i+1>).  The L(Exit*) and
   L(StrncpyExit*) handlers are defined outside this section.  */
ENTRY (STRCPY)
|
|
|
|
mov %rsi, %rcx
|
|
# ifdef USE_AS_STRNCPY
|
|
mov %rdx, %r8
|
|
# endif
|
|
mov %rdi, %rdx
|
|
# ifdef USE_AS_STRNCPY
|
|
/* A limit of 0 goes to L(Exit0); limits of 1..8 go to
   L(StrncpyExit8Bytes) (unsigned compare).  */
test %r8, %r8
|
|
jz L(Exit0)
|
|
cmp $8, %r8
|
|
jbe L(StrncpyExit8Bytes)
|
|
# endif
|
|
/* Probe source bytes 0..7.  */
cmpb $0, (%rcx)
|
|
jz L(Exit1)
|
|
cmpb $0, 1(%rcx)
|
|
jz L(Exit2)
|
|
cmpb $0, 2(%rcx)
|
|
jz L(Exit3)
|
|
cmpb $0, 3(%rcx)
|
|
jz L(Exit4)
|
|
cmpb $0, 4(%rcx)
|
|
jz L(Exit5)
|
|
cmpb $0, 5(%rcx)
|
|
jz L(Exit6)
|
|
cmpb $0, 6(%rcx)
|
|
jz L(Exit7)
|
|
cmpb $0, 7(%rcx)
|
|
jz L(Exit8)
|
|
# ifdef USE_AS_STRNCPY
|
|
/* Bytes 0..7 are non-NUL; a limit below 16 (9..15 at this point) goes to
   L(StrncpyExit15Bytes).  */
cmp $16, %r8
|
|
jb L(StrncpyExit15Bytes)
|
|
# endif
|
|
/* Probe source bytes 8..14.  */
cmpb $0, 8(%rcx)
|
|
jz L(Exit9)
|
|
cmpb $0, 9(%rcx)
|
|
jz L(Exit10)
|
|
cmpb $0, 10(%rcx)
|
|
jz L(Exit11)
|
|
cmpb $0, 11(%rcx)
|
|
jz L(Exit12)
|
|
cmpb $0, 12(%rcx)
|
|
jz L(Exit13)
|
|
cmpb $0, 13(%rcx)
|
|
jz L(Exit14)
|
|
cmpb $0, 14(%rcx)
|
|
jz L(Exit15)
|
|
# ifdef USE_AS_STRNCPY
|
|
/* Bytes 0..14 are non-NUL; a limit of exactly 16 goes to L(Exit16)
   without reading byte 15.  */
cmp $16, %r8
|
|
je L(Exit16)
|
|
# endif
|
|
cmpb $0, 15(%rcx)
|
|
jz L(Exit16)
|
|
# endif
|
|
|
|
/* Alignment setup and dispatch.
   1. Copy source bytes 0..15 to the destination with two 8-byte moves.
      (When USE_AS_STRCAT is not defined, the byte probes that precede this
      code have already shown those 16 bytes to be non-NUL.)
   2. Test the first 16-byte-aligned source block strictly above %rcx for
      a NUL; %rsi becomes that block's offset from %rcx (1..16).
   3. Advance %rdx to the next 16-byte boundary and %rcx by the same
      number of bytes.
   4. Branch on (%rcx & 15): 0 -> L(Align16Both), k -> L(Shl<k>).
   With USE_AS_STRNCPY, %r8 is the remaining-length counter; whenever a
   "sub ..., %r8" leaves it zero or below (jbe), control goes to a
   Case2OrCase3 handler defined outside this section.  */
# ifdef USE_AS_STRNCPY
|
|
mov %rcx, %rsi
|
|
sub $16, %r8
|
|
and $0xf, %rsi
|
|
|
|
/* add 16 bytes rcx_offset to r8 */
/* i.e. after the add below, %r8 = limit - 16 + (%rcx & 15).  */
|
|
|
|
add %rsi, %r8
|
|
# endif
|
|
/* %rsi = first 16-byte boundary strictly above %rcx.  */
lea 16(%rcx), %rsi
|
|
and $-16, %rsi
|
|
/* %xmm0 = 0, the comparison value for the NUL search.  */
pxor %xmm0, %xmm0
|
|
mov (%rcx), %r9
|
|
mov %r9, (%rdx)
|
|
/* Each byte of %xmm0 becomes 0xff where the aligned block has a NUL.  */
pcmpeqb (%rsi), %xmm0
|
|
mov 8(%rcx), %r9
|
|
mov %r9, 8(%rdx)
|
|
|
|
/* convert byte mask in xmm0 to bit mask */
|
|
|
|
pmovmskb %xmm0, %rax
|
|
/* %rsi = offset of the tested block from %rcx.  */
sub %rcx, %rsi
|
|
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(CopyFrom1To16BytesCase2OrCase3)
|
|
# endif
|
|
/* NUL in the tested block: L(CopyFrom1To16Bytes) is entered with the bit
   mask in %rax and the block offset in %rsi.  On fall-through the mask is
   zero, so %xmm0 is all-zero again.  */
test %rax, %rax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
/* Round %rdx up to the next 16-byte boundary (always advancing, by
   1..16 bytes); %rax = old %rdx - new %rdx, i.e. minus the advance.  */
mov %rdx, %rax
|
|
lea 16(%rdx), %rdx
|
|
and $-16, %rdx
|
|
sub %rdx, %rax
|
|
|
|
# ifdef USE_AS_STRNCPY
|
|
/* %rsi = (offset of the tested block) - (bytes %rdx advanced) - 1, then
   reduced to bit 31 only (writing %esi clears the upper half of %rsi).
   If that bit is clear, i.e. the value was non-negative, 16 is added back
   to %r8; otherwise the add is skipped.  */
add %rax, %rsi
|
|
lea -1(%rsi), %rsi
|
|
and $1<<31, %esi
|
|
test %rsi, %rsi
|
|
jnz L(ContinueCopy)
|
|
lea 16(%r8), %r8
|
|
|
|
L(ContinueCopy):
|
|
# endif
|
|
/* Advance %rcx by the same number of bytes %rdx advanced (%rax is the
   negated advance), then %rax = %rcx & 15.  */
sub %rax, %rcx
|
|
mov %rcx, %rax
|
|
and $0xf, %rax
|
|
/* mov leaves the flags untouched, so the jz below still sees the result
   of the "and" above.  */
mov $0, %rsi
|
|
|
|
/* case: rcx_offset == rdx_offset */
|
|
|
|
jz L(Align16Both)
|
|
|
|
/* Dispatch on %rax = 1..15.  Values 8..15 are handled at L(ShlHigh8).  */
cmp $8, %rax
|
|
jae L(ShlHigh8)
|
|
cmp $1, %rax
|
|
je L(Shl1)
|
|
cmp $2, %rax
|
|
je L(Shl2)
|
|
cmp $3, %rax
|
|
je L(Shl3)
|
|
cmp $4, %rax
|
|
je L(Shl4)
|
|
cmp $5, %rax
|
|
je L(Shl5)
|
|
cmp $6, %rax
|
|
je L(Shl6)
|
|
jmp L(Shl7)
|
|
|
|
L(ShlHigh8):
|
|
/* Flags are still those of "cmp $8, %rax" (jae does not change them).  */
je L(Shl8)
|
|
cmp $9, %rax
|
|
je L(Shl9)
|
|
cmp $10, %rax
|
|
je L(Shl10)
|
|
cmp $11, %rax
|
|
je L(Shl11)
|
|
cmp $12, %rax
|
|
je L(Shl12)
|
|
cmp $13, %rax
|
|
je L(Shl13)
|
|
cmp $14, %rax
|
|
je L(Shl14)
|
|
jmp L(Shl15)
|
|
|
|
/* L(Align16Both): reached from the dispatch when (%rcx & 15) == 0, with
   %rsi = 0; all loads and stores here are aligned (movaps).
   Six unrolled steps.  Each step loads the next 16-byte source block,
   stores the previously loaded block, tests the new block for NUL and
   advances the running offset %rsi by 16.  A NUL leaves through
   L(CopyFrom1To16Bytes) with the bit mask in %rax and %rsi = offset of the
   block containing it (that block has not been stored yet).  With
   USE_AS_STRNCPY, %r8 reaching zero or below leaves through
   L(CopyFrom1To16BytesCase2OrCase3).  Both targets are defined outside
   this section.  The tail prepares %rcx/%rdx/%r8/%rsi for
   L(Aligned64Loop).  */
L(Align16Both):
|
|
movaps (%rcx), %xmm1
|
|
movaps 16(%rcx), %xmm2
|
|
movaps %xmm1, (%rdx)
|
|
pcmpeqb %xmm2, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rsi), %rsi
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(CopyFrom1To16BytesCase2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
movaps 16(%rcx, %rsi), %xmm3
|
|
movaps %xmm2, (%rdx, %rsi)
|
|
pcmpeqb %xmm3, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rsi), %rsi
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(CopyFrom1To16BytesCase2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
movaps 16(%rcx, %rsi), %xmm4
|
|
movaps %xmm3, (%rdx, %rsi)
|
|
pcmpeqb %xmm4, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rsi), %rsi
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(CopyFrom1To16BytesCase2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
movaps 16(%rcx, %rsi), %xmm1
|
|
movaps %xmm4, (%rdx, %rsi)
|
|
pcmpeqb %xmm1, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rsi), %rsi
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(CopyFrom1To16BytesCase2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
movaps 16(%rcx, %rsi), %xmm2
|
|
movaps %xmm1, (%rdx, %rsi)
|
|
pcmpeqb %xmm2, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rsi), %rsi
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(CopyFrom1To16BytesCase2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
movaps 16(%rcx, %rsi), %xmm3
|
|
movaps %xmm2, (%rdx, %rsi)
|
|
pcmpeqb %xmm3, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rsi), %rsi
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(CopyFrom1To16BytesCase2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
/* No NUL in the first 112 bytes (%rsi = 96 here): store the last block.  */
movaps %xmm3, (%rdx, %rsi)
|
|
/* %rcx = (%rcx + %rsi + 16) rounded down to a 64-byte boundary;
   %rax = old %rcx - new %rcx; %rdx is moved by the same distance.  */
mov %rcx, %rax
|
|
lea 16(%rcx, %rsi), %rcx
|
|
and $-0x40, %rcx
|
|
sub %rcx, %rax
|
|
sub %rax, %rdx
|
|
# ifdef USE_AS_STRNCPY
|
|
/* %r8 = %r8 + 112 + %rax, adjusting the counter for the new %rcx.  */
lea 112(%r8, %rax), %r8
|
|
# endif
|
|
/* %rsi = -64; L(Aligned64Leave) adds 16 per block to it before jumping
   to L(CopyFrom1To16Bytes).  */
mov $-0x40, %rsi
|
|
|
|
/* L(Aligned64Loop): aligned 64-bytes-per-iteration copy.
   Loads four 16-byte blocks from (%rcx) into %xmm4 (copy of block 0),
   %xmm5, %xmm6 (copy of block 2) and %xmm7, reduces them with pminub to a
   byte-wise minimum in %xmm3 and compares that with %xmm0 (expected to be
   all-zero here; it is not written by this loop).  A non-zero mask means
   one of the 64 bytes is NUL and control goes to L(Aligned64Leave) with
   nothing stored for this iteration.  %rcx and %rdx are advanced by 64
   before the test, hence the negative store offsets.  With USE_AS_STRNCPY,
   %r8 reaching zero or below goes to L(StrncpyLeaveCase2OrCase3), which is
   defined outside this section.  */
.p2align 4
|
|
L(Aligned64Loop):
|
|
movaps (%rcx), %xmm2
|
|
movaps %xmm2, %xmm4
|
|
movaps 16(%rcx), %xmm5
|
|
movaps 32(%rcx), %xmm3
|
|
movaps %xmm3, %xmm6
|
|
movaps 48(%rcx), %xmm7
|
|
/* %xmm3 = byte-wise minimum of the four blocks; a zero byte in it means
   at least one block has a NUL at that byte position.  */
pminub %xmm5, %xmm2
|
|
pminub %xmm7, %xmm3
|
|
pminub %xmm2, %xmm3
|
|
pcmpeqb %xmm0, %xmm3
|
|
pmovmskb %xmm3, %rax
|
|
lea 64(%rdx), %rdx
|
|
lea 64(%rcx), %rcx
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $64, %r8
|
|
jbe L(StrncpyLeaveCase2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Aligned64Leave)
|
|
/* No NUL: store the four blocks just behind the advanced %rdx.  */
movaps %xmm4, -64(%rdx)
|
|
movaps %xmm5, -48(%rdx)
|
|
movaps %xmm6, -32(%rdx)
|
|
movaps %xmm7, -16(%rdx)
|
|
jmp L(Aligned64Loop)
|
|
|
|
/* L(Aligned64Leave): L(Aligned64Loop) found a NUL somewhere in the four
   blocks held in %xmm4, %xmm5, %xmm6, %xmm7 (source order), and %rcx/%rdx
   already point 64 bytes past the start of those blocks.  The blocks are
   tested in order; each block found NUL-free is stored and %rsi is
   advanced by 16.  The first block with a NUL is handed to
   L(CopyFrom1To16Bytes) (defined outside this section) with its bit mask
   in %rax.  %rsi is expected to be -64 on entry (set before the loop), so
   it reaches -64/-48/-32/-16 for blocks 0..3.  With USE_AS_STRNCPY, %r8 is
   first increased by 48 and then decreased by 16 per block skipped.  */
L(Aligned64Leave):
|
|
# ifdef USE_AS_STRNCPY
|
|
lea 48(%r8), %r8
|
|
# endif
|
|
pcmpeqb %xmm4, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
pcmpeqb %xmm5, %xmm0
|
|
# ifdef USE_AS_STRNCPY
|
|
lea -16(%r8), %r8
|
|
# endif
|
|
pmovmskb %xmm0, %rax
|
|
movaps %xmm4, -64(%rdx)
|
|
test %rax, %rax
|
|
/* lea does not change the flags, so jnz still tests %rax.  */
lea 16(%rsi), %rsi
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
pcmpeqb %xmm6, %xmm0
|
|
# ifdef USE_AS_STRNCPY
|
|
lea -16(%r8), %r8
|
|
# endif
|
|
pmovmskb %xmm0, %rax
|
|
movaps %xmm5, -48(%rdx)
|
|
test %rax, %rax
|
|
lea 16(%rsi), %rsi
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
/* The first three blocks are NUL-free, so the NUL is in the last one:
   no test, unconditional jump.  */
movaps %xmm6, -32(%rdx)
|
|
pcmpeqb %xmm7, %xmm0
|
|
# ifdef USE_AS_STRNCPY
|
|
lea -16(%r8), %r8
|
|
# endif
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rsi), %rsi
|
|
jmp L(CopyFrom1To16Bytes)
|
|
|
|
/* L(Shl1): the dispatch branches here when (%rcx & 15) == 1, so the source
   is 1 byte past a 16-byte boundary while the stores through %rdx are
   aligned (movaps).  Four unrolled 16-byte steps: test the current aligned
   source block for NUL, store "palignr $1" of the previous and current
   blocks, load the next block.  %xmm1 and %xmm3 alternate as the copy of
   the previous block.  A NUL goes to L(Shl1LoopExit); with USE_AS_STRNCPY,
   %r8 reaching zero or below goes to L(StrncpyExit1Case2OrCase3), defined
   outside this section.  The tail sets up L(Shl1LoopStart).  */
.p2align 4
|
|
L(Shl1):
|
|
movaps -1(%rcx), %xmm1
|
|
movaps 15(%rcx), %xmm2
|
|
/* Also the re-entry point used by L(Shl1LoopStart) when its 64-byte scan
   sees a NUL.  */
L(Shl1Start):
|
|
pcmpeqb %xmm2, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit1Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl1LoopExit)
|
|
|
|
/* %xmm2 = the 16 source bytes starting at %rcx: the upper 15 bytes of the
   previous block followed by the low 1 byte of the current one.  */
palignr $1, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 31(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm1
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit1Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl1LoopExit)
|
|
|
|
palignr $1, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 31(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit1Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl1LoopExit)
|
|
|
|
palignr $1, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 31(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit1Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl1LoopExit)
|
|
|
|
palignr $1, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
/* %rcx = address of the next aligned source block.  */
lea 31(%rcx), %rcx
|
|
lea 16(%rdx), %rdx
|
|
|
|
/* Round that address down to a 64-byte boundary; %rax = bytes backed up
   (0, 16, 32 or 48).  %rdx is moved back by the same amount, and %rcx is
   re-biased so that 15(%rcx) is the 64-byte-aligned block.  */
mov %rcx, %rax
|
|
and $-0x40, %rcx
|
|
sub %rcx, %rax
|
|
lea -15(%rcx), %rcx
|
|
sub %rax, %rdx
|
|
# ifdef USE_AS_STRNCPY
|
|
/* Return the backed-up bytes to the remaining-length counter.  */
add %rax, %r8
|
|
# endif
|
|
/* Previous block for the first loop iteration.  */
movaps -1(%rcx), %xmm1
|
|
|
|
/* 64 bytes loop */
/* Each iteration loads the four aligned source blocks at 15/31/47/63(%rcx)
   and checks all 64 bytes for NUL at once (pminub reduction compared with
   %xmm0, which this loop only reads and expects to be all-zero).
   No NUL: store 64 bytes at (%rdx), each 16-byte store being "palignr $1"
   of two consecutive blocks; %xmm1 carries the last block (saved in %xmm7)
   into the next iteration.
   NUL: jump back to L(Shl1Start) with %xmm1 and %xmm2 still unmodified so
   the 16-byte steps locate it.
   With USE_AS_STRNCPY, %r8 reaching zero or below goes to L(StrncpyLeave1),
   defined outside this section.  */
|
|
.p2align 4
|
|
L(Shl1LoopStart):
|
|
movaps 15(%rcx), %xmm2
|
|
movaps 31(%rcx), %xmm3
|
|
movaps %xmm3, %xmm6
|
|
movaps 47(%rcx), %xmm4
|
|
movaps %xmm4, %xmm7
|
|
movaps 63(%rcx), %xmm5
|
|
pminub %xmm2, %xmm6
|
|
pminub %xmm5, %xmm7
|
|
pminub %xmm6, %xmm7
|
|
pcmpeqb %xmm0, %xmm7
|
|
pmovmskb %xmm7, %rax
|
|
movaps %xmm5, %xmm7
|
|
palignr $1, %xmm4, %xmm5
|
|
test %rax, %rax
|
|
/* palignr does not change the flags, so jnz still tests %rax.  */
palignr $1, %xmm3, %xmm4
|
|
jnz L(Shl1Start)
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $64, %r8
|
|
jbe L(StrncpyLeave1)
|
|
# endif
|
|
palignr $1, %xmm2, %xmm3
|
|
lea 64(%rcx), %rcx
|
|
palignr $1, %xmm1, %xmm2
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm5, 48(%rdx)
|
|
movaps %xmm4, 32(%rdx)
|
|
movaps %xmm3, 16(%rdx)
|
|
movaps %xmm2, (%rdx)
|
|
lea 64(%rdx), %rdx
|
|
jmp L(Shl1LoopStart)
|
|
|
|
/* L(Shl1LoopExit): a NUL was found in the aligned source block at
   15(%rcx).  One unaligned 16-byte move from -1(%rcx) to -1(%rdx) covers
   the 15 bytes at 0..14(%rcx) that precede that block; then %rsi = 15 and
   control continues at L(CopyFrom1To16Bytes), defined outside this
   section (presumably it finishes the copy at offset %rsi using the NUL
   mask in %rax -- confirm).  */
L(Shl1LoopExit):
|
|
movdqu -1(%rcx), %xmm1
|
|
mov $15, %rsi
|
|
movdqu %xmm1, -1(%rdx)
|
|
jmp L(CopyFrom1To16Bytes)
|
|
|
|
/* L(Shl2): the dispatch branches here when (%rcx & 15) == 2, so the source
   is 2 bytes past a 16-byte boundary while the stores through %rdx are
   aligned (movaps).  Four unrolled 16-byte steps: test the current aligned
   source block for NUL, store "palignr $2" of the previous and current
   blocks, load the next block.  %xmm1 and %xmm3 alternate as the copy of
   the previous block.  A NUL goes to L(Shl2LoopExit); with USE_AS_STRNCPY,
   %r8 reaching zero or below goes to L(StrncpyExit2Case2OrCase3), defined
   outside this section.  The tail sets up L(Shl2LoopStart).  */
.p2align 4
|
|
L(Shl2):
|
|
movaps -2(%rcx), %xmm1
|
|
movaps 14(%rcx), %xmm2
|
|
/* Also the re-entry point used by L(Shl2LoopStart) when its 64-byte scan
   sees a NUL.  */
L(Shl2Start):
|
|
pcmpeqb %xmm2, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit2Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl2LoopExit)
|
|
|
|
/* %xmm2 = the 16 source bytes starting at %rcx: the upper 14 bytes of the
   previous block followed by the low 2 bytes of the current one.  */
palignr $2, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 30(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm1
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit2Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl2LoopExit)
|
|
|
|
palignr $2, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 30(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit2Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl2LoopExit)
|
|
|
|
palignr $2, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 30(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit2Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl2LoopExit)
|
|
|
|
palignr $2, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
/* %rcx = address of the next aligned source block.  */
lea 30(%rcx), %rcx
|
|
lea 16(%rdx), %rdx
|
|
|
|
/* Round that address down to a 64-byte boundary; %rax = bytes backed up
   (0, 16, 32 or 48).  %rdx is moved back by the same amount, and %rcx is
   re-biased so that 14(%rcx) is the 64-byte-aligned block.  */
mov %rcx, %rax
|
|
and $-0x40, %rcx
|
|
sub %rcx, %rax
|
|
lea -14(%rcx), %rcx
|
|
sub %rax, %rdx
|
|
# ifdef USE_AS_STRNCPY
|
|
/* Return the backed-up bytes to the remaining-length counter.  */
add %rax, %r8
|
|
# endif
|
|
/* Previous block for the first loop iteration.  */
movaps -2(%rcx), %xmm1
|
|
|
|
/* 64 bytes loop */
/* Each iteration loads the four aligned source blocks at 14/30/46/62(%rcx)
   and checks all 64 bytes for NUL at once (pminub reduction compared with
   %xmm0, which this loop only reads and expects to be all-zero).
   No NUL: store 64 bytes at (%rdx), each 16-byte store being "palignr $2"
   of two consecutive blocks; %xmm1 carries the last block (saved in %xmm7)
   into the next iteration.
   NUL: jump back to L(Shl2Start) with %xmm1 and %xmm2 still unmodified so
   the 16-byte steps locate it.
   With USE_AS_STRNCPY, %r8 reaching zero or below goes to L(StrncpyLeave2),
   defined outside this section.  */
|
|
.p2align 4
|
|
L(Shl2LoopStart):
|
|
movaps 14(%rcx), %xmm2
|
|
movaps 30(%rcx), %xmm3
|
|
movaps %xmm3, %xmm6
|
|
movaps 46(%rcx), %xmm4
|
|
movaps %xmm4, %xmm7
|
|
movaps 62(%rcx), %xmm5
|
|
pminub %xmm2, %xmm6
|
|
pminub %xmm5, %xmm7
|
|
pminub %xmm6, %xmm7
|
|
pcmpeqb %xmm0, %xmm7
|
|
pmovmskb %xmm7, %rax
|
|
movaps %xmm5, %xmm7
|
|
palignr $2, %xmm4, %xmm5
|
|
test %rax, %rax
|
|
/* palignr does not change the flags, so jnz still tests %rax.  */
palignr $2, %xmm3, %xmm4
|
|
jnz L(Shl2Start)
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $64, %r8
|
|
jbe L(StrncpyLeave2)
|
|
# endif
|
|
palignr $2, %xmm2, %xmm3
|
|
lea 64(%rcx), %rcx
|
|
palignr $2, %xmm1, %xmm2
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm5, 48(%rdx)
|
|
movaps %xmm4, 32(%rdx)
|
|
movaps %xmm3, 16(%rdx)
|
|
movaps %xmm2, (%rdx)
|
|
lea 64(%rdx), %rdx
|
|
jmp L(Shl2LoopStart)
|
|
|
|
/* L(Shl2LoopExit): a NUL was found in the aligned source block at
   14(%rcx).  One unaligned 16-byte move from -2(%rcx) to -2(%rdx) covers
   the 14 bytes at 0..13(%rcx) that precede that block; then %rsi = 14 and
   control continues at L(CopyFrom1To16Bytes), defined outside this
   section (presumably it finishes the copy at offset %rsi using the NUL
   mask in %rax -- confirm).  */
L(Shl2LoopExit):
|
|
movdqu -2(%rcx), %xmm1
|
|
mov $14, %rsi
|
|
movdqu %xmm1, -2(%rdx)
|
|
jmp L(CopyFrom1To16Bytes)
|
|
|
|
/* L(Shl3): the dispatch branches here when (%rcx & 15) == 3, so the source
   is 3 bytes past a 16-byte boundary while the stores through %rdx are
   aligned (movaps).  Four unrolled 16-byte steps: test the current aligned
   source block for NUL, store "palignr $3" of the previous and current
   blocks, load the next block.  %xmm1 and %xmm3 alternate as the copy of
   the previous block.  A NUL goes to L(Shl3LoopExit); with USE_AS_STRNCPY,
   %r8 reaching zero or below goes to L(StrncpyExit3Case2OrCase3), defined
   outside this section.  The tail sets up L(Shl3LoopStart).  */
.p2align 4
|
|
L(Shl3):
|
|
movaps -3(%rcx), %xmm1
|
|
movaps 13(%rcx), %xmm2
|
|
/* Also the re-entry point used by L(Shl3LoopStart) when its 64-byte scan
   sees a NUL.  */
L(Shl3Start):
|
|
pcmpeqb %xmm2, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit3Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl3LoopExit)
|
|
|
|
/* %xmm2 = the 16 source bytes starting at %rcx: the upper 13 bytes of the
   previous block followed by the low 3 bytes of the current one.  */
palignr $3, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 29(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm1
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit3Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl3LoopExit)
|
|
|
|
palignr $3, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 29(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit3Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl3LoopExit)
|
|
|
|
palignr $3, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 29(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit3Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl3LoopExit)
|
|
|
|
palignr $3, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
/* %rcx = address of the next aligned source block.  */
lea 29(%rcx), %rcx
|
|
lea 16(%rdx), %rdx
|
|
|
|
/* Round that address down to a 64-byte boundary; %rax = bytes backed up
   (0, 16, 32 or 48).  %rdx is moved back by the same amount, and %rcx is
   re-biased so that 13(%rcx) is the 64-byte-aligned block.  */
mov %rcx, %rax
|
|
and $-0x40, %rcx
|
|
sub %rcx, %rax
|
|
lea -13(%rcx), %rcx
|
|
sub %rax, %rdx
|
|
# ifdef USE_AS_STRNCPY
|
|
/* Return the backed-up bytes to the remaining-length counter.  */
add %rax, %r8
|
|
# endif
|
|
/* Previous block for the first loop iteration.  */
movaps -3(%rcx), %xmm1
|
|
|
|
/* 64 bytes loop */
/* Each iteration loads the four aligned source blocks at 13/29/45/61(%rcx)
   and checks all 64 bytes for NUL at once (pminub reduction compared with
   %xmm0, which this loop only reads and expects to be all-zero).
   No NUL: store 64 bytes at (%rdx), each 16-byte store being "palignr $3"
   of two consecutive blocks; %xmm1 carries the last block (saved in %xmm7)
   into the next iteration.
   NUL: jump back to L(Shl3Start) with %xmm1 and %xmm2 still unmodified so
   the 16-byte steps locate it.
   With USE_AS_STRNCPY, %r8 reaching zero or below goes to L(StrncpyLeave3),
   defined outside this section.  */
|
|
.p2align 4
|
|
L(Shl3LoopStart):
|
|
movaps 13(%rcx), %xmm2
|
|
movaps 29(%rcx), %xmm3
|
|
movaps %xmm3, %xmm6
|
|
movaps 45(%rcx), %xmm4
|
|
movaps %xmm4, %xmm7
|
|
movaps 61(%rcx), %xmm5
|
|
pminub %xmm2, %xmm6
|
|
pminub %xmm5, %xmm7
|
|
pminub %xmm6, %xmm7
|
|
pcmpeqb %xmm0, %xmm7
|
|
pmovmskb %xmm7, %rax
|
|
movaps %xmm5, %xmm7
|
|
palignr $3, %xmm4, %xmm5
|
|
test %rax, %rax
|
|
/* palignr does not change the flags, so jnz still tests %rax.  */
palignr $3, %xmm3, %xmm4
|
|
jnz L(Shl3Start)
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $64, %r8
|
|
jbe L(StrncpyLeave3)
|
|
# endif
|
|
palignr $3, %xmm2, %xmm3
|
|
lea 64(%rcx), %rcx
|
|
palignr $3, %xmm1, %xmm2
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm5, 48(%rdx)
|
|
movaps %xmm4, 32(%rdx)
|
|
movaps %xmm3, 16(%rdx)
|
|
movaps %xmm2, (%rdx)
|
|
lea 64(%rdx), %rdx
|
|
jmp L(Shl3LoopStart)
|
|
|
|
/* L(Shl3LoopExit): a NUL was found in the aligned source block at
   13(%rcx).  One unaligned 16-byte move from -3(%rcx) to -3(%rdx) covers
   the 13 bytes at 0..12(%rcx) that precede that block; then %rsi = 13 and
   control continues at L(CopyFrom1To16Bytes), defined outside this
   section (presumably it finishes the copy at offset %rsi using the NUL
   mask in %rax -- confirm).  */
L(Shl3LoopExit):
|
|
movdqu -3(%rcx), %xmm1
|
|
mov $13, %rsi
|
|
movdqu %xmm1, -3(%rdx)
|
|
jmp L(CopyFrom1To16Bytes)
|
|
|
|
/* L(Shl4): the dispatch branches here when (%rcx & 15) == 4, so the source
   is 4 bytes past a 16-byte boundary while the stores through %rdx are
   aligned (movaps).  Four unrolled 16-byte steps: test the current aligned
   source block for NUL, store "palignr $4" of the previous and current
   blocks, load the next block.  %xmm1 and %xmm3 alternate as the copy of
   the previous block.  A NUL goes to L(Shl4LoopExit); with USE_AS_STRNCPY,
   %r8 reaching zero or below goes to L(StrncpyExit4Case2OrCase3), defined
   outside this section.  The tail sets up L(Shl4LoopStart).  */
.p2align 4
|
|
L(Shl4):
|
|
movaps -4(%rcx), %xmm1
|
|
movaps 12(%rcx), %xmm2
|
|
/* Also the re-entry point used by L(Shl4LoopStart) when its 64-byte scan
   sees a NUL.  */
L(Shl4Start):
|
|
pcmpeqb %xmm2, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit4Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl4LoopExit)
|
|
|
|
/* %xmm2 = the 16 source bytes starting at %rcx: the upper 12 bytes of the
   previous block followed by the low 4 bytes of the current one.  */
palignr $4, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 28(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm1
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit4Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl4LoopExit)
|
|
|
|
palignr $4, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 28(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit4Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl4LoopExit)
|
|
|
|
palignr $4, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 28(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit4Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl4LoopExit)
|
|
|
|
palignr $4, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
/* %rcx = address of the next aligned source block.  */
lea 28(%rcx), %rcx
|
|
lea 16(%rdx), %rdx
|
|
|
|
/* Round that address down to a 64-byte boundary; %rax = bytes backed up
   (0, 16, 32 or 48).  %rdx is moved back by the same amount, and %rcx is
   re-biased so that 12(%rcx) is the 64-byte-aligned block.  */
mov %rcx, %rax
|
|
and $-0x40, %rcx
|
|
sub %rcx, %rax
|
|
lea -12(%rcx), %rcx
|
|
sub %rax, %rdx
|
|
# ifdef USE_AS_STRNCPY
|
|
/* Return the backed-up bytes to the remaining-length counter.  */
add %rax, %r8
|
|
# endif
|
|
/* Previous block for the first loop iteration.  */
movaps -4(%rcx), %xmm1
|
|
|
|
/* 64 bytes loop */
/* Each iteration loads the four aligned source blocks at 12/28/44/60(%rcx)
   and checks all 64 bytes for NUL at once (pminub reduction compared with
   %xmm0, which this loop only reads and expects to be all-zero).
   No NUL: store 64 bytes at (%rdx), each 16-byte store being "palignr $4"
   of two consecutive blocks; %xmm1 carries the last block (saved in %xmm7)
   into the next iteration.
   NUL: jump back to L(Shl4Start) with %xmm1 and %xmm2 still unmodified so
   the 16-byte steps locate it.
   With USE_AS_STRNCPY, %r8 reaching zero or below goes to L(StrncpyLeave4),
   defined outside this section.  */
|
|
.p2align 4
|
|
L(Shl4LoopStart):
|
|
movaps 12(%rcx), %xmm2
|
|
movaps 28(%rcx), %xmm3
|
|
movaps %xmm3, %xmm6
|
|
movaps 44(%rcx), %xmm4
|
|
movaps %xmm4, %xmm7
|
|
movaps 60(%rcx), %xmm5
|
|
pminub %xmm2, %xmm6
|
|
pminub %xmm5, %xmm7
|
|
pminub %xmm6, %xmm7
|
|
pcmpeqb %xmm0, %xmm7
|
|
pmovmskb %xmm7, %rax
|
|
movaps %xmm5, %xmm7
|
|
palignr $4, %xmm4, %xmm5
|
|
test %rax, %rax
|
|
/* palignr does not change the flags, so jnz still tests %rax.  */
palignr $4, %xmm3, %xmm4
|
|
jnz L(Shl4Start)
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $64, %r8
|
|
jbe L(StrncpyLeave4)
|
|
# endif
|
|
palignr $4, %xmm2, %xmm3
|
|
lea 64(%rcx), %rcx
|
|
palignr $4, %xmm1, %xmm2
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm5, 48(%rdx)
|
|
movaps %xmm4, 32(%rdx)
|
|
movaps %xmm3, 16(%rdx)
|
|
movaps %xmm2, (%rdx)
|
|
lea 64(%rdx), %rdx
|
|
jmp L(Shl4LoopStart)
|
|
|
|
/* L(Shl4LoopExit): a NUL was found in the aligned source block at
   12(%rcx).  One unaligned 16-byte move from -4(%rcx) to -4(%rdx) covers
   the 12 bytes at 0..11(%rcx) that precede that block; then %rsi = 12 and
   control continues at L(CopyFrom1To16Bytes), defined outside this
   section (presumably it finishes the copy at offset %rsi using the NUL
   mask in %rax -- confirm).  */
L(Shl4LoopExit):
|
|
movdqu -4(%rcx), %xmm1
|
|
mov $12, %rsi
|
|
movdqu %xmm1, -4(%rdx)
|
|
jmp L(CopyFrom1To16Bytes)
|
|
|
|
/* L(Shl5): the dispatch branches here when (%rcx & 15) == 5, so the source
   is 5 bytes past a 16-byte boundary while the stores through %rdx are
   aligned (movaps).  Four unrolled 16-byte steps: test the current aligned
   source block for NUL, store "palignr $5" of the previous and current
   blocks, load the next block.  %xmm1 and %xmm3 alternate as the copy of
   the previous block.  A NUL goes to L(Shl5LoopExit); with USE_AS_STRNCPY,
   %r8 reaching zero or below goes to L(StrncpyExit5Case2OrCase3), defined
   outside this section.  The tail sets up L(Shl5LoopStart).  */
.p2align 4
|
|
L(Shl5):
|
|
movaps -5(%rcx), %xmm1
|
|
movaps 11(%rcx), %xmm2
|
|
/* Also the re-entry point used by L(Shl5LoopStart) when its 64-byte scan
   sees a NUL.  */
L(Shl5Start):
|
|
pcmpeqb %xmm2, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit5Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl5LoopExit)
|
|
|
|
/* %xmm2 = the 16 source bytes starting at %rcx: the upper 11 bytes of the
   previous block followed by the low 5 bytes of the current one.  */
palignr $5, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 27(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm1
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit5Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl5LoopExit)
|
|
|
|
palignr $5, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 27(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit5Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl5LoopExit)
|
|
|
|
palignr $5, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 27(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit5Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl5LoopExit)
|
|
|
|
palignr $5, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
/* %rcx = address of the next aligned source block.  */
lea 27(%rcx), %rcx
|
|
lea 16(%rdx), %rdx
|
|
|
|
/* Round that address down to a 64-byte boundary; %rax = bytes backed up
   (0, 16, 32 or 48).  %rdx is moved back by the same amount, and %rcx is
   re-biased so that 11(%rcx) is the 64-byte-aligned block.  */
mov %rcx, %rax
|
|
and $-0x40, %rcx
|
|
sub %rcx, %rax
|
|
lea -11(%rcx), %rcx
|
|
sub %rax, %rdx
|
|
# ifdef USE_AS_STRNCPY
|
|
/* Return the backed-up bytes to the remaining-length counter.  */
add %rax, %r8
|
|
# endif
|
|
/* Previous block for the first loop iteration.  */
movaps -5(%rcx), %xmm1
|
|
|
|
/* 64 bytes loop */
/* Each iteration loads the four aligned source blocks at 11/27/43/59(%rcx)
   and checks all 64 bytes for NUL at once (pminub reduction compared with
   %xmm0, which this loop only reads and expects to be all-zero).
   No NUL: store 64 bytes at (%rdx), each 16-byte store being "palignr $5"
   of two consecutive blocks; %xmm1 carries the last block (saved in %xmm7)
   into the next iteration.
   NUL: jump back to L(Shl5Start) with %xmm1 and %xmm2 still unmodified so
   the 16-byte steps locate it.
   With USE_AS_STRNCPY, %r8 reaching zero or below goes to L(StrncpyLeave5),
   defined outside this section.  */
|
|
.p2align 4
|
|
L(Shl5LoopStart):
|
|
movaps 11(%rcx), %xmm2
|
|
movaps 27(%rcx), %xmm3
|
|
movaps %xmm3, %xmm6
|
|
movaps 43(%rcx), %xmm4
|
|
movaps %xmm4, %xmm7
|
|
movaps 59(%rcx), %xmm5
|
|
pminub %xmm2, %xmm6
|
|
pminub %xmm5, %xmm7
|
|
pminub %xmm6, %xmm7
|
|
pcmpeqb %xmm0, %xmm7
|
|
pmovmskb %xmm7, %rax
|
|
movaps %xmm5, %xmm7
|
|
palignr $5, %xmm4, %xmm5
|
|
test %rax, %rax
|
|
/* palignr does not change the flags, so jnz still tests %rax.  */
palignr $5, %xmm3, %xmm4
|
|
jnz L(Shl5Start)
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $64, %r8
|
|
jbe L(StrncpyLeave5)
|
|
# endif
|
|
palignr $5, %xmm2, %xmm3
|
|
lea 64(%rcx), %rcx
|
|
palignr $5, %xmm1, %xmm2
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm5, 48(%rdx)
|
|
movaps %xmm4, 32(%rdx)
|
|
movaps %xmm3, 16(%rdx)
|
|
movaps %xmm2, (%rdx)
|
|
lea 64(%rdx), %rdx
|
|
jmp L(Shl5LoopStart)
|
|
|
|
/* L(Shl5LoopExit): a NUL was found in the aligned source block at
   11(%rcx).  One unaligned 16-byte move from -5(%rcx) to -5(%rdx) covers
   the 11 bytes at 0..10(%rcx) that precede that block; then %rsi = 11 and
   control continues at L(CopyFrom1To16Bytes), defined outside this
   section (presumably it finishes the copy at offset %rsi using the NUL
   mask in %rax -- confirm).  */
L(Shl5LoopExit):
|
|
movdqu -5(%rcx), %xmm1
|
|
mov $11, %rsi
|
|
movdqu %xmm1, -5(%rdx)
|
|
jmp L(CopyFrom1To16Bytes)
|
|
|
|
/* L(Shl6): the dispatch branches here when (%rcx & 15) == 6, so the source
   is 6 bytes past a 16-byte boundary while the stores through %rdx are
   aligned (movaps).  Four unrolled 16-byte steps: test the current aligned
   source block for NUL, store "palignr $6" of the previous and current
   blocks, load the next block.  %xmm1 and %xmm3 alternate as the copy of
   the previous block.  A NUL goes to L(Shl6LoopExit); with USE_AS_STRNCPY,
   %r8 reaching zero or below goes to L(StrncpyExit6Case2OrCase3), defined
   outside this section.  The tail sets up L(Shl6LoopStart).  */
.p2align 4
|
|
L(Shl6):
|
|
movaps -6(%rcx), %xmm1
|
|
movaps 10(%rcx), %xmm2
|
|
/* Also the re-entry point used by L(Shl6LoopStart) when its 64-byte scan
   sees a NUL.  */
L(Shl6Start):
|
|
pcmpeqb %xmm2, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit6Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl6LoopExit)
|
|
|
|
/* %xmm2 = the 16 source bytes starting at %rcx: the upper 10 bytes of the
   previous block followed by the low 6 bytes of the current one.  */
palignr $6, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 26(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm1
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit6Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl6LoopExit)
|
|
|
|
palignr $6, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 26(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit6Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl6LoopExit)
|
|
|
|
palignr $6, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 26(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit6Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl6LoopExit)
|
|
|
|
palignr $6, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
/* %rcx = address of the next aligned source block.  */
lea 26(%rcx), %rcx
|
|
lea 16(%rdx), %rdx
|
|
|
|
/* Round that address down to a 64-byte boundary; %rax = bytes backed up
   (0, 16, 32 or 48).  %rdx is moved back by the same amount, and %rcx is
   re-biased so that 10(%rcx) is the 64-byte-aligned block.  */
mov %rcx, %rax
|
|
and $-0x40, %rcx
|
|
sub %rcx, %rax
|
|
lea -10(%rcx), %rcx
|
|
sub %rax, %rdx
|
|
# ifdef USE_AS_STRNCPY
|
|
/* Return the backed-up bytes to the remaining-length counter.  */
add %rax, %r8
|
|
# endif
|
|
/* Previous block for the first loop iteration.  */
movaps -6(%rcx), %xmm1
|
|
|
|
/* 64 bytes loop */
/* Each iteration loads the four aligned source blocks at 10/26/42/58(%rcx)
   and checks all 64 bytes for NUL at once (pminub reduction compared with
   %xmm0, which this loop only reads and expects to be all-zero).
   No NUL: store 64 bytes at (%rdx), each 16-byte store being "palignr $6"
   of two consecutive blocks; %xmm1 carries the last block (saved in %xmm7)
   into the next iteration.
   NUL: jump back to L(Shl6Start) with %xmm1 and %xmm2 still unmodified so
   the 16-byte steps locate it.
   With USE_AS_STRNCPY, %r8 reaching zero or below goes to L(StrncpyLeave6),
   defined outside this section.  */
|
|
.p2align 4
|
|
L(Shl6LoopStart):
|
|
movaps 10(%rcx), %xmm2
|
|
movaps 26(%rcx), %xmm3
|
|
movaps %xmm3, %xmm6
|
|
movaps 42(%rcx), %xmm4
|
|
movaps %xmm4, %xmm7
|
|
movaps 58(%rcx), %xmm5
|
|
pminub %xmm2, %xmm6
|
|
pminub %xmm5, %xmm7
|
|
pminub %xmm6, %xmm7
|
|
pcmpeqb %xmm0, %xmm7
|
|
pmovmskb %xmm7, %rax
|
|
movaps %xmm5, %xmm7
|
|
palignr $6, %xmm4, %xmm5
|
|
test %rax, %rax
|
|
/* palignr does not change the flags, so jnz still tests %rax.  */
palignr $6, %xmm3, %xmm4
|
|
jnz L(Shl6Start)
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $64, %r8
|
|
jbe L(StrncpyLeave6)
|
|
# endif
|
|
palignr $6, %xmm2, %xmm3
|
|
lea 64(%rcx), %rcx
|
|
palignr $6, %xmm1, %xmm2
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm5, 48(%rdx)
|
|
movaps %xmm4, 32(%rdx)
|
|
movaps %xmm3, 16(%rdx)
|
|
movaps %xmm2, (%rdx)
|
|
lea 64(%rdx), %rdx
|
|
jmp L(Shl6LoopStart)
|
|
|
|
/* L(Shl6LoopExit): a NUL was found in the aligned source block at
   10(%rcx).  The 10 bytes at 0..9(%rcx) that precede that block are copied
   with an 8-byte move at offset 0 and an overlapping 4-byte move at
   offset 6; then %rsi = 10 and control continues at
   L(CopyFrom1To16Bytes), defined outside this section (presumably it
   finishes the copy at offset %rsi using the NUL mask in %rax --
   confirm).  */
L(Shl6LoopExit):
|
|
mov (%rcx), %r9
|
|
/* %esi is only a scratch register here; %rsi is reloaded below.  */
mov 6(%rcx), %esi
|
|
mov %r9, (%rdx)
|
|
mov %esi, 6(%rdx)
|
|
mov $10, %rsi
|
|
jmp L(CopyFrom1To16Bytes)
|
|
|
|
/* L(Shl7): the dispatch jumps here when (%rcx & 15) == 7, so the source
   is 7 bytes past a 16-byte boundary while the stores through %rdx are
   aligned (movaps).  Four unrolled 16-byte steps: test the current aligned
   source block for NUL, store "palignr $7" of the previous and current
   blocks, load the next block.  %xmm1 and %xmm3 alternate as the copy of
   the previous block.  A NUL goes to L(Shl7LoopExit); with USE_AS_STRNCPY,
   %r8 reaching zero or below goes to L(StrncpyExit7Case2OrCase3), defined
   outside this section.  The tail sets up L(Shl7LoopStart).  */
.p2align 4
|
|
L(Shl7):
|
|
movaps -7(%rcx), %xmm1
|
|
movaps 9(%rcx), %xmm2
|
|
/* Also the re-entry point used by L(Shl7LoopStart) when its 64-byte scan
   sees a NUL.  */
L(Shl7Start):
|
|
pcmpeqb %xmm2, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit7Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl7LoopExit)
|
|
|
|
/* %xmm2 = the 16 source bytes starting at %rcx: the upper 9 bytes of the
   previous block followed by the low 7 bytes of the current one.  */
palignr $7, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 25(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm1
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit7Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl7LoopExit)
|
|
|
|
palignr $7, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 25(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit7Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl7LoopExit)
|
|
|
|
palignr $7, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 25(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit7Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl7LoopExit)
|
|
|
|
palignr $7, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
/* %rcx = address of the next aligned source block.  */
lea 25(%rcx), %rcx
|
|
lea 16(%rdx), %rdx
|
|
|
|
/* Round that address down to a 64-byte boundary; %rax = bytes backed up
   (0, 16, 32 or 48).  %rdx is moved back by the same amount, and %rcx is
   re-biased so that 9(%rcx) is the 64-byte-aligned block.  */
mov %rcx, %rax
|
|
and $-0x40, %rcx
|
|
sub %rcx, %rax
|
|
lea -9(%rcx), %rcx
|
|
sub %rax, %rdx
|
|
# ifdef USE_AS_STRNCPY
|
|
/* Return the backed-up bytes to the remaining-length counter.  */
add %rax, %r8
|
|
# endif
|
|
/* Previous block for the first loop iteration.  */
movaps -7(%rcx), %xmm1
|
|
|
|
/* 64 bytes loop */
/* Each iteration loads the four aligned source blocks at 9/25/41/57(%rcx)
   and checks all 64 bytes for NUL at once (pminub reduction compared with
   %xmm0, which this loop only reads and expects to be all-zero).
   No NUL: store 64 bytes at (%rdx), each 16-byte store being "palignr $7"
   of two consecutive blocks; %xmm1 carries the last block (saved in %xmm7)
   into the next iteration.
   NUL: jump back to L(Shl7Start) with %xmm1 and %xmm2 still unmodified so
   the 16-byte steps locate it.
   With USE_AS_STRNCPY, %r8 reaching zero or below goes to L(StrncpyLeave7),
   defined outside this section.  */
|
|
.p2align 4
|
|
L(Shl7LoopStart):
|
|
movaps 9(%rcx), %xmm2
|
|
movaps 25(%rcx), %xmm3
|
|
movaps %xmm3, %xmm6
|
|
movaps 41(%rcx), %xmm4
|
|
movaps %xmm4, %xmm7
|
|
movaps 57(%rcx), %xmm5
|
|
pminub %xmm2, %xmm6
|
|
pminub %xmm5, %xmm7
|
|
pminub %xmm6, %xmm7
|
|
pcmpeqb %xmm0, %xmm7
|
|
pmovmskb %xmm7, %rax
|
|
movaps %xmm5, %xmm7
|
|
palignr $7, %xmm4, %xmm5
|
|
test %rax, %rax
|
|
/* palignr does not change the flags, so jnz still tests %rax.  */
palignr $7, %xmm3, %xmm4
|
|
jnz L(Shl7Start)
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $64, %r8
|
|
jbe L(StrncpyLeave7)
|
|
# endif
|
|
palignr $7, %xmm2, %xmm3
|
|
lea 64(%rcx), %rcx
|
|
palignr $7, %xmm1, %xmm2
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm5, 48(%rdx)
|
|
movaps %xmm4, 32(%rdx)
|
|
movaps %xmm3, 16(%rdx)
|
|
movaps %xmm2, (%rdx)
|
|
lea 64(%rdx), %rdx
|
|
jmp L(Shl7LoopStart)
|
|
|
|
/* L(Shl7LoopExit): a NUL was found in the aligned source block at
   9(%rcx).  The 9 bytes at 0..8(%rcx) that precede that block are copied
   with an 8-byte move at offset 0 and an overlapping 4-byte move at
   offset 5; then %rsi = 9 and control continues at L(CopyFrom1To16Bytes),
   defined outside this section (presumably it finishes the copy at offset
   %rsi using the NUL mask in %rax -- confirm).  */
L(Shl7LoopExit):
|
|
mov (%rcx), %r9
|
|
/* %esi is only a scratch register here; %rsi is reloaded below.  */
mov 5(%rcx), %esi
|
|
mov %r9, (%rdx)
|
|
mov %esi, 5(%rdx)
|
|
mov $9, %rsi
|
|
jmp L(CopyFrom1To16Bytes)
|
|
|
|
/* L(Shl8): the dispatch branches here when (%rcx & 15) == 8, so the source
   is 8 bytes past a 16-byte boundary while the stores through %rdx are
   aligned (movaps).  Four unrolled 16-byte steps: test the current aligned
   source block for NUL, store "palignr $8" of the previous and current
   blocks, load the next block.  %xmm1 and %xmm3 alternate as the copy of
   the previous block.  A NUL goes to L(Shl8LoopExit); with USE_AS_STRNCPY,
   %r8 reaching zero or below goes to L(StrncpyExit8Case2OrCase3), defined
   outside this section.  The tail sets up L(Shl8LoopStart).  */
.p2align 4
|
|
L(Shl8):
|
|
movaps -8(%rcx), %xmm1
|
|
movaps 8(%rcx), %xmm2
|
|
/* Also the re-entry point used by L(Shl8LoopStart) when its 64-byte scan
   sees a NUL.  */
L(Shl8Start):
|
|
pcmpeqb %xmm2, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit8Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl8LoopExit)
|
|
|
|
/* %xmm2 = the 16 source bytes starting at %rcx: the upper 8 bytes of the
   previous block followed by the low 8 bytes of the current one.  */
palignr $8, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 24(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm1
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit8Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl8LoopExit)
|
|
|
|
palignr $8, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 24(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit8Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl8LoopExit)
|
|
|
|
palignr $8, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 24(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit8Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl8LoopExit)
|
|
|
|
palignr $8, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
/* %rcx = address of the next aligned source block.  */
lea 24(%rcx), %rcx
|
|
lea 16(%rdx), %rdx
|
|
|
|
/* Round that address down to a 64-byte boundary; %rax = bytes backed up
   (0, 16, 32 or 48).  %rdx is moved back by the same amount, and %rcx is
   re-biased so that 8(%rcx) is the 64-byte-aligned block.  */
mov %rcx, %rax
|
|
and $-0x40, %rcx
|
|
sub %rcx, %rax
|
|
lea -8(%rcx), %rcx
|
|
sub %rax, %rdx
|
|
# ifdef USE_AS_STRNCPY
|
|
/* Return the backed-up bytes to the remaining-length counter.  */
add %rax, %r8
|
|
# endif
|
|
/* Previous block for the first loop iteration.  */
movaps -8(%rcx), %xmm1
|
|
|
|
/* 64 bytes loop */
/* Each iteration loads the four aligned source blocks at 8/24/40/56(%rcx)
   and checks all 64 bytes for NUL at once (pminub reduction compared with
   %xmm0, which this loop only reads and expects to be all-zero).
   No NUL: store 64 bytes at (%rdx), each 16-byte store being "palignr $8"
   of two consecutive blocks; %xmm1 carries the last block (saved in %xmm7)
   into the next iteration.
   NUL: jump back to L(Shl8Start) with %xmm1 and %xmm2 still unmodified so
   the 16-byte steps locate it.
   With USE_AS_STRNCPY, %r8 reaching zero or below goes to L(StrncpyLeave8),
   defined outside this section.  */
|
|
.p2align 4
|
|
L(Shl8LoopStart):
|
|
movaps 8(%rcx), %xmm2
|
|
movaps 24(%rcx), %xmm3
|
|
movaps %xmm3, %xmm6
|
|
movaps 40(%rcx), %xmm4
|
|
movaps %xmm4, %xmm7
|
|
movaps 56(%rcx), %xmm5
|
|
pminub %xmm2, %xmm6
|
|
pminub %xmm5, %xmm7
|
|
pminub %xmm6, %xmm7
|
|
pcmpeqb %xmm0, %xmm7
|
|
pmovmskb %xmm7, %rax
|
|
movaps %xmm5, %xmm7
|
|
palignr $8, %xmm4, %xmm5
|
|
test %rax, %rax
|
|
/* palignr does not change the flags, so jnz still tests %rax.  */
palignr $8, %xmm3, %xmm4
|
|
jnz L(Shl8Start)
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $64, %r8
|
|
jbe L(StrncpyLeave8)
|
|
# endif
|
|
palignr $8, %xmm2, %xmm3
|
|
lea 64(%rcx), %rcx
|
|
palignr $8, %xmm1, %xmm2
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm5, 48(%rdx)
|
|
movaps %xmm4, 32(%rdx)
|
|
movaps %xmm3, 16(%rdx)
|
|
movaps %xmm2, (%rdx)
|
|
lea 64(%rdx), %rdx
|
|
jmp L(Shl8LoopStart)
|
|
|
|
/* L(Shl8LoopExit): a NUL was found in the aligned source block at
   8(%rcx).  The 8 bytes at 0..7(%rcx) that precede that block are copied
   with a single 8-byte move; then %rsi = 8 and control continues at
   L(CopyFrom1To16Bytes), defined outside this section (presumably it
   finishes the copy at offset %rsi using the NUL mask in %rax --
   confirm).  */
L(Shl8LoopExit):
|
|
mov (%rcx), %r9
|
|
mov $8, %rsi
|
|
mov %r9, (%rdx)
|
|
jmp L(CopyFrom1To16Bytes)
|
|
|
|
/* L(Shl9): the dispatch branches here when (%rcx & 15) == 9, so the source
   is 9 bytes past a 16-byte boundary while the stores through %rdx are
   aligned (movaps).  Four unrolled 16-byte steps: test the current aligned
   source block for NUL, store "palignr $9" of the previous and current
   blocks, load the next block.  %xmm1 and %xmm3 alternate as the copy of
   the previous block.  A NUL goes to L(Shl9LoopExit); with USE_AS_STRNCPY,
   %r8 reaching zero or below goes to L(StrncpyExit9Case2OrCase3), defined
   outside this section.  The tail sets up L(Shl9LoopStart).  */
.p2align 4
|
|
L(Shl9):
|
|
movaps -9(%rcx), %xmm1
|
|
movaps 7(%rcx), %xmm2
|
|
/* Also the re-entry point used by L(Shl9LoopStart) when its 64-byte scan
   sees a NUL.  */
L(Shl9Start):
|
|
pcmpeqb %xmm2, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit9Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl9LoopExit)
|
|
|
|
/* %xmm2 = the 16 source bytes starting at %rcx: the upper 7 bytes of the
   previous block followed by the low 9 bytes of the current one.  */
palignr $9, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 23(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm1
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit9Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl9LoopExit)
|
|
|
|
palignr $9, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 23(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit9Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl9LoopExit)
|
|
|
|
palignr $9, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 23(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit9Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl9LoopExit)
|
|
|
|
palignr $9, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
/* %rcx = address of the next aligned source block.  */
lea 23(%rcx), %rcx
|
|
lea 16(%rdx), %rdx
|
|
|
|
/* Round that address down to a 64-byte boundary; %rax = bytes backed up
   (0, 16, 32 or 48).  %rdx is moved back by the same amount, and %rcx is
   re-biased so that 7(%rcx) is the 64-byte-aligned block.  */
mov %rcx, %rax
|
|
and $-0x40, %rcx
|
|
sub %rcx, %rax
|
|
lea -7(%rcx), %rcx
|
|
sub %rax, %rdx
|
|
# ifdef USE_AS_STRNCPY
|
|
/* Return the backed-up bytes to the remaining-length counter.  */
add %rax, %r8
|
|
# endif
|
|
/* Previous block for the first loop iteration.  */
movaps -9(%rcx), %xmm1
|
|
|
|
/* 64 bytes loop */
/* Each iteration loads the four aligned source blocks at 7/23/39/55(%rcx)
   and checks all 64 bytes for NUL at once (pminub reduction compared with
   %xmm0, which this loop only reads and expects to be all-zero).
   No NUL: store 64 bytes at (%rdx), each 16-byte store being "palignr $9"
   of two consecutive blocks; %xmm1 carries the last block (saved in %xmm7)
   into the next iteration.
   NUL: jump back to L(Shl9Start) with %xmm1 and %xmm2 still unmodified so
   the 16-byte steps locate it.
   With USE_AS_STRNCPY, %r8 reaching zero or below goes to L(StrncpyLeave9),
   defined outside this section.  */
|
|
.p2align 4
|
|
L(Shl9LoopStart):
|
|
movaps 7(%rcx), %xmm2
|
|
movaps 23(%rcx), %xmm3
|
|
movaps %xmm3, %xmm6
|
|
movaps 39(%rcx), %xmm4
|
|
movaps %xmm4, %xmm7
|
|
movaps 55(%rcx), %xmm5
|
|
pminub %xmm2, %xmm6
|
|
pminub %xmm5, %xmm7
|
|
pminub %xmm6, %xmm7
|
|
pcmpeqb %xmm0, %xmm7
|
|
pmovmskb %xmm7, %rax
|
|
movaps %xmm5, %xmm7
|
|
palignr $9, %xmm4, %xmm5
|
|
test %rax, %rax
|
|
/* palignr does not change the flags, so jnz still tests %rax.  */
palignr $9, %xmm3, %xmm4
|
|
jnz L(Shl9Start)
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $64, %r8
|
|
jbe L(StrncpyLeave9)
|
|
# endif
|
|
palignr $9, %xmm2, %xmm3
|
|
lea 64(%rcx), %rcx
|
|
palignr $9, %xmm1, %xmm2
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm5, 48(%rdx)
|
|
movaps %xmm4, 32(%rdx)
|
|
movaps %xmm3, 16(%rdx)
|
|
movaps %xmm2, (%rdx)
|
|
lea 64(%rdx), %rdx
|
|
jmp L(Shl9LoopStart)
|
|
|
|
/* L(Shl9LoopExit): a NUL was found in the aligned source block at
   7(%rcx).  One 8-byte move from -1(%rcx) to -1(%rdx) covers the 7 bytes
   at 0..6(%rcx) that precede that block; then %rsi = 7 and control
   continues at L(CopyFrom1To16Bytes), defined outside this section
   (presumably it finishes the copy at offset %rsi using the NUL mask in
   %rax -- confirm).  */
L(Shl9LoopExit):
|
|
mov -1(%rcx), %r9
|
|
mov $7, %rsi
|
|
mov %r9, -1(%rdx)
|
|
jmp L(CopyFrom1To16Bytes)
|
|
|
|
/* L(Shl10): the dispatch branches here when (%rcx & 15) == 10, so the
   source is 10 bytes past a 16-byte boundary while the stores through %rdx
   are aligned (movaps).  Four unrolled 16-byte steps: test the current
   aligned source block for NUL, store "palignr $10" of the previous and
   current blocks, load the next block.  %xmm1 and %xmm3 alternate as the
   copy of the previous block.  A NUL goes to L(Shl10LoopExit); with
   USE_AS_STRNCPY, %r8 reaching zero or below goes to
   L(StrncpyExit10Case2OrCase3), defined outside this section.  The tail
   sets up L(Shl10LoopStart).  */
.p2align 4
|
|
L(Shl10):
|
|
movaps -10(%rcx), %xmm1
|
|
movaps 6(%rcx), %xmm2
|
|
/* Also the re-entry point used by L(Shl10LoopStart) when its 64-byte scan
   sees a NUL.  */
L(Shl10Start):
|
|
pcmpeqb %xmm2, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit10Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl10LoopExit)
|
|
|
|
/* %xmm2 = the 16 source bytes starting at %rcx: the upper 6 bytes of the
   previous block followed by the low 10 bytes of the current one.  */
palignr $10, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 22(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm1
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit10Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl10LoopExit)
|
|
|
|
palignr $10, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 22(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit10Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl10LoopExit)
|
|
|
|
palignr $10, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 22(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit10Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl10LoopExit)
|
|
|
|
palignr $10, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
/* %rcx = address of the next aligned source block.  */
lea 22(%rcx), %rcx
|
|
lea 16(%rdx), %rdx
|
|
|
|
/* Round that address down to a 64-byte boundary; %rax = bytes backed up
   (0, 16, 32 or 48).  %rdx is moved back by the same amount, and %rcx is
   re-biased so that 6(%rcx) is the 64-byte-aligned block.  */
mov %rcx, %rax
|
|
and $-0x40, %rcx
|
|
sub %rcx, %rax
|
|
lea -6(%rcx), %rcx
|
|
sub %rax, %rdx
|
|
# ifdef USE_AS_STRNCPY
|
|
/* Return the backed-up bytes to the remaining-length counter.  */
add %rax, %r8
|
|
# endif
|
|
/* Previous block for the first loop iteration.  */
movaps -10(%rcx), %xmm1
|
|
|
|
/* 64 bytes loop */
/* Each iteration loads the four aligned source blocks at 6/22/38/54(%rcx)
   and checks all 64 bytes for NUL at once (pminub reduction compared with
   %xmm0, which this loop only reads and expects to be all-zero).
   No NUL: store 64 bytes at (%rdx), each 16-byte store being "palignr $10"
   of two consecutive blocks; %xmm1 carries the last block (saved in %xmm7)
   into the next iteration.
   NUL: jump back to L(Shl10Start) with %xmm1 and %xmm2 still unmodified so
   the 16-byte steps locate it.
   With USE_AS_STRNCPY, %r8 reaching zero or below goes to
   L(StrncpyLeave10), defined outside this section.  */
|
|
.p2align 4
|
|
L(Shl10LoopStart):
|
|
movaps 6(%rcx), %xmm2
|
|
movaps 22(%rcx), %xmm3
|
|
movaps %xmm3, %xmm6
|
|
movaps 38(%rcx), %xmm4
|
|
movaps %xmm4, %xmm7
|
|
movaps 54(%rcx), %xmm5
|
|
pminub %xmm2, %xmm6
|
|
pminub %xmm5, %xmm7
|
|
pminub %xmm6, %xmm7
|
|
pcmpeqb %xmm0, %xmm7
|
|
pmovmskb %xmm7, %rax
|
|
movaps %xmm5, %xmm7
|
|
palignr $10, %xmm4, %xmm5
|
|
test %rax, %rax
|
|
/* palignr does not change the flags, so jnz still tests %rax.  */
palignr $10, %xmm3, %xmm4
|
|
jnz L(Shl10Start)
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $64, %r8
|
|
jbe L(StrncpyLeave10)
|
|
# endif
|
|
palignr $10, %xmm2, %xmm3
|
|
lea 64(%rcx), %rcx
|
|
palignr $10, %xmm1, %xmm2
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm5, 48(%rdx)
|
|
movaps %xmm4, 32(%rdx)
|
|
movaps %xmm3, 16(%rdx)
|
|
movaps %xmm2, (%rdx)
|
|
lea 64(%rdx), %rdx
|
|
jmp L(Shl10LoopStart)
|
|
|
|
/* A NUL was found in the block at 6(%rcx).  Copy the 8 bytes at
   rcx-2 .. rcx+5 (covering the 6 bytes before that block) and set
   rsi = 6, which L(CopyFrom1To16Bytes) adds to rcx and rdx before
   copying the final bytes selected by the mask in rax.  */
L(Shl10LoopExit):
|
|
mov -2(%rcx), %r9
|
|
mov $6, %rsi
|
|
mov %r9, -2(%rdx)
|
|
jmp L(CopyFrom1To16Bytes)
|
|
|
|
.p2align 4
|
|
/* Shl11: copy path for a source pointer that sits 11 bytes past a
   16-byte boundary (the aligned loads at -11(%rcx) and 5(%rcx) require
   this) while the destination in rdx is written with aligned stores.
   rcx = source, rdx = destination, r8 = remaining count (strncpy only).
   xmm0 is compared against each new block; it is presumably all-zero on
   entry (set up outside this view - TODO confirm) and stays all-zero as
   long as pcmpeqb finds no NUL byte.
   Four 16-byte steps are unrolled here, then rcx is rounded to a
   64-byte boundary for the loop that follows.  */
L(Shl11):
|
|
movaps -11(%rcx), %xmm1
|
|
movaps 5(%rcx), %xmm2
|
|
L(Shl11Start):
|
|
/* rax = bitmask of NUL bytes in the block at 5(%rcx).  */
pcmpeqb %xmm2, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit11Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl11LoopExit)
|
|
|
|
/* Join previous block (xmm1) and new block (xmm2), shifted right by 11
   bytes: the 16 source bytes starting at rcx.  */
palignr $11, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 21(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm1
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit11Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl11LoopExit)
|
|
|
|
palignr $11, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 21(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit11Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl11LoopExit)
|
|
|
|
palignr $11, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 21(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit11Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl11LoopExit)
|
|
|
|
palignr $11, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
lea 21(%rcx), %rcx
|
|
lea 16(%rdx), %rdx
|
|
|
|
/* Round rcx down to a 64-byte boundary; rax = number of bytes stepped
   back.  Re-apply the -5 bias so 5(%rcx) is the aligned block, move rdx
   back by the same rax bytes and (strncpy) give them back to r8.  */
mov %rcx, %rax
|
|
and $-0x40, %rcx
|
|
sub %rcx, %rax
|
|
lea -5(%rcx), %rcx
|
|
sub %rax, %rdx
|
|
# ifdef USE_AS_STRNCPY
|
|
add %rax, %r8
|
|
# endif
|
|
/* Reload the block preceding the loop's first block.  */
movaps -11(%rcx), %xmm1
|
|
|
|
/* 64 bytes loop */
|
|
.p2align 4
|
|
/* Main loop for the 11-byte shift: each iteration loads four aligned
   16-byte source blocks, and stores 64 realigned bytes at (%rdx).
   pminub folds the four blocks into xmm7 so one pcmpeqb/pmovmskb detects
   a NUL anywhere in the 64 bytes; on a hit control returns to
   L(Shl11Start) with xmm1/xmm2 untouched to process 16 bytes at a time.
   xmm7 is then reused to carry the last block into xmm1 for the next
   iteration.  */
L(Shl11LoopStart):
|
|
movaps 5(%rcx), %xmm2
|
|
movaps 21(%rcx), %xmm3
|
|
movaps %xmm3, %xmm6
|
|
movaps 37(%rcx), %xmm4
|
|
movaps %xmm4, %xmm7
|
|
movaps 53(%rcx), %xmm5
|
|
pminub %xmm2, %xmm6
|
|
pminub %xmm5, %xmm7
|
|
pminub %xmm6, %xmm7
|
|
pcmpeqb %xmm0, %xmm7
|
|
pmovmskb %xmm7, %rax
|
|
movaps %xmm5, %xmm7
|
|
palignr $11, %xmm4, %xmm5
|
|
test %rax, %rax
|
|
palignr $11, %xmm3, %xmm4
|
|
jnz L(Shl11Start)
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $64, %r8
|
|
jbe L(StrncpyLeave11)
|
|
# endif
|
|
palignr $11, %xmm2, %xmm3
|
|
lea 64(%rcx), %rcx
|
|
palignr $11, %xmm1, %xmm2
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm5, 48(%rdx)
|
|
movaps %xmm4, 32(%rdx)
|
|
movaps %xmm3, 16(%rdx)
|
|
movaps %xmm2, (%rdx)
|
|
lea 64(%rdx), %rdx
|
|
jmp L(Shl11LoopStart)
|
|
|
|
/* A NUL was found in the block at 5(%rcx).  Copy the 8 bytes at
   rcx-3 .. rcx+4 (covering the 5 bytes before that block) and set
   rsi = 5, which L(CopyFrom1To16Bytes) adds to rcx and rdx before
   copying the final bytes selected by the mask in rax.  */
L(Shl11LoopExit):
|
|
mov -3(%rcx), %r9
|
|
mov $5, %rsi
|
|
mov %r9, -3(%rdx)
|
|
jmp L(CopyFrom1To16Bytes)
|
|
|
|
.p2align 4
|
|
/* Shl12: copy path for a source pointer that sits 12 bytes past a
   16-byte boundary (the aligned loads at -12(%rcx) and 4(%rcx) require
   this) while the destination in rdx is written with aligned stores.
   rcx = source, rdx = destination, r8 = remaining count (strncpy only).
   xmm0 is compared against each new block; it is presumably all-zero on
   entry (set up outside this view - TODO confirm) and stays all-zero as
   long as pcmpeqb finds no NUL byte.
   Four 16-byte steps are unrolled here, then rcx is rounded to a
   64-byte boundary for the loop that follows.  */
L(Shl12):
|
|
movaps -12(%rcx), %xmm1
|
|
movaps 4(%rcx), %xmm2
|
|
L(Shl12Start):
|
|
/* rax = bitmask of NUL bytes in the block at 4(%rcx).  */
pcmpeqb %xmm2, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit12Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl12LoopExit)
|
|
|
|
/* Join previous block (xmm1) and new block (xmm2), shifted right by 12
   bytes: the 16 source bytes starting at rcx.  */
palignr $12, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 20(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm1
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit12Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl12LoopExit)
|
|
|
|
palignr $12, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 20(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit12Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl12LoopExit)
|
|
|
|
palignr $12, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 20(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit12Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl12LoopExit)
|
|
|
|
palignr $12, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
lea 20(%rcx), %rcx
|
|
lea 16(%rdx), %rdx
|
|
|
|
/* Round rcx down to a 64-byte boundary; rax = number of bytes stepped
   back.  Re-apply the -4 bias so 4(%rcx) is the aligned block, move rdx
   back by the same rax bytes and (strncpy) give them back to r8.  */
mov %rcx, %rax
|
|
and $-0x40, %rcx
|
|
sub %rcx, %rax
|
|
lea -4(%rcx), %rcx
|
|
sub %rax, %rdx
|
|
# ifdef USE_AS_STRNCPY
|
|
add %rax, %r8
|
|
# endif
|
|
/* Reload the block preceding the loop's first block.  */
movaps -12(%rcx), %xmm1
|
|
|
|
/* 64 bytes loop */
|
|
.p2align 4
|
|
/* Main loop for the 12-byte shift: each iteration loads four aligned
   16-byte source blocks, and stores 64 realigned bytes at (%rdx).
   pminub folds the four blocks into xmm7 so one pcmpeqb/pmovmskb detects
   a NUL anywhere in the 64 bytes; on a hit control returns to
   L(Shl12Start) with xmm1/xmm2 untouched to process 16 bytes at a time.
   xmm7 is then reused to carry the last block into xmm1 for the next
   iteration.  */
L(Shl12LoopStart):
|
|
movaps 4(%rcx), %xmm2
|
|
movaps 20(%rcx), %xmm3
|
|
movaps %xmm3, %xmm6
|
|
movaps 36(%rcx), %xmm4
|
|
movaps %xmm4, %xmm7
|
|
movaps 52(%rcx), %xmm5
|
|
pminub %xmm2, %xmm6
|
|
pminub %xmm5, %xmm7
|
|
pminub %xmm6, %xmm7
|
|
pcmpeqb %xmm0, %xmm7
|
|
pmovmskb %xmm7, %rax
|
|
movaps %xmm5, %xmm7
|
|
palignr $12, %xmm4, %xmm5
|
|
test %rax, %rax
|
|
palignr $12, %xmm3, %xmm4
|
|
jnz L(Shl12Start)
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $64, %r8
|
|
jbe L(StrncpyLeave12)
|
|
# endif
|
|
palignr $12, %xmm2, %xmm3
|
|
lea 64(%rcx), %rcx
|
|
palignr $12, %xmm1, %xmm2
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm5, 48(%rdx)
|
|
movaps %xmm4, 32(%rdx)
|
|
movaps %xmm3, 16(%rdx)
|
|
movaps %xmm2, (%rdx)
|
|
lea 64(%rdx), %rdx
|
|
jmp L(Shl12LoopStart)
|
|
|
|
/* A NUL was found in the block at 4(%rcx).  Copy the 4 bytes at
   rcx .. rcx+3 that precede that block and set rsi = 4, which
   L(CopyFrom1To16Bytes) adds to rcx and rdx before copying the final
   bytes selected by the mask in rax.  */
L(Shl12LoopExit):
|
|
mov (%rcx), %r9d
|
|
mov $4, %rsi
|
|
mov %r9d, (%rdx)
|
|
jmp L(CopyFrom1To16Bytes)
|
|
|
|
.p2align 4
|
|
/* Shl13: copy path for a source pointer that sits 13 bytes past a
   16-byte boundary (the aligned loads at -13(%rcx) and 3(%rcx) require
   this) while the destination in rdx is written with aligned stores.
   rcx = source, rdx = destination, r8 = remaining count (strncpy only).
   xmm0 is compared against each new block; it is presumably all-zero on
   entry (set up outside this view - TODO confirm) and stays all-zero as
   long as pcmpeqb finds no NUL byte.
   Four 16-byte steps are unrolled here, then rcx is rounded to a
   64-byte boundary for the loop that follows.  */
L(Shl13):
|
|
movaps -13(%rcx), %xmm1
|
|
movaps 3(%rcx), %xmm2
|
|
L(Shl13Start):
|
|
/* rax = bitmask of NUL bytes in the block at 3(%rcx).  */
pcmpeqb %xmm2, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit13Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl13LoopExit)
|
|
|
|
/* Join previous block (xmm1) and new block (xmm2), shifted right by 13
   bytes: the 16 source bytes starting at rcx.  */
palignr $13, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 19(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm1
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit13Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl13LoopExit)
|
|
|
|
palignr $13, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 19(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit13Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl13LoopExit)
|
|
|
|
palignr $13, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 19(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit13Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl13LoopExit)
|
|
|
|
palignr $13, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
lea 19(%rcx), %rcx
|
|
lea 16(%rdx), %rdx
|
|
|
|
/* Round rcx down to a 64-byte boundary; rax = number of bytes stepped
   back.  Re-apply the -3 bias so 3(%rcx) is the aligned block, move rdx
   back by the same rax bytes and (strncpy) give them back to r8.  */
mov %rcx, %rax
|
|
and $-0x40, %rcx
|
|
sub %rcx, %rax
|
|
lea -3(%rcx), %rcx
|
|
sub %rax, %rdx
|
|
# ifdef USE_AS_STRNCPY
|
|
add %rax, %r8
|
|
# endif
|
|
/* Reload the block preceding the loop's first block.  */
movaps -13(%rcx), %xmm1
|
|
|
|
/* 64 bytes loop */
|
|
.p2align 4
|
|
/* Main loop for the 13-byte shift: each iteration loads four aligned
   16-byte source blocks, and stores 64 realigned bytes at (%rdx).
   pminub folds the four blocks into xmm7 so one pcmpeqb/pmovmskb detects
   a NUL anywhere in the 64 bytes; on a hit control returns to
   L(Shl13Start) with xmm1/xmm2 untouched to process 16 bytes at a time.
   xmm7 is then reused to carry the last block into xmm1 for the next
   iteration.  */
L(Shl13LoopStart):
|
|
movaps 3(%rcx), %xmm2
|
|
movaps 19(%rcx), %xmm3
|
|
movaps %xmm3, %xmm6
|
|
movaps 35(%rcx), %xmm4
|
|
movaps %xmm4, %xmm7
|
|
movaps 51(%rcx), %xmm5
|
|
pminub %xmm2, %xmm6
|
|
pminub %xmm5, %xmm7
|
|
pminub %xmm6, %xmm7
|
|
pcmpeqb %xmm0, %xmm7
|
|
pmovmskb %xmm7, %rax
|
|
movaps %xmm5, %xmm7
|
|
palignr $13, %xmm4, %xmm5
|
|
test %rax, %rax
|
|
palignr $13, %xmm3, %xmm4
|
|
jnz L(Shl13Start)
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $64, %r8
|
|
jbe L(StrncpyLeave13)
|
|
# endif
|
|
palignr $13, %xmm2, %xmm3
|
|
lea 64(%rcx), %rcx
|
|
palignr $13, %xmm1, %xmm2
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm5, 48(%rdx)
|
|
movaps %xmm4, 32(%rdx)
|
|
movaps %xmm3, 16(%rdx)
|
|
movaps %xmm2, (%rdx)
|
|
lea 64(%rdx), %rdx
|
|
jmp L(Shl13LoopStart)
|
|
|
|
/* A NUL was found in the block at 3(%rcx).  Copy the 4 bytes at
   rcx-1 .. rcx+2 (covering the 3 bytes before that block) and set
   rsi = 3, which L(CopyFrom1To16Bytes) adds to rcx and rdx before
   copying the final bytes selected by the mask in rax.  */
L(Shl13LoopExit):
|
|
mov -1(%rcx), %r9d
|
|
mov $3, %rsi
|
|
mov %r9d, -1(%rdx)
|
|
jmp L(CopyFrom1To16Bytes)
|
|
|
|
.p2align 4
|
|
/* Shl14: copy path for a source pointer that sits 14 bytes past a
   16-byte boundary (the aligned loads at -14(%rcx) and 2(%rcx) require
   this) while the destination in rdx is written with aligned stores.
   rcx = source, rdx = destination, r8 = remaining count (strncpy only).
   xmm0 is compared against each new block; it is presumably all-zero on
   entry (set up outside this view - TODO confirm) and stays all-zero as
   long as pcmpeqb finds no NUL byte.
   Four 16-byte steps are unrolled here, then rcx is rounded to a
   64-byte boundary for the loop that follows.  */
L(Shl14):
|
|
movaps -14(%rcx), %xmm1
|
|
movaps 2(%rcx), %xmm2
|
|
L(Shl14Start):
|
|
/* rax = bitmask of NUL bytes in the block at 2(%rcx).  */
pcmpeqb %xmm2, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit14Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl14LoopExit)
|
|
|
|
/* Join previous block (xmm1) and new block (xmm2), shifted right by 14
   bytes: the 16 source bytes starting at rcx.  */
palignr $14, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 18(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm1
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit14Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl14LoopExit)
|
|
|
|
palignr $14, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 18(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit14Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl14LoopExit)
|
|
|
|
palignr $14, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 18(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit14Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl14LoopExit)
|
|
|
|
palignr $14, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
lea 18(%rcx), %rcx
|
|
lea 16(%rdx), %rdx
|
|
|
|
/* Round rcx down to a 64-byte boundary; rax = number of bytes stepped
   back.  Re-apply the -2 bias so 2(%rcx) is the aligned block, move rdx
   back by the same rax bytes and (strncpy) give them back to r8.  */
mov %rcx, %rax
|
|
and $-0x40, %rcx
|
|
sub %rcx, %rax
|
|
lea -2(%rcx), %rcx
|
|
sub %rax, %rdx
|
|
# ifdef USE_AS_STRNCPY
|
|
add %rax, %r8
|
|
# endif
|
|
/* Reload the block preceding the loop's first block.  */
movaps -14(%rcx), %xmm1
|
|
|
|
/* 64 bytes loop */
|
|
.p2align 4
|
|
/* Main loop for the 14-byte shift: each iteration loads four aligned
   16-byte source blocks, and stores 64 realigned bytes at (%rdx).
   pminub folds the four blocks into xmm7 so one pcmpeqb/pmovmskb detects
   a NUL anywhere in the 64 bytes; on a hit control returns to
   L(Shl14Start) with xmm1/xmm2 untouched to process 16 bytes at a time.
   xmm7 is then reused to carry the last block into xmm1 for the next
   iteration.  */
L(Shl14LoopStart):
|
|
movaps 2(%rcx), %xmm2
|
|
movaps 18(%rcx), %xmm3
|
|
movaps %xmm3, %xmm6
|
|
movaps 34(%rcx), %xmm4
|
|
movaps %xmm4, %xmm7
|
|
movaps 50(%rcx), %xmm5
|
|
pminub %xmm2, %xmm6
|
|
pminub %xmm5, %xmm7
|
|
pminub %xmm6, %xmm7
|
|
pcmpeqb %xmm0, %xmm7
|
|
pmovmskb %xmm7, %rax
|
|
movaps %xmm5, %xmm7
|
|
palignr $14, %xmm4, %xmm5
|
|
test %rax, %rax
|
|
palignr $14, %xmm3, %xmm4
|
|
jnz L(Shl14Start)
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $64, %r8
|
|
jbe L(StrncpyLeave14)
|
|
# endif
|
|
palignr $14, %xmm2, %xmm3
|
|
lea 64(%rcx), %rcx
|
|
palignr $14, %xmm1, %xmm2
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm5, 48(%rdx)
|
|
movaps %xmm4, 32(%rdx)
|
|
movaps %xmm3, 16(%rdx)
|
|
movaps %xmm2, (%rdx)
|
|
lea 64(%rdx), %rdx
|
|
jmp L(Shl14LoopStart)
|
|
|
|
/* A NUL was found in the block at 2(%rcx).  Copy the 4 bytes at
   rcx-2 .. rcx+1 (covering the 2 bytes before that block) and set
   rsi = 2, which L(CopyFrom1To16Bytes) adds to rcx and rdx before
   copying the final bytes selected by the mask in rax.  */
L(Shl14LoopExit):
|
|
mov -2(%rcx), %r9d
|
|
mov $2, %rsi
|
|
mov %r9d, -2(%rdx)
|
|
jmp L(CopyFrom1To16Bytes)
|
|
|
|
.p2align 4
|
|
/* Shl15: copy path for a source pointer that sits 15 bytes past a
   16-byte boundary (the aligned loads at -15(%rcx) and 1(%rcx) require
   this) while the destination in rdx is written with aligned stores.
   rcx = source, rdx = destination, r8 = remaining count (strncpy only).
   xmm0 is compared against each new block; it is presumably all-zero on
   entry (set up outside this view - TODO confirm) and stays all-zero as
   long as pcmpeqb finds no NUL byte.
   Four 16-byte steps are unrolled here, then rcx is rounded to a
   64-byte boundary for the loop that follows.  */
L(Shl15):
|
|
movaps -15(%rcx), %xmm1
|
|
movaps 1(%rcx), %xmm2
|
|
L(Shl15Start):
|
|
/* rax = bitmask of NUL bytes in the block at 1(%rcx).  */
pcmpeqb %xmm2, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit15Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl15LoopExit)
|
|
|
|
/* Join previous block (xmm1) and new block (xmm2), shifted right by 15
   bytes: the 16 source bytes starting at rcx.  */
palignr $15, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 17(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm1
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit15Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl15LoopExit)
|
|
|
|
palignr $15, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 17(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
movaps %xmm2, %xmm3
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit15Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl15LoopExit)
|
|
|
|
palignr $15, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 17(%rcx), %xmm2
|
|
|
|
pcmpeqb %xmm2, %xmm0
|
|
lea 16(%rdx), %rdx
|
|
pmovmskb %xmm0, %rax
|
|
lea 16(%rcx), %rcx
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit15Case2OrCase3)
|
|
# endif
|
|
test %rax, %rax
|
|
jnz L(Shl15LoopExit)
|
|
|
|
palignr $15, %xmm3, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
lea 17(%rcx), %rcx
|
|
lea 16(%rdx), %rdx
|
|
|
|
/* Round rcx down to a 64-byte boundary; rax = number of bytes stepped
   back.  Re-apply the -1 bias so 1(%rcx) is the aligned block, move rdx
   back by the same rax bytes and (strncpy) give them back to r8.  */
mov %rcx, %rax
|
|
and $-0x40, %rcx
|
|
sub %rcx, %rax
|
|
lea -1(%rcx), %rcx
|
|
sub %rax, %rdx
|
|
# ifdef USE_AS_STRNCPY
|
|
add %rax, %r8
|
|
# endif
|
|
/* Reload the block preceding the loop's first block.  */
movaps -15(%rcx), %xmm1
|
|
|
|
/* 64 bytes loop */
|
|
.p2align 4
|
|
/* Main loop for the 15-byte shift: each iteration loads four aligned
   16-byte source blocks, and stores 64 realigned bytes at (%rdx).
   pminub folds the four blocks into xmm7 so one pcmpeqb/pmovmskb detects
   a NUL anywhere in the 64 bytes; on a hit control returns to
   L(Shl15Start) with xmm1/xmm2 untouched to process 16 bytes at a time.
   xmm7 is then reused to carry the last block into xmm1 for the next
   iteration.  */
L(Shl15LoopStart):
|
|
movaps 1(%rcx), %xmm2
|
|
movaps 17(%rcx), %xmm3
|
|
movaps %xmm3, %xmm6
|
|
movaps 33(%rcx), %xmm4
|
|
movaps %xmm4, %xmm7
|
|
movaps 49(%rcx), %xmm5
|
|
pminub %xmm2, %xmm6
|
|
pminub %xmm5, %xmm7
|
|
pminub %xmm6, %xmm7
|
|
pcmpeqb %xmm0, %xmm7
|
|
pmovmskb %xmm7, %rax
|
|
movaps %xmm5, %xmm7
|
|
palignr $15, %xmm4, %xmm5
|
|
test %rax, %rax
|
|
palignr $15, %xmm3, %xmm4
|
|
jnz L(Shl15Start)
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $64, %r8
|
|
jbe L(StrncpyLeave15)
|
|
# endif
|
|
palignr $15, %xmm2, %xmm3
|
|
lea 64(%rcx), %rcx
|
|
palignr $15, %xmm1, %xmm2
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm5, 48(%rdx)
|
|
movaps %xmm4, 32(%rdx)
|
|
movaps %xmm3, 16(%rdx)
|
|
movaps %xmm2, (%rdx)
|
|
lea 64(%rdx), %rdx
|
|
jmp L(Shl15LoopStart)
|
|
|
|
/* A NUL was found in the block at 1(%rcx).  Copy the 4 bytes at
   rcx-3 .. rcx (covering the 1 byte before that block) and set rsi = 1
   for L(CopyFrom1To16Bytes).  The explicit jmp is only assembled for
   USE_AS_STRCAT; otherwise control falls through to the label that
   follows in the file.  */
L(Shl15LoopExit):
|
|
mov -3(%rcx), %r9d
|
|
mov $1, %rsi
|
|
mov %r9d, -3(%rdx)
|
|
# ifdef USE_AS_STRCAT
|
|
jmp L(CopyFrom1To16Bytes)
|
|
# endif
|
|
|
|
# ifndef USE_AS_STRCAT
|
|
|
|
.p2align 4
|
|
/* Final copy when the mask in ax marks a NUL within the next 16 bytes.
   In:  rsi = byte offset to add to both rcx (source) and rdx (dest),
        ax  = pmovmskb mask; bit k set means byte k is NUL,
        r8  = remaining count minus 16 (strncpy only; 16 is added back).
   The lowest set bit of the mask selects L(ExitN), which copies N bytes
   (NUL included).  If none of bits 0-6 in al is set although al is
   non-zero, bit 7 must be set and control falls through to L(Exit8).  */
L(CopyFrom1To16Bytes):
|
|
# ifdef USE_AS_STRNCPY
|
|
add $16, %r8
|
|
# endif
|
|
add %rsi, %rdx
|
|
add %rsi, %rcx
|
|
|
|
/* No NUL in bytes 0-7: examine the high mask byte instead.  */
test %al, %al
|
|
jz L(ExitHigh)
|
|
test $0x01, %al
|
|
jnz L(Exit1)
|
|
test $0x02, %al
|
|
jnz L(Exit2)
|
|
test $0x04, %al
|
|
jnz L(Exit3)
|
|
test $0x08, %al
|
|
jnz L(Exit4)
|
|
test $0x10, %al
|
|
jnz L(Exit5)
|
|
test $0x20, %al
|
|
jnz L(Exit6)
|
|
test $0x40, %al
|
|
jnz L(Exit7)
|
|
|
|
.p2align 4
|
|
/* Copy 8 bytes from (%rcx) to (%rdx) and return.
   Return value: rdx+7 for USE_AS_STPCPY, otherwise rdi (the original
   destination).
   strncpy: r8 -= 8; if bytes remain, rcx = rdx+8 and the rest is zeroed
   by L(StrncpyFillTailWithZero1).  lea leaves the flags of sub intact
   for jnz.  With stpcpy as well, cmpb/sbb advance rax by one unless the
   byte at rax is NUL.  */
L(Exit8):
|
|
mov (%rcx), %rax
|
|
mov %rax, (%rdx)
|
|
# ifdef USE_AS_STPCPY
|
|
lea 7(%rdx), %rax
|
|
# else
|
|
mov %rdi, %rax
|
|
# endif
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $8, %r8
|
|
lea 8(%rdx), %rcx
|
|
jnz L(StrncpyFillTailWithZero1)
|
|
# ifdef USE_AS_STPCPY
|
|
cmpb $1, (%rax)
|
|
sbb $-1, %rax
|
|
# endif
|
|
# endif
|
|
ret
|
|
|
|
.p2align 4
|
|
/* Mask bits 0-7 were clear: dispatch on bits 8-14 (in ah) to
   L(Exit9)..L(Exit15).  If none of them is set, control falls through
   to L(Exit16).  */
L(ExitHigh):
|
|
test $0x01, %ah
|
|
jnz L(Exit9)
|
|
test $0x02, %ah
|
|
jnz L(Exit10)
|
|
test $0x04, %ah
|
|
jnz L(Exit11)
|
|
test $0x08, %ah
|
|
jnz L(Exit12)
|
|
test $0x10, %ah
|
|
jnz L(Exit13)
|
|
test $0x20, %ah
|
|
jnz L(Exit14)
|
|
test $0x40, %ah
|
|
jnz L(Exit15)
|
|
|
|
.p2align 4
|
|
/* Copy 16 bytes from (%rcx) to (%rdx) as two 8-byte moves and return.
   Return value: rdx+15 for USE_AS_STPCPY, otherwise rdi.
   strncpy: r8 -= 16; if bytes remain, rcx = rdx+16 and the rest is
   zeroed by L(StrncpyFillTailWithZero1).  With stpcpy as well, cmpb/sbb
   advance rax by one unless the byte at rax is NUL.  */
L(Exit16):
|
|
mov (%rcx), %rax
|
|
mov %rax, (%rdx)
|
|
mov 8(%rcx), %rax
|
|
mov %rax, 8(%rdx)
|
|
# ifdef USE_AS_STPCPY
|
|
lea 15(%rdx), %rax
|
|
# else
|
|
mov %rdi, %rax
|
|
# endif
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $16, %r8
|
|
lea 16(%rdx), %rcx
|
|
jnz L(StrncpyFillTailWithZero1)
|
|
# ifdef USE_AS_STPCPY
|
|
cmpb $1, (%rax)
|
|
sbb $-1, %rax
|
|
# endif
|
|
# endif
|
|
ret
|
|
|
|
# ifdef USE_AS_STRNCPY
|
|
|
|
.p2align 4
|
|
/* strncpy only: the mask in ax is non-zero and the count runs out in
   this 16-byte step, so copy up to whichever comes first.
   In: rsi = offset to add to rcx/rdx, r8 = remaining count minus 16.
   After the adjustments r8 is the number of bytes still allowed.
   dh is used as scratch: bit 7 of dh (bit 15 of r8-9) is set when
   r8 < 9, and al is OR-ed in, so dh == 0 means "limit is 9 or more and
   no NUL in bytes 0-7" -> L(ExitHighCase2).  rdx is then restored from
   rsi with lea, which leaves the flags of test intact for jz.  */
L(CopyFrom1To16BytesCase2):
|
|
add $16, %r8
|
|
add %rsi, %rcx
|
|
lea (%rsi, %rdx), %rsi
|
|
lea -9(%r8), %rdx
|
|
and $1<<7, %dh
|
|
or %al, %dh
|
|
test %dh, %dh
|
|
lea (%rsi), %rdx
|
|
jz L(ExitHighCase2)
|
|
|
|
/* For N = 1..7: stop at N when the limit equals N or byte N-1 is NUL.  */
cmp $1, %r8
|
|
je L(Exit1)
|
|
test $0x01, %al
|
|
jnz L(Exit1)
|
|
cmp $2, %r8
|
|
je L(Exit2)
|
|
test $0x02, %al
|
|
jnz L(Exit2)
|
|
cmp $3, %r8
|
|
je L(Exit3)
|
|
test $0x04, %al
|
|
jnz L(Exit3)
|
|
cmp $4, %r8
|
|
je L(Exit4)
|
|
test $0x08, %al
|
|
jnz L(Exit4)
|
|
cmp $5, %r8
|
|
je L(Exit5)
|
|
test $0x10, %al
|
|
jnz L(Exit5)
|
|
cmp $6, %r8
|
|
je L(Exit6)
|
|
test $0x20, %al
|
|
jnz L(Exit6)
|
|
cmp $7, %r8
|
|
je L(Exit7)
|
|
test $0x40, %al
|
|
jnz L(Exit7)
|
|
jmp L(Exit8)
|
|
|
|
.p2align 4
|
|
/* strncpy only: same test ladder as the low half of
   L(CopyFrom1To16BytesCase2), for N = 9..15.  Stop at N when the limit
   in r8 equals N or mask bit N-1 (in ah) is set; otherwise copy 16.  */
L(ExitHighCase2):
|
|
cmp $9, %r8
|
|
je L(Exit9)
|
|
test $0x01, %ah
|
|
jnz L(Exit9)
|
|
cmp $10, %r8
|
|
je L(Exit10)
|
|
test $0x02, %ah
|
|
jnz L(Exit10)
|
|
cmp $11, %r8
|
|
je L(Exit11)
|
|
test $0x04, %ah
|
|
jnz L(Exit11)
|
|
cmp $12, %r8
|
|
je L(Exit12)
|
|
test $0x8, %ah
|
|
jnz L(Exit12)
|
|
cmp $13, %r8
|
|
je L(Exit13)
|
|
test $0x10, %ah
|
|
jnz L(Exit13)
|
|
cmp $14, %r8
|
|
je L(Exit14)
|
|
test $0x20, %ah
|
|
jnz L(Exit14)
|
|
cmp $15, %r8
|
|
je L(Exit15)
|
|
test $0x40, %ah
|
|
jnz L(Exit15)
|
|
jmp L(Exit16)
|
|
|
|
/* strncpy only: the count ran out in this step.  A non-zero mask in rax
   selects Case2; otherwise control falls through to
   L(CopyFrom1To16BytesCase3), the next label in the file.  */
L(CopyFrom1To16BytesCase2OrCase3):
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16BytesCase2)
|
|
|
|
.p2align 4
|
|
/* strncpy only: the count ran out and the mask is not consulted, so
   exactly r8 bytes (after adding 16 back) are copied.
   In: rsi = offset to add to rcx/rdx, r8 = remaining count minus 16.
   The compare tree dispatches to L(ExitN) with N = r8; it presumably
   relies on r8 being in 1..16 here (TODO confirm against all callers).
   Each three-way jl/je/jg group always branches, so the sub-labels are
   never reached by fall-through.  */
L(CopyFrom1To16BytesCase3):
|
|
add $16, %r8
|
|
add %rsi, %rdx
|
|
add %rsi, %rcx
|
|
|
|
cmp $16, %r8
|
|
je L(Exit16)
|
|
cmp $8, %r8
|
|
je L(Exit8)
|
|
jg L(More8Case3)
|
|
cmp $4, %r8
|
|
je L(Exit4)
|
|
jg L(More4Case3)
|
|
cmp $2, %r8
|
|
jl L(Exit1)
|
|
je L(Exit2)
|
|
jg L(Exit3)
|
|
L(More8Case3): /* but less than 16 */
|
|
cmp $12, %r8
|
|
je L(Exit12)
|
|
jl L(Less12Case3)
|
|
cmp $14, %r8
|
|
jl L(Exit13)
|
|
je L(Exit14)
|
|
jg L(Exit15)
|
|
L(More4Case3): /* but less than 8 */
|
|
cmp $6, %r8
|
|
jl L(Exit5)
|
|
je L(Exit6)
|
|
jg L(Exit7)
|
|
L(Less12Case3): /* but more than 8 */
|
|
cmp $10, %r8
|
|
jl L(Exit9)
|
|
je L(Exit10)
|
|
jg L(Exit11)
|
|
# endif
|
|
|
|
.p2align 4
|
|
/* Copy 1 byte from (%rcx) to (%rdx) and return.
   Return value: rdx for USE_AS_STPCPY, otherwise rdi (the original
   destination).
   strncpy: r8 -= 1; if bytes remain, rcx = rdx+1 and the rest is zeroed
   by L(StrncpyFillTailWithZero1).  lea leaves the flags of sub intact
   for jnz.  With stpcpy as well, cmpb/sbb advance rax by one unless the
   byte at rax is NUL.  */
L(Exit1):
|
|
movb (%rcx), %al
|
|
movb %al, (%rdx)
|
|
# ifdef USE_AS_STPCPY
|
|
lea (%rdx), %rax
|
|
# else
|
|
mov %rdi, %rax
|
|
# endif
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $1, %r8
|
|
lea 1(%rdx), %rcx
|
|
jnz L(StrncpyFillTailWithZero1)
|
|
# ifdef USE_AS_STPCPY
|
|
cmpb $1, (%rax)
|
|
sbb $-1, %rax
|
|
# endif
|
|
# endif
|
|
ret
|
|
|
|
.p2align 4
|
|
/* Copy 2 bytes from (%rcx) to (%rdx) and return.
   Return value: rdx+1 for USE_AS_STPCPY, otherwise rdi.
   strncpy: r8 -= 2; if bytes remain, rcx = rdx+2 and the rest is zeroed
   by L(StrncpyFillTailWithZero1).  With stpcpy as well, cmpb/sbb advance
   rax by one unless the byte at rax is NUL.  */
L(Exit2):
|
|
movw (%rcx), %ax
|
|
movw %ax, (%rdx)
|
|
# ifdef USE_AS_STPCPY
|
|
lea 1(%rdx), %rax
|
|
# else
|
|
mov %rdi, %rax
|
|
# endif
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $2, %r8
|
|
lea 2(%rdx), %rcx
|
|
jnz L(StrncpyFillTailWithZero1)
|
|
# ifdef USE_AS_STPCPY
|
|
cmpb $1, (%rax)
|
|
sbb $-1, %rax
|
|
# endif
|
|
# endif
|
|
ret
|
|
|
|
.p2align 4
|
|
/* Copy 3 bytes from (%rcx) to (%rdx) (a word plus a byte) and return.
   Return value: rdx+2 for USE_AS_STPCPY, otherwise rdi.
   strncpy: r8 -= 3; if bytes remain, rcx = rdx+3 and the rest is zeroed
   by L(StrncpyFillTailWithZero1).  With stpcpy as well, cmpb/sbb advance
   rax by one unless the byte at rax is NUL.  */
L(Exit3):
|
|
movw (%rcx), %ax
|
|
movw %ax, (%rdx)
|
|
movb 2(%rcx), %al
|
|
movb %al, 2(%rdx)
|
|
# ifdef USE_AS_STPCPY
|
|
lea 2(%rdx), %rax
|
|
# else
|
|
mov %rdi, %rax
|
|
# endif
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $3, %r8
|
|
lea 3(%rdx), %rcx
|
|
jnz L(StrncpyFillTailWithZero1)
|
|
# ifdef USE_AS_STPCPY
|
|
cmpb $1, (%rax)
|
|
sbb $-1, %rax
|
|
# endif
|
|
# endif
|
|
ret
|
|
|
|
.p2align 4
|
|
/* Copy 4 bytes from (%rcx) to (%rdx) and return.
   Return value: rdx+3 for USE_AS_STPCPY, otherwise rdi.
   strncpy: r8 -= 4; if bytes remain, rcx = rdx+4 and the rest is zeroed
   by L(StrncpyFillTailWithZero1).  With stpcpy as well, cmpb/sbb advance
   rax by one unless the byte at rax is NUL.  */
L(Exit4):
|
|
movl (%rcx), %eax
|
|
movl %eax, (%rdx)
|
|
# ifdef USE_AS_STPCPY
|
|
lea 3(%rdx), %rax
|
|
# else
|
|
mov %rdi, %rax
|
|
# endif
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $4, %r8
|
|
lea 4(%rdx), %rcx
|
|
jnz L(StrncpyFillTailWithZero1)
|
|
# ifdef USE_AS_STPCPY
|
|
cmpb $1, (%rax)
|
|
sbb $-1, %rax
|
|
# endif
|
|
# endif
|
|
ret
|
|
|
|
.p2align 4
|
|
/* Copy 5 bytes from (%rcx) to (%rdx) (a dword plus a byte) and return.
   Return value: rdx+4 for USE_AS_STPCPY, otherwise rdi.
   strncpy: r8 -= 5; if bytes remain, rcx = rdx+5 and the rest is zeroed
   by L(StrncpyFillTailWithZero1).  With stpcpy as well, cmpb/sbb advance
   rax by one unless the byte at rax is NUL.  */
L(Exit5):
|
|
movl (%rcx), %eax
|
|
movl %eax, (%rdx)
|
|
movb 4(%rcx), %al
|
|
movb %al, 4(%rdx)
|
|
# ifdef USE_AS_STPCPY
|
|
lea 4(%rdx), %rax
|
|
# else
|
|
mov %rdi, %rax
|
|
# endif
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $5, %r8
|
|
lea 5(%rdx), %rcx
|
|
jnz L(StrncpyFillTailWithZero1)
|
|
# ifdef USE_AS_STPCPY
|
|
cmpb $1, (%rax)
|
|
sbb $-1, %rax
|
|
# endif
|
|
# endif
|
|
ret
|
|
|
|
.p2align 4
|
|
/* Copy 6 bytes from (%rcx) to (%rdx) (a dword plus a word) and return.
   Return value: rdx+5 for USE_AS_STPCPY, otherwise rdi.
   strncpy: r8 -= 6; if bytes remain, rcx = rdx+6 and the rest is zeroed
   by L(StrncpyFillTailWithZero1).  With stpcpy as well, cmpb/sbb advance
   rax by one unless the byte at rax is NUL.  */
L(Exit6):
|
|
movl (%rcx), %eax
|
|
movl %eax, (%rdx)
|
|
movw 4(%rcx), %ax
|
|
movw %ax, 4(%rdx)
|
|
# ifdef USE_AS_STPCPY
|
|
lea 5(%rdx), %rax
|
|
# else
|
|
mov %rdi, %rax
|
|
# endif
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $6, %r8
|
|
lea 6(%rdx), %rcx
|
|
jnz L(StrncpyFillTailWithZero1)
|
|
# ifdef USE_AS_STPCPY
|
|
cmpb $1, (%rax)
|
|
sbb $-1, %rax
|
|
# endif
|
|
# endif
|
|
ret
|
|
|
|
.p2align 4
|
|
/* Copy 7 bytes from (%rcx) to (%rdx) as two overlapping dwords (offsets
   0 and 3) and return.
   Return value: rdx+6 for USE_AS_STPCPY, otherwise rdi.
   strncpy: r8 -= 7; if bytes remain, rcx = rdx+7 and the rest is zeroed
   by L(StrncpyFillTailWithZero1).  With stpcpy as well, cmpb/sbb advance
   rax by one unless the byte at rax is NUL.  */
L(Exit7):
|
|
movl (%rcx), %eax
|
|
movl %eax, (%rdx)
|
|
movl 3(%rcx), %eax
|
|
movl %eax, 3(%rdx)
|
|
# ifdef USE_AS_STPCPY
|
|
lea 6(%rdx), %rax
|
|
# else
|
|
mov %rdi, %rax
|
|
# endif
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $7, %r8
|
|
lea 7(%rdx), %rcx
|
|
jnz L(StrncpyFillTailWithZero1)
|
|
# ifdef USE_AS_STPCPY
|
|
cmpb $1, (%rax)
|
|
sbb $-1, %rax
|
|
# endif
|
|
# endif
|
|
ret
|
|
|
|
.p2align 4
|
|
/* Copy 9 bytes from (%rcx) to (%rdx): a qword at offset 0 and an
   overlapping dword at offset 5; then return.
   Return value: rdx+8 for USE_AS_STPCPY, otherwise rdi.
   strncpy: r8 -= 9; if bytes remain, rcx = rdx+9 and the rest is zeroed
   by L(StrncpyFillTailWithZero1).  With stpcpy as well, cmpb/sbb advance
   rax by one unless the byte at rax is NUL.  */
L(Exit9):
|
|
mov (%rcx), %rax
|
|
mov %rax, (%rdx)
|
|
mov 5(%rcx), %eax
|
|
mov %eax, 5(%rdx)
|
|
# ifdef USE_AS_STPCPY
|
|
lea 8(%rdx), %rax
|
|
# else
|
|
mov %rdi, %rax
|
|
# endif
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $9, %r8
|
|
lea 9(%rdx), %rcx
|
|
jnz L(StrncpyFillTailWithZero1)
|
|
# ifdef USE_AS_STPCPY
|
|
cmpb $1, (%rax)
|
|
sbb $-1, %rax
|
|
# endif
|
|
# endif
|
|
ret
|
|
|
|
.p2align 4
|
|
/* Copy 10 bytes from (%rcx) to (%rdx): a qword at offset 0 and an
   overlapping dword at offset 6; then return.
   Return value: rdx+9 for USE_AS_STPCPY, otherwise rdi.
   strncpy: r8 -= 10; if bytes remain, rcx = rdx+10 and the rest is
   zeroed by L(StrncpyFillTailWithZero1).  With stpcpy as well, cmpb/sbb
   advance rax by one unless the byte at rax is NUL.  */
L(Exit10):
|
|
mov (%rcx), %rax
|
|
mov %rax, (%rdx)
|
|
mov 6(%rcx), %eax
|
|
mov %eax, 6(%rdx)
|
|
# ifdef USE_AS_STPCPY
|
|
lea 9(%rdx), %rax
|
|
# else
|
|
mov %rdi, %rax
|
|
# endif
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $10, %r8
|
|
lea 10(%rdx), %rcx
|
|
jnz L(StrncpyFillTailWithZero1)
|
|
# ifdef USE_AS_STPCPY
|
|
cmpb $1, (%rax)
|
|
sbb $-1, %rax
|
|
# endif
|
|
# endif
|
|
ret
|
|
|
|
.p2align 4
|
|
/* Copy 11 bytes from (%rcx) to (%rdx): a qword at offset 0 and an
   overlapping dword at offset 7; then return.
   Return value: rdx+10 for USE_AS_STPCPY, otherwise rdi.
   strncpy: r8 -= 11; if bytes remain, rcx = rdx+11 and the rest is
   zeroed by L(StrncpyFillTailWithZero1).  With stpcpy as well, cmpb/sbb
   advance rax by one unless the byte at rax is NUL.  */
L(Exit11):
|
|
mov (%rcx), %rax
|
|
mov %rax, (%rdx)
|
|
mov 7(%rcx), %eax
|
|
mov %eax, 7(%rdx)
|
|
# ifdef USE_AS_STPCPY
|
|
lea 10(%rdx), %rax
|
|
# else
|
|
mov %rdi, %rax
|
|
# endif
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $11, %r8
|
|
lea 11(%rdx), %rcx
|
|
jnz L(StrncpyFillTailWithZero1)
|
|
# ifdef USE_AS_STPCPY
|
|
cmpb $1, (%rax)
|
|
sbb $-1, %rax
|
|
# endif
|
|
# endif
|
|
ret
|
|
|
|
.p2align 4
|
|
/* Copy 12 bytes from (%rcx) to (%rdx): a qword at offset 0 and a dword
   at offset 8; then return.
   Return value: rdx+11 for USE_AS_STPCPY, otherwise rdi.
   strncpy: r8 -= 12; if bytes remain, rcx = rdx+12 and the rest is
   zeroed by L(StrncpyFillTailWithZero1).  With stpcpy as well, cmpb/sbb
   advance rax by one unless the byte at rax is NUL.  */
L(Exit12):
|
|
mov (%rcx), %rax
|
|
mov %rax, (%rdx)
|
|
mov 8(%rcx), %eax
|
|
mov %eax, 8(%rdx)
|
|
# ifdef USE_AS_STPCPY
|
|
lea 11(%rdx), %rax
|
|
# else
|
|
mov %rdi, %rax
|
|
# endif
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $12, %r8
|
|
lea 12(%rdx), %rcx
|
|
jnz L(StrncpyFillTailWithZero1)
|
|
# ifdef USE_AS_STPCPY
|
|
cmpb $1, (%rax)
|
|
sbb $-1, %rax
|
|
# endif
|
|
# endif
|
|
ret
|
|
|
|
.p2align 4
|
|
/* Copy 13 bytes from (%rcx) to (%rdx) as two overlapping qwords
   (offsets 0 and 5); then return.
   Return value: rdx+12 for USE_AS_STPCPY, otherwise rdi.
   strncpy: r8 -= 13; if bytes remain, rcx = rdx+13 and the rest is
   zeroed by L(StrncpyFillTailWithZero1).  With stpcpy as well, cmpb/sbb
   advance rax by one unless the byte at rax is NUL.  */
L(Exit13):
|
|
mov (%rcx), %rax
|
|
mov %rax, (%rdx)
|
|
mov 5(%rcx), %rax
|
|
mov %rax, 5(%rdx)
|
|
# ifdef USE_AS_STPCPY
|
|
lea 12(%rdx), %rax
|
|
# else
|
|
mov %rdi, %rax
|
|
# endif
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $13, %r8
|
|
lea 13(%rdx), %rcx
|
|
jnz L(StrncpyFillTailWithZero1)
|
|
# ifdef USE_AS_STPCPY
|
|
cmpb $1, (%rax)
|
|
sbb $-1, %rax
|
|
# endif
|
|
# endif
|
|
ret
|
|
|
|
.p2align 4
|
|
/* Copy 14 bytes from (%rcx) to (%rdx) as two overlapping qwords
   (offsets 0 and 6); then return.
   Return value: rdx+13 for USE_AS_STPCPY, otherwise rdi.
   strncpy: r8 -= 14; if bytes remain, rcx = rdx+14 and the rest is
   zeroed by L(StrncpyFillTailWithZero1).  With stpcpy as well, cmpb/sbb
   advance rax by one unless the byte at rax is NUL.  */
L(Exit14):
|
|
mov (%rcx), %rax
|
|
mov %rax, (%rdx)
|
|
mov 6(%rcx), %rax
|
|
mov %rax, 6(%rdx)
|
|
# ifdef USE_AS_STPCPY
|
|
lea 13(%rdx), %rax
|
|
# else
|
|
mov %rdi, %rax
|
|
# endif
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $14, %r8
|
|
lea 14(%rdx), %rcx
|
|
jnz L(StrncpyFillTailWithZero1)
|
|
# ifdef USE_AS_STPCPY
|
|
cmpb $1, (%rax)
|
|
sbb $-1, %rax
|
|
# endif
|
|
# endif
|
|
ret
|
|
|
|
.p2align 4
|
|
/* Copy 15 bytes from (%rcx) to (%rdx) as two overlapping qwords
   (offsets 0 and 7); then return.
   Return value: rdx+14 for USE_AS_STPCPY, otherwise rdi.
   strncpy: r8 -= 15; if bytes remain, rcx = rdx+15 and the rest is
   zeroed by L(StrncpyFillTailWithZero1).  With stpcpy as well, cmpb/sbb
   advance rax by one unless the byte at rax is NUL.  */
L(Exit15):
|
|
mov (%rcx), %rax
|
|
mov %rax, (%rdx)
|
|
mov 7(%rcx), %rax
|
|
mov %rax, 7(%rdx)
|
|
# ifdef USE_AS_STPCPY
|
|
lea 14(%rdx), %rax
|
|
# else
|
|
mov %rdi, %rax
|
|
# endif
|
|
# ifdef USE_AS_STRNCPY
|
|
sub $15, %r8
|
|
lea 15(%rdx), %rcx
|
|
jnz L(StrncpyFillTailWithZero1)
|
|
# ifdef USE_AS_STPCPY
|
|
cmpb $1, (%rax)
|
|
sbb $-1, %rax
|
|
# endif
|
|
# endif
|
|
ret
|
|
|
|
# ifdef USE_AS_STRNCPY
|
|
.p2align 4
|
|
/* L(Fill0)..L(Fill8): store N bytes taken from rdx at (%rcx) and
   return.  These are reached through L(FillFrom1To16Bytes); rdx has
   been zeroed by L(StrncpyFillTailWithZero1) on that path, so the
   stores write NUL padding.  Odd sizes use two overlapping stores.
   rax is not modified here, so the return value set by the caller path
   is preserved.  */
L(Fill0):
|
|
ret
|
|
|
|
.p2align 4
|
|
L(Fill1):
|
|
movb %dl, (%rcx)
|
|
ret
|
|
|
|
.p2align 4
|
|
L(Fill2):
|
|
movw %dx, (%rcx)
|
|
ret
|
|
|
|
.p2align 4
|
|
L(Fill3):
|
|
movw %dx, (%rcx)
|
|
movb %dl, 2(%rcx)
|
|
ret
|
|
|
|
.p2align 4
|
|
L(Fill4):
|
|
movl %edx, (%rcx)
|
|
ret
|
|
|
|
.p2align 4
|
|
L(Fill5):
|
|
movl %edx, (%rcx)
|
|
movb %dl, 4(%rcx)
|
|
ret
|
|
|
|
.p2align 4
|
|
L(Fill6):
|
|
movl %edx, (%rcx)
|
|
movw %dx, 4(%rcx)
|
|
ret
|
|
|
|
.p2align 4
|
|
L(Fill7):
|
|
/* Two dword stores overlapping at byte 3 cover 7 bytes.  */
movl %edx, (%rcx)
|
|
movl %edx, 3(%rcx)
|
|
ret
|
|
|
|
.p2align 4
|
|
L(Fill8):
|
|
mov %rdx, (%rcx)
|
|
ret
|
|
|
|
.p2align 4
|
|
/* L(Fill9)..L(Fill16): store N bytes taken from rdx at (%rcx) and
   return.  These are reached through L(FillFrom1To16Bytes); rdx has
   been zeroed by L(StrncpyFillTailWithZero1) on that path, so the
   stores write NUL padding.  Each size is a qword at offset 0 plus one
   more store, overlapping where N is not 8 + a power of two.  */
L(Fill9):
|
|
mov %rdx, (%rcx)
|
|
movb %dl, 8(%rcx)
|
|
ret
|
|
|
|
.p2align 4
|
|
L(Fill10):
|
|
mov %rdx, (%rcx)
|
|
movw %dx, 8(%rcx)
|
|
ret
|
|
|
|
.p2align 4
|
|
L(Fill11):
|
|
mov %rdx, (%rcx)
|
|
movl %edx, 7(%rcx)
|
|
ret
|
|
|
|
.p2align 4
|
|
L(Fill12):
|
|
mov %rdx, (%rcx)
|
|
movl %edx, 8(%rcx)
|
|
ret
|
|
|
|
.p2align 4
|
|
L(Fill13):
|
|
mov %rdx, (%rcx)
|
|
mov %rdx, 5(%rcx)
|
|
ret
|
|
|
|
.p2align 4
|
|
L(Fill14):
|
|
mov %rdx, (%rcx)
|
|
mov %rdx, 6(%rcx)
|
|
ret
|
|
|
|
.p2align 4
|
|
L(Fill15):
|
|
mov %rdx, (%rcx)
|
|
mov %rdx, 7(%rcx)
|
|
ret
|
|
|
|
.p2align 4
|
|
L(Fill16):
|
|
mov %rdx, (%rcx)
|
|
mov %rdx, 8(%rcx)
|
|
ret
|
|
|
|
.p2align 4
|
|
/* L(StrncpyFillExit1) undoes a preceding "subtract 16" on r8 and falls
   into L(FillFrom1To16Bytes), which dispatches to L(FillN) with N = r8
   to write the last 0..16 padding bytes at (%rcx).
   The compare tree uses signed branches; r8 is presumably always in
   0..16 here (TODO confirm against all callers).  Each jl/je/jg group
   always branches, so the sub-labels are never entered by
   fall-through.  */
L(StrncpyFillExit1):
|
|
lea 16(%r8), %r8
|
|
L(FillFrom1To16Bytes):
|
|
test %r8, %r8
|
|
jz L(Fill0)
|
|
cmp $16, %r8
|
|
je L(Fill16)
|
|
cmp $8, %r8
|
|
je L(Fill8)
|
|
jg L(FillMore8)
|
|
cmp $4, %r8
|
|
je L(Fill4)
|
|
jg L(FillMore4)
|
|
cmp $2, %r8
|
|
jl L(Fill1)
|
|
je L(Fill2)
|
|
jg L(Fill3)
|
|
L(FillMore8): /* but less than 16 */
|
|
cmp $12, %r8
|
|
je L(Fill12)
|
|
jl L(FillLess12)
|
|
cmp $14, %r8
|
|
jl L(Fill13)
|
|
je L(Fill14)
|
|
jg L(Fill15)
|
|
L(FillMore4): /* but less than 8 */
|
|
cmp $6, %r8
|
|
jl L(Fill5)
|
|
je L(Fill6)
|
|
jg L(Fill7)
|
|
L(FillLess12): /* but more than 8 */
|
|
cmp $10, %r8
|
|
jl L(Fill9)
|
|
je L(Fill10)
|
|
jmp L(Fill11)
|
|
|
|
.p2align 4
|
|
/* strncpy padding: write r8 zero bytes starting at rcx, then return
   through one of the L(FillN) stubs.
   In: rcx = first byte to clear, r8 = number of bytes to clear (> 0 at
   the visible call sites), rax = return value (left untouched).
   Up to 16 bytes go straight to the FillN dispatch.  Otherwise 16 bytes
   are cleared unaligned, rcx is rounded down to a 16-byte boundary (the
   overlap is added back to r8), and aligned 64/32/16-byte stores of the
   zeroed xmm0 handle the bulk before the final 0..16 byte dispatch.  */
L(StrncpyFillTailWithZero1):
|
|
xor %rdx, %rdx
|
|
sub $16, %r8
|
|
jbe L(StrncpyFillExit1)
|
|
|
|
pxor %xmm0, %xmm0
|
|
mov %rdx, (%rcx)
|
|
mov %rdx, 8(%rcx)
|
|
|
|
lea 16(%rcx), %rcx
|
|
|
|
/* rdx = rcx mod 16; step rcx back to the boundary and count those
   bytes again, then re-zero rdx for the FillN stubs.  */
mov %rcx, %rdx
|
|
and $0xf, %rdx
|
|
sub %rdx, %rcx
|
|
add %rdx, %r8
|
|
xor %rdx, %rdx
|
|
sub $64, %r8
|
|
jb L(StrncpyFillLess64)
|
|
|
|
L(StrncpyFillLoopMovdqa):
|
|
movdqa %xmm0, (%rcx)
|
|
movdqa %xmm0, 16(%rcx)
|
|
movdqa %xmm0, 32(%rcx)
|
|
movdqa %xmm0, 48(%rcx)
|
|
lea 64(%rcx), %rcx
|
|
sub $64, %r8
|
|
jae L(StrncpyFillLoopMovdqa)
|
|
|
|
/* Here r8 = bytes left minus 64 (negative).  */
L(StrncpyFillLess64):
|
|
add $32, %r8
|
|
jl L(StrncpyFillLess32)
|
|
movdqa %xmm0, (%rcx)
|
|
movdqa %xmm0, 16(%rcx)
|
|
lea 32(%rcx), %rcx
|
|
sub $16, %r8
|
|
jl L(StrncpyFillExit1)
|
|
movdqa %xmm0, (%rcx)
|
|
lea 16(%rcx), %rcx
|
|
jmp L(FillFrom1To16Bytes)
|
|
|
|
/* Here r8 = bytes left minus 32 (negative).  */
L(StrncpyFillLess32):
|
|
add $16, %r8
|
|
jl L(StrncpyFillExit1)
|
|
movdqa %xmm0, (%rcx)
|
|
lea 16(%rcx), %rcx
|
|
jmp L(FillFrom1To16Bytes)
|
|
|
|
.p2align 4
|
|
/* strncpy with a zero count (the entry code branches here when r8 is
   0): nothing is copied and the destination pointer held in rdx is
   returned.  */
L(Exit0):
|
|
mov %rdx, %rax
|
|
ret
|
|
|
|
.p2align 4
|
|
/* strncpy with a count below 16, entered from the function prologue
   after source bytes 0-7 were found to be non-NUL.
   For N = 9..14: go to L(ExitN) when the count r8 equals N or source
   byte N-1 is NUL.  If neither happens the count must be 15: copy 15
   bytes with two overlapping qwords and return.  No padding is needed
   on that path because the count is exhausted.
   Return value: rdi, or for USE_AS_STPCPY rdx+14, advanced by one
   unless that byte is NUL.  */
L(StrncpyExit15Bytes):
|
|
cmp $9, %r8
|
|
je L(Exit9)
|
|
cmpb $0, 8(%rcx)
|
|
jz L(Exit9)
|
|
cmp $10, %r8
|
|
je L(Exit10)
|
|
cmpb $0, 9(%rcx)
|
|
jz L(Exit10)
|
|
cmp $11, %r8
|
|
je L(Exit11)
|
|
cmpb $0, 10(%rcx)
|
|
jz L(Exit11)
|
|
cmp $12, %r8
|
|
je L(Exit12)
|
|
cmpb $0, 11(%rcx)
|
|
jz L(Exit12)
|
|
cmp $13, %r8
|
|
je L(Exit13)
|
|
cmpb $0, 12(%rcx)
|
|
jz L(Exit13)
|
|
cmp $14, %r8
|
|
je L(Exit14)
|
|
cmpb $0, 13(%rcx)
|
|
jz L(Exit14)
|
|
mov (%rcx), %rax
|
|
mov %rax, (%rdx)
|
|
mov 7(%rcx), %rax
|
|
mov %rax, 7(%rdx)
|
|
# ifdef USE_AS_STPCPY
|
|
lea 14(%rdx), %rax
|
|
cmpb $1, (%rax)
|
|
sbb $-1, %rax
|
|
# else
|
|
mov %rdi, %rax
|
|
# endif
|
|
ret
|
|
|
|
.p2align 4
|
|
/* strncpy with a count of 1..8, entered from the function prologue.
   For N = 1..7: go to L(ExitN) when the count r8 equals N or source
   byte N-1 is NUL.  If neither happens the count must be 8: copy one
   qword and return.  No padding is needed on that path because the
   count is exhausted.
   Return value: rdi, or for USE_AS_STPCPY rdx+7, advanced by one unless
   that byte is NUL.  */
L(StrncpyExit8Bytes):
|
|
cmp $1, %r8
|
|
je L(Exit1)
|
|
cmpb $0, (%rcx)
|
|
jz L(Exit1)
|
|
cmp $2, %r8
|
|
je L(Exit2)
|
|
cmpb $0, 1(%rcx)
|
|
jz L(Exit2)
|
|
cmp $3, %r8
|
|
je L(Exit3)
|
|
cmpb $0, 2(%rcx)
|
|
jz L(Exit3)
|
|
cmp $4, %r8
|
|
je L(Exit4)
|
|
cmpb $0, 3(%rcx)
|
|
jz L(Exit4)
|
|
cmp $5, %r8
|
|
je L(Exit5)
|
|
cmpb $0, 4(%rcx)
|
|
jz L(Exit5)
|
|
cmp $6, %r8
|
|
je L(Exit6)
|
|
cmpb $0, 5(%rcx)
|
|
jz L(Exit6)
|
|
cmp $7, %r8
|
|
je L(Exit7)
|
|
cmpb $0, 6(%rcx)
|
|
jz L(Exit7)
|
|
mov (%rcx), %rax
|
|
mov %rax, (%rdx)
|
|
# ifdef USE_AS_STPCPY
|
|
lea 7(%rdx), %rax
|
|
cmpb $1, (%rax)
|
|
sbb $-1, %rax
|
|
# else
|
|
mov %rdi, %rax
|
|
# endif
|
|
ret
|
|
|
|
# endif
|
|
# endif
|
|
|
|
# ifdef USE_AS_STRNCPY
|
|
.p2align 4
|
|
/* strncpy: exit from a 64-byte loop whose body is outside this view
   (presumably the aligned loop, with its four blocks in xmm4..xmm7 and
   rdx already advanced past them - TODO confirm).  The count in r8 ran
   out within those 64 bytes.  A non-zero mask in rax means a NUL is
   also present (Case2); otherwise fall into Case3.
   Case3: add the 64 back to r8, then store one 16-byte block per step
   at -64/-48/-32(%rdx), advancing the offset in rsi by 16 each time,
   until 16 or fewer bytes remain; L(CopyFrom1To16BytesCase3) copies
   those.  */
L(StrncpyLeaveCase2OrCase3):
|
|
test %rax, %rax
|
|
jnz L(Aligned64LeaveCase2)
|
|
|
|
L(Aligned64LeaveCase3):
|
|
lea 64(%r8), %r8
|
|
sub $16, %r8
|
|
jbe L(CopyFrom1To16BytesCase3)
|
|
movaps %xmm4, -64(%rdx)
|
|
lea 16(%rsi), %rsi
|
|
sub $16, %r8
|
|
jbe L(CopyFrom1To16BytesCase3)
|
|
movaps %xmm5, -48(%rdx)
|
|
lea 16(%rsi), %rsi
|
|
sub $16, %r8
|
|
jbe L(CopyFrom1To16BytesCase3)
|
|
movaps %xmm6, -32(%rdx)
|
|
lea 16(%rsi), %rsi
|
|
/* Keep the "remaining minus 16" convention expected by Case3.  */
lea -16(%r8), %r8
|
|
jmp L(CopyFrom1To16BytesCase3)
|
|
|
|
/* strncpy: a NUL lies somewhere in the four blocks xmm4..xmm7 and the
   count in r8 also ends within them.  Walk the blocks one at a time:
   compute the NUL mask of the current block into rax, and
     - if the count ends in this block, go to
       L(CopyFrom1To16BytesCase2OrCase3);
     - else if the mask is non-zero, go to L(CopyFrom1To16Bytes);
     - else store the block and advance the offset in rsi by 16.
   xmm0 is presumably all-zero on entry and stays so after every
   compare that finds nothing (TODO confirm).  "add $48" turns the
   loop's "remaining minus 64" into "remaining minus 16".  */
L(Aligned64LeaveCase2):
|
|
pcmpeqb %xmm4, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
add $48, %r8
|
|
jle L(CopyFrom1To16BytesCase2OrCase3)
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
pcmpeqb %xmm5, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
movaps %xmm4, -64(%rdx)
|
|
lea 16(%rsi), %rsi
|
|
sub $16, %r8
|
|
jbe L(CopyFrom1To16BytesCase2OrCase3)
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
pcmpeqb %xmm6, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
movaps %xmm5, -48(%rdx)
|
|
lea 16(%rsi), %rsi
|
|
sub $16, %r8
|
|
jbe L(CopyFrom1To16BytesCase2OrCase3)
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16Bytes)
|
|
|
|
/* Last block: the NUL has to be here, so go straight to Case2.  */
pcmpeqb %xmm7, %xmm0
|
|
pmovmskb %xmm0, %rax
|
|
movaps %xmm6, -32(%rdx)
|
|
lea 16(%rsi), %rsi
|
|
lea -16(%r8), %r8
|
|
jmp L(CopyFrom1To16BytesCase2)
|
|
/*--------------------------------------------------*/
|
|
.p2align 4
|
|
/* L(StrncpyExitNCase2OrCase3), N = 1..5 (strncpy only).  The branches
   that target these labels are outside this view; by analogy with the
   visible Shl10..Shl15 paths they are presumably taken from ShlN when
   the count runs out in the current 16-byte step (TODO confirm).
   Each stub copies the 16 bytes at rcx-N to rdx-N with unaligned
   moves, sets rsi = 16-N as the offset for the final copy, and selects
   Case2 (mask in rax non-zero) or Case3.  xmm0 is overwritten.  */
L(StrncpyExit1Case2OrCase3):
|
|
movdqu -1(%rcx), %xmm0
|
|
movdqu %xmm0, -1(%rdx)
|
|
mov $15, %rsi
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16BytesCase2)
|
|
jmp L(CopyFrom1To16BytesCase3)
|
|
|
|
.p2align 4
|
|
L(StrncpyExit2Case2OrCase3):
|
|
movdqu -2(%rcx), %xmm0
|
|
movdqu %xmm0, -2(%rdx)
|
|
mov $14, %rsi
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16BytesCase2)
|
|
jmp L(CopyFrom1To16BytesCase3)
|
|
|
|
.p2align 4
|
|
L(StrncpyExit3Case2OrCase3):
|
|
movdqu -3(%rcx), %xmm0
|
|
movdqu %xmm0, -3(%rdx)
|
|
mov $13, %rsi
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16BytesCase2)
|
|
jmp L(CopyFrom1To16BytesCase3)
|
|
|
|
.p2align 4
|
|
L(StrncpyExit4Case2OrCase3):
|
|
movdqu -4(%rcx), %xmm0
|
|
movdqu %xmm0, -4(%rdx)
|
|
mov $12, %rsi
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16BytesCase2)
|
|
jmp L(CopyFrom1To16BytesCase3)
|
|
|
|
.p2align 4
|
|
L(StrncpyExit5Case2OrCase3):
|
|
movdqu -5(%rcx), %xmm0
|
|
movdqu %xmm0, -5(%rdx)
|
|
mov $11, %rsi
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16BytesCase2)
|
|
jmp L(CopyFrom1To16BytesCase3)
|
|
|
|
.p2align 4
|
|
/* L(StrncpyExitNCase2OrCase3), N = 6..8 (strncpy only).  The branches
   that target these labels are outside this view (presumably in the
   Shl6..Shl8 paths - TODO confirm).
   N = 6 / 7 copy the 10 / 9 bytes at rcx with a qword plus an
   overlapping dword, borrowing rsi as a temporary before loading it
   with the offset (mov does not disturb the flags set by test).
   N = 8 copies one qword.  rsi = 16-N is the offset for the final copy;
   a non-zero mask in rax selects Case2, otherwise Case3.  */
L(StrncpyExit6Case2OrCase3):
|
|
mov (%rcx), %rsi
|
|
mov 6(%rcx), %r9d
|
|
mov %r9d, 6(%rdx)
|
|
mov %rsi, (%rdx)
|
|
test %rax, %rax
|
|
mov $10, %rsi
|
|
jnz L(CopyFrom1To16BytesCase2)
|
|
jmp L(CopyFrom1To16BytesCase3)
|
|
|
|
.p2align 4
|
|
L(StrncpyExit7Case2OrCase3):
|
|
mov (%rcx), %rsi
|
|
mov 5(%rcx), %r9d
|
|
mov %r9d, 5(%rdx)
|
|
mov %rsi, (%rdx)
|
|
test %rax, %rax
|
|
mov $9, %rsi
|
|
jnz L(CopyFrom1To16BytesCase2)
|
|
jmp L(CopyFrom1To16BytesCase3)
|
|
|
|
.p2align 4
|
|
L(StrncpyExit8Case2OrCase3):
|
|
mov (%rcx), %r9
|
|
mov $8, %rsi
|
|
mov %r9, (%rdx)
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16BytesCase2)
|
|
jmp L(CopyFrom1To16BytesCase3)
|
|
|
|
.p2align 4
|
|
/* L(StrncpyExitNCase2OrCase3), N = 9..15 (strncpy only).  N = 10..15
   are reached from the ShlN paths when "sub $16, %r8" hits or passes
   zero; N = 9 is presumably reached the same way from Shl9, which is
   outside this view.
   Each stub copies the 16-N source bytes that precede the next aligned
   block (using one 8- or 4-byte move that may re-copy a few bytes
   before rcx), sets rsi = 16-N as the offset for the final copy, and
   selects Case2 (mask in rax non-zero) or Case3.  */
L(StrncpyExit9Case2OrCase3):
|
|
mov -1(%rcx), %r9
|
|
mov $7, %rsi
|
|
mov %r9, -1(%rdx)
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16BytesCase2)
|
|
jmp L(CopyFrom1To16BytesCase3)
|
|
|
|
.p2align 4
|
|
L(StrncpyExit10Case2OrCase3):
|
|
mov -2(%rcx), %r9
|
|
mov $6, %rsi
|
|
mov %r9, -2(%rdx)
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16BytesCase2)
|
|
jmp L(CopyFrom1To16BytesCase3)
|
|
|
|
.p2align 4
|
|
L(StrncpyExit11Case2OrCase3):
|
|
mov -3(%rcx), %r9
|
|
mov $5, %rsi
|
|
mov %r9, -3(%rdx)
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16BytesCase2)
|
|
jmp L(CopyFrom1To16BytesCase3)
|
|
|
|
.p2align 4
|
|
L(StrncpyExit12Case2OrCase3):
|
|
mov (%rcx), %r9d
|
|
mov $4, %rsi
|
|
mov %r9d, (%rdx)
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16BytesCase2)
|
|
jmp L(CopyFrom1To16BytesCase3)
|
|
|
|
.p2align 4
|
|
L(StrncpyExit13Case2OrCase3):
|
|
mov -1(%rcx), %r9d
|
|
mov $3, %rsi
|
|
mov %r9d, -1(%rdx)
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16BytesCase2)
|
|
jmp L(CopyFrom1To16BytesCase3)
|
|
|
|
.p2align 4
|
|
L(StrncpyExit14Case2OrCase3):
|
|
mov -2(%rcx), %r9d
|
|
mov $2, %rsi
|
|
mov %r9d, -2(%rdx)
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16BytesCase2)
|
|
jmp L(CopyFrom1To16BytesCase3)
|
|
|
|
.p2align 4
|
|
L(StrncpyExit15Case2OrCase3):
|
|
mov -3(%rcx), %r9d
|
|
mov $1, %rsi
|
|
mov %r9d, -3(%rdx)
|
|
test %rax, %rax
|
|
jnz L(CopyFrom1To16BytesCase2)
|
|
jmp L(CopyFrom1To16BytesCase3)
|
|
|
|
.p2align 4
|
|
/* strncpy: exit from the 64-byte loop of the 1-byte-shift path when the
   count runs out.  That loop is outside this view; by analogy with the
   visible Shl10..Shl15 loops, xmm1/xmm2 presumably hold the previous
   and first raw blocks and xmm4/xmm5 are already palignr-combined
   (TODO confirm).  rsi is a running byte offset whose initial value is
   set outside this view.
   "add $48" turns the loop's "remaining minus 64" into "remaining
   minus 16".  Whole 16-byte blocks are stored while more than 16 bytes
   remain, advancing rsi by 16 each time.  */
L(StrncpyLeave1):
|
|
movaps %xmm2, %xmm3
|
|
add $48, %r8
|
|
jle L(StrncpyExit1)
|
|
palignr $1, %xmm1, %xmm2
|
|
movaps %xmm2, (%rdx)
|
|
movaps 31(%rcx), %xmm2
|
|
lea 16(%rsi), %rsi
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit1)
|
|
palignr $1, %xmm3, %xmm2
|
|
movaps %xmm2, 16(%rdx)
|
|
lea 16(%rsi), %rsi
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit1)
|
|
movaps %xmm4, 32(%rdx)
|
|
lea 16(%rsi), %rsi
|
|
sub $16, %r8
|
|
jbe L(StrncpyExit1)
|
|
movaps %xmm5, 48(%rdx)
|
|
lea 16(%rsi), %rsi
|
|
lea -16(%r8), %r8
|
|
|
|
/* Advance rdx and rcx by rsi+15, copy the 15 bytes just below the new
   pointers with two overlapping qwords, clear rsi and let
   L(CopyFrom1To16BytesCase3) copy the last r8+16 bytes.  */
L(StrncpyExit1):
|
|
lea 15(%rdx, %rsi), %rdx
|
|
lea 15(%rcx, %rsi), %rcx
|
|
mov -15(%rcx), %rsi
|
|
mov -8(%rcx), %rax
|
|
mov %rsi, -15(%rdx)
|
|
mov %rax, -8(%rdx)
|
|
xor %rsi, %rsi
|
|
jmp L(CopyFrom1To16BytesCase3)
|
|
|
|
/* Tail of the palignr-by-2 strncpy path; falls through into
   L(StrncpyExit2).  Adds 48 to %r8 and, for as long as the checks on %r8
   pass, issues up to four more aligned 16-byte stores at
   (%rdx)..48(%rdx), adding 16 to %rsi after each one.
   NOTE(review): %xmm4 and %xmm5 are stored unmodified, so the code that
   jumps here presumably prepared them already; confirm at the call
   site.  */
	.p2align 4
L(StrncpyLeave2):
	movaps	%xmm2, %xmm3		/* Keep %xmm2; the first palignr overwrites it.  */
	addq	$48, %r8
	jle	L(StrncpyExit2)		/* Signed check: result <= 0.  */
	palignr	$2, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 2 bytes.  */
	movaps	%xmm2, (%rdx)
	movaps	30(%rcx), %xmm2		/* movaps requires a 16-byte aligned address.  */
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit2)		/* Unsigned check: borrow or zero.  */
	palignr	$2, %xmm3, %xmm2	/* Merge the new block with the saved one.  */
	movaps	%xmm2, 16(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit2)
	movaps	%xmm4, 32(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit2)
	movaps	%xmm5, 48(%rdx)
	leaq	-16(%r8), %r8		/* lea adjusts the count without touching flags.  */
	leaq	16(%rsi), %rsi

/* Advance both pointers by %rsi + 14, copy the 14 bytes just below the new
   pointers with two overlapping 8-byte moves, clear %rsi and finish in
   L(CopyFrom1To16BytesCase3).  */
L(StrncpyExit2):
	leaq	14(%rcx, %rsi), %rcx
	leaq	14(%rdx, %rsi), %rdx
	movq	-14(%rcx), %rsi
	movq	-8(%rcx), %rax
	movq	%rsi, -14(%rdx)
	movq	%rax, -8(%rdx)
	xorq	%rsi, %rsi
	jmp	L(CopyFrom1To16BytesCase3)
|
|
|
|
/* Tail of the palignr-by-3 strncpy path; falls through into
   L(StrncpyExit3).  Adds 48 to %r8 and, for as long as the checks on %r8
   pass, issues up to four more aligned 16-byte stores at
   (%rdx)..48(%rdx), adding 16 to %rsi after each one.
   NOTE(review): %xmm4 and %xmm5 are stored unmodified, so the code that
   jumps here presumably prepared them already; confirm at the call
   site.  */
	.p2align 4
L(StrncpyLeave3):
	movaps	%xmm2, %xmm3		/* Keep %xmm2; the first palignr overwrites it.  */
	addq	$48, %r8
	jle	L(StrncpyExit3)		/* Signed check: result <= 0.  */
	palignr	$3, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 3 bytes.  */
	movaps	%xmm2, (%rdx)
	movaps	29(%rcx), %xmm2		/* movaps requires a 16-byte aligned address.  */
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit3)		/* Unsigned check: borrow or zero.  */
	palignr	$3, %xmm3, %xmm2	/* Merge the new block with the saved one.  */
	movaps	%xmm2, 16(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit3)
	movaps	%xmm4, 32(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit3)
	movaps	%xmm5, 48(%rdx)
	leaq	-16(%r8), %r8		/* lea adjusts the count without touching flags.  */
	leaq	16(%rsi), %rsi

/* Advance both pointers by %rsi + 13, copy the 13 bytes just below the new
   pointers with two overlapping 8-byte moves, clear %rsi and finish in
   L(CopyFrom1To16BytesCase3).  */
L(StrncpyExit3):
	leaq	13(%rcx, %rsi), %rcx
	leaq	13(%rdx, %rsi), %rdx
	movq	-13(%rcx), %rsi
	movq	-8(%rcx), %rax
	movq	%rsi, -13(%rdx)
	movq	%rax, -8(%rdx)
	xorq	%rsi, %rsi
	jmp	L(CopyFrom1To16BytesCase3)
|
|
|
|
/* Tail of the palignr-by-4 strncpy path; falls through into
   L(StrncpyExit4).  Adds 48 to %r8 and, for as long as the checks on %r8
   pass, issues up to four more aligned 16-byte stores at
   (%rdx)..48(%rdx), adding 16 to %rsi after each one.
   NOTE(review): %xmm4 and %xmm5 are stored unmodified, so the code that
   jumps here presumably prepared them already; confirm at the call
   site.  */
	.p2align 4
L(StrncpyLeave4):
	movaps	%xmm2, %xmm3		/* Keep %xmm2; the first palignr overwrites it.  */
	addq	$48, %r8
	jle	L(StrncpyExit4)		/* Signed check: result <= 0.  */
	palignr	$4, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 4 bytes.  */
	movaps	%xmm2, (%rdx)
	movaps	28(%rcx), %xmm2		/* movaps requires a 16-byte aligned address.  */
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit4)		/* Unsigned check: borrow or zero.  */
	palignr	$4, %xmm3, %xmm2	/* Merge the new block with the saved one.  */
	movaps	%xmm2, 16(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit4)
	movaps	%xmm4, 32(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit4)
	movaps	%xmm5, 48(%rdx)
	leaq	-16(%r8), %r8		/* lea adjusts the count without touching flags.  */
	leaq	16(%rsi), %rsi

/* Advance both pointers by %rsi + 12, copy the 12 bytes just below the new
   pointers as an 8-byte move followed by a 4-byte move, clear %rsi and
   finish in L(CopyFrom1To16BytesCase3).  */
L(StrncpyExit4):
	leaq	12(%rcx, %rsi), %rcx
	leaq	12(%rdx, %rsi), %rdx
	movq	-12(%rcx), %rsi
	movl	-4(%rcx), %eax
	movq	%rsi, -12(%rdx)
	movl	%eax, -4(%rdx)
	xorq	%rsi, %rsi
	jmp	L(CopyFrom1To16BytesCase3)
|
|
|
|
/* Tail of the palignr-by-5 strncpy path; falls through into
   L(StrncpyExit5).  Adds 48 to %r8 and, for as long as the checks on %r8
   pass, issues up to four more aligned 16-byte stores at
   (%rdx)..48(%rdx), adding 16 to %rsi after each one.
   NOTE(review): %xmm4 and %xmm5 are stored unmodified, so the code that
   jumps here presumably prepared them already; confirm at the call
   site.  */
	.p2align 4
L(StrncpyLeave5):
	movaps	%xmm2, %xmm3		/* Keep %xmm2; the first palignr overwrites it.  */
	addq	$48, %r8
	jle	L(StrncpyExit5)		/* Signed check: result <= 0.  */
	palignr	$5, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 5 bytes.  */
	movaps	%xmm2, (%rdx)
	movaps	27(%rcx), %xmm2		/* movaps requires a 16-byte aligned address.  */
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit5)		/* Unsigned check: borrow or zero.  */
	palignr	$5, %xmm3, %xmm2	/* Merge the new block with the saved one.  */
	movaps	%xmm2, 16(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit5)
	movaps	%xmm4, 32(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit5)
	movaps	%xmm5, 48(%rdx)
	leaq	-16(%r8), %r8		/* lea adjusts the count without touching flags.  */
	leaq	16(%rsi), %rsi

/* Advance both pointers by %rsi + 11, copy the 11 bytes just below the new
   pointers as an 8-byte move plus an overlapping 4-byte move, clear %rsi
   and finish in L(CopyFrom1To16BytesCase3).  */
L(StrncpyExit5):
	leaq	11(%rcx, %rsi), %rcx
	leaq	11(%rdx, %rsi), %rdx
	movq	-11(%rcx), %rsi
	movl	-4(%rcx), %eax
	movq	%rsi, -11(%rdx)
	movl	%eax, -4(%rdx)
	xorq	%rsi, %rsi
	jmp	L(CopyFrom1To16BytesCase3)
|
|
|
|
/* Tail of the palignr-by-6 strncpy path; falls through into
   L(StrncpyExit6).  Adds 48 to %r8 and, for as long as the checks on %r8
   pass, issues up to four more aligned 16-byte stores at
   (%rdx)..48(%rdx), adding 16 to %rsi after each one.
   NOTE(review): %xmm4 and %xmm5 are stored unmodified, so the code that
   jumps here presumably prepared them already; confirm at the call
   site.  */
	.p2align 4
L(StrncpyLeave6):
	movaps	%xmm2, %xmm3		/* Keep %xmm2; the first palignr overwrites it.  */
	addq	$48, %r8
	jle	L(StrncpyExit6)		/* Signed check: result <= 0.  */
	palignr	$6, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 6 bytes.  */
	movaps	%xmm2, (%rdx)
	movaps	26(%rcx), %xmm2		/* movaps requires a 16-byte aligned address.  */
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit6)		/* Unsigned check: borrow or zero.  */
	palignr	$6, %xmm3, %xmm2	/* Merge the new block with the saved one.  */
	movaps	%xmm2, 16(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit6)
	movaps	%xmm4, 32(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit6)
	movaps	%xmm5, 48(%rdx)
	leaq	-16(%r8), %r8		/* lea adjusts the count without touching flags.  */
	leaq	16(%rsi), %rsi

/* Advance both pointers by %rsi + 10, copy the 10 bytes just below the new
   pointers as an 8-byte move followed by a 2-byte move, clear %rsi and
   finish in L(CopyFrom1To16BytesCase3).  */
L(StrncpyExit6):
	leaq	10(%rcx, %rsi), %rcx
	leaq	10(%rdx, %rsi), %rdx
	movq	-10(%rcx), %rsi
	movw	-2(%rcx), %ax
	movq	%rsi, -10(%rdx)
	movw	%ax, -2(%rdx)
	xorq	%rsi, %rsi
	jmp	L(CopyFrom1To16BytesCase3)
|
|
|
|
/* Tail of the palignr-by-7 strncpy path; falls through into
   L(StrncpyExit7).  Adds 48 to %r8 and, for as long as the checks on %r8
   pass, issues up to four more aligned 16-byte stores at
   (%rdx)..48(%rdx), adding 16 to %rsi after each one.
   NOTE(review): %xmm4 and %xmm5 are stored unmodified, so the code that
   jumps here presumably prepared them already; confirm at the call
   site.  */
	.p2align 4
L(StrncpyLeave7):
	movaps	%xmm2, %xmm3		/* Keep %xmm2; the first palignr overwrites it.  */
	addq	$48, %r8
	jle	L(StrncpyExit7)		/* Signed check: result <= 0.  */
	palignr	$7, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 7 bytes.  */
	movaps	%xmm2, (%rdx)
	movaps	25(%rcx), %xmm2		/* movaps requires a 16-byte aligned address.  */
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit7)		/* Unsigned check: borrow or zero.  */
	palignr	$7, %xmm3, %xmm2	/* Merge the new block with the saved one.  */
	movaps	%xmm2, 16(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit7)
	movaps	%xmm4, 32(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit7)
	movaps	%xmm5, 48(%rdx)
	leaq	-16(%r8), %r8		/* lea adjusts the count without touching flags.  */
	leaq	16(%rsi), %rsi

/* Advance both pointers by %rsi + 9, copy the 9 bytes just below the new
   pointers as an 8-byte move followed by a single byte, clear %rsi and
   finish in L(CopyFrom1To16BytesCase3).  */
L(StrncpyExit7):
	leaq	9(%rcx, %rsi), %rcx
	leaq	9(%rdx, %rsi), %rdx
	movq	-9(%rcx), %rsi
	movb	-1(%rcx), %ah		/* Byte travels via %ah: only bits 8-15 of %rax change.  */
	movq	%rsi, -9(%rdx)
	movb	%ah, -1(%rdx)
	xorq	%rsi, %rsi
	jmp	L(CopyFrom1To16BytesCase3)
|
|
|
|
/* Tail of the palignr-by-8 strncpy path; falls through into
   L(StrncpyExit8).  Adds 48 to %r8 and, for as long as the checks on %r8
   pass, issues up to four more aligned 16-byte stores at
   (%rdx)..48(%rdx), adding 16 to %rsi after each one.
   NOTE(review): %xmm4 and %xmm5 are stored unmodified, so the code that
   jumps here presumably prepared them already; confirm at the call
   site.  */
	.p2align 4
L(StrncpyLeave8):
	movaps	%xmm2, %xmm3		/* Keep %xmm2; the first palignr overwrites it.  */
	addq	$48, %r8
	jle	L(StrncpyExit8)		/* Signed check: result <= 0.  */
	palignr	$8, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 8 bytes.  */
	movaps	%xmm2, (%rdx)
	movaps	24(%rcx), %xmm2		/* movaps requires a 16-byte aligned address.  */
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit8)		/* Unsigned check: borrow or zero.  */
	palignr	$8, %xmm3, %xmm2	/* Merge the new block with the saved one.  */
	movaps	%xmm2, 16(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit8)
	movaps	%xmm4, 32(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit8)
	movaps	%xmm5, 48(%rdx)
	leaq	-16(%r8), %r8		/* lea adjusts the count without touching flags.  */
	leaq	16(%rsi), %rsi

/* Advance both pointers by %rsi + 8, copy the 8 bytes just below the new
   pointers, clear %rsi and finish in L(CopyFrom1To16BytesCase3).  */
L(StrncpyExit8):
	leaq	8(%rcx, %rsi), %rcx
	leaq	8(%rdx, %rsi), %rdx
	movq	-8(%rcx), %rax
	movq	%rax, -8(%rdx)
	xorq	%rsi, %rsi
	jmp	L(CopyFrom1To16BytesCase3)
|
|
|
|
/* Tail of the palignr-by-9 strncpy path; falls through into
   L(StrncpyExit9).  Adds 48 to %r8 and, for as long as the checks on %r8
   pass, issues up to four more aligned 16-byte stores at
   (%rdx)..48(%rdx), adding 16 to %rsi after each one.
   NOTE(review): %xmm4 and %xmm5 are stored unmodified, so the code that
   jumps here presumably prepared them already; confirm at the call
   site.  */
	.p2align 4
L(StrncpyLeave9):
	movaps	%xmm2, %xmm3		/* Keep %xmm2; the first palignr overwrites it.  */
	addq	$48, %r8
	jle	L(StrncpyExit9)		/* Signed check: result <= 0.  */
	palignr	$9, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 9 bytes.  */
	movaps	%xmm2, (%rdx)
	movaps	23(%rcx), %xmm2		/* movaps requires a 16-byte aligned address.  */
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit9)		/* Unsigned check: borrow or zero.  */
	palignr	$9, %xmm3, %xmm2	/* Merge the new block with the saved one.  */
	movaps	%xmm2, 16(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit9)
	movaps	%xmm4, 32(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit9)
	movaps	%xmm5, 48(%rdx)
	leaq	-16(%r8), %r8		/* lea adjusts the count without touching flags.  */
	leaq	16(%rsi), %rsi

/* Advance both pointers by %rsi + 7, copy the 8 bytes that end at the new
   pointers, clear %rsi and finish in L(CopyFrom1To16BytesCase3).  */
L(StrncpyExit9):
	leaq	7(%rcx, %rsi), %rcx
	leaq	7(%rdx, %rsi), %rdx
	movq	-8(%rcx), %rax
	movq	%rax, -8(%rdx)
	xorq	%rsi, %rsi
	jmp	L(CopyFrom1To16BytesCase3)
|
|
|
|
/* Tail of the palignr-by-10 strncpy path; falls through into
   L(StrncpyExit10).  Adds 48 to %r8 and, for as long as the checks on %r8
   pass, issues up to four more aligned 16-byte stores at
   (%rdx)..48(%rdx), adding 16 to %rsi after each one.
   NOTE(review): %xmm4 and %xmm5 are stored unmodified, so the code that
   jumps here presumably prepared them already; confirm at the call
   site.  */
	.p2align 4
L(StrncpyLeave10):
	movaps	%xmm2, %xmm3		/* Keep %xmm2; the first palignr overwrites it.  */
	addq	$48, %r8
	jle	L(StrncpyExit10)	/* Signed check: result <= 0.  */
	palignr	$10, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 10 bytes.  */
	movaps	%xmm2, (%rdx)
	movaps	22(%rcx), %xmm2		/* movaps requires a 16-byte aligned address.  */
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit10)	/* Unsigned check: borrow or zero.  */
	palignr	$10, %xmm3, %xmm2	/* Merge the new block with the saved one.  */
	movaps	%xmm2, 16(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit10)
	movaps	%xmm4, 32(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit10)
	movaps	%xmm5, 48(%rdx)
	leaq	-16(%r8), %r8		/* lea adjusts the count without touching flags.  */
	leaq	16(%rsi), %rsi

/* Advance both pointers by %rsi + 6, copy the 8 bytes that end at the new
   pointers, clear %rsi and finish in L(CopyFrom1To16BytesCase3).  */
L(StrncpyExit10):
	leaq	6(%rcx, %rsi), %rcx
	leaq	6(%rdx, %rsi), %rdx
	movq	-8(%rcx), %rax
	movq	%rax, -8(%rdx)
	xorq	%rsi, %rsi
	jmp	L(CopyFrom1To16BytesCase3)
|
|
|
|
/* Tail of the palignr-by-11 strncpy path; falls through into
   L(StrncpyExit11).  Adds 48 to %r8 and, for as long as the checks on %r8
   pass, issues up to four more aligned 16-byte stores at
   (%rdx)..48(%rdx), adding 16 to %rsi after each one.
   NOTE(review): %xmm4 and %xmm5 are stored unmodified, so the code that
   jumps here presumably prepared them already; confirm at the call
   site.  */
	.p2align 4
L(StrncpyLeave11):
	movaps	%xmm2, %xmm3		/* Keep %xmm2; the first palignr overwrites it.  */
	addq	$48, %r8
	jle	L(StrncpyExit11)	/* Signed check: result <= 0.  */
	palignr	$11, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 11 bytes.  */
	movaps	%xmm2, (%rdx)
	movaps	21(%rcx), %xmm2		/* movaps requires a 16-byte aligned address.  */
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit11)	/* Unsigned check: borrow or zero.  */
	palignr	$11, %xmm3, %xmm2	/* Merge the new block with the saved one.  */
	movaps	%xmm2, 16(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit11)
	movaps	%xmm4, 32(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit11)
	movaps	%xmm5, 48(%rdx)
	leaq	-16(%r8), %r8		/* lea adjusts the count without touching flags.  */
	leaq	16(%rsi), %rsi

/* Advance both pointers by %rsi + 5, copy the 8 bytes that end at the new
   pointers, clear %rsi and finish in L(CopyFrom1To16BytesCase3).  */
L(StrncpyExit11):
	leaq	5(%rcx, %rsi), %rcx
	leaq	5(%rdx, %rsi), %rdx
	movq	-8(%rcx), %rax
	movq	%rax, -8(%rdx)
	xorq	%rsi, %rsi
	jmp	L(CopyFrom1To16BytesCase3)
|
|
|
|
/* Tail of the palignr-by-12 strncpy path; falls through into
   L(StrncpyExit12).  Adds 48 to %r8 and, for as long as the checks on %r8
   pass, issues up to four more aligned 16-byte stores at
   (%rdx)..48(%rdx), adding 16 to %rsi after each one.
   NOTE(review): %xmm4 and %xmm5 are stored unmodified, so the code that
   jumps here presumably prepared them already; confirm at the call
   site.  */
	.p2align 4
L(StrncpyLeave12):
	movaps	%xmm2, %xmm3		/* Keep %xmm2; the first palignr overwrites it.  */
	addq	$48, %r8
	jle	L(StrncpyExit12)	/* Signed check: result <= 0.  */
	palignr	$12, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 12 bytes.  */
	movaps	%xmm2, (%rdx)
	movaps	20(%rcx), %xmm2		/* movaps requires a 16-byte aligned address.  */
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit12)	/* Unsigned check: borrow or zero.  */
	palignr	$12, %xmm3, %xmm2	/* Merge the new block with the saved one.  */
	movaps	%xmm2, 16(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit12)
	movaps	%xmm4, 32(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit12)
	movaps	%xmm5, 48(%rdx)
	leaq	-16(%r8), %r8		/* lea adjusts the count without touching flags.  */
	leaq	16(%rsi), %rsi

/* Advance both pointers by %rsi + 4, copy the 4 bytes just below the new
   pointers, clear %rsi and finish in L(CopyFrom1To16BytesCase3).  */
L(StrncpyExit12):
	leaq	4(%rcx, %rsi), %rcx
	leaq	4(%rdx, %rsi), %rdx
	movl	-4(%rcx), %eax
	movl	%eax, -4(%rdx)
	xorq	%rsi, %rsi
	jmp	L(CopyFrom1To16BytesCase3)
|
|
|
|
/* Tail of the palignr-by-13 strncpy path; falls through into
   L(StrncpyExit13).  Adds 48 to %r8 and, for as long as the checks on %r8
   pass, issues up to four more aligned 16-byte stores at
   (%rdx)..48(%rdx), adding 16 to %rsi after each one.
   NOTE(review): %xmm4 and %xmm5 are stored unmodified, so the code that
   jumps here presumably prepared them already; confirm at the call
   site.  */
	.p2align 4
L(StrncpyLeave13):
	movaps	%xmm2, %xmm3		/* Keep %xmm2; the first palignr overwrites it.  */
	addq	$48, %r8
	jle	L(StrncpyExit13)	/* Signed check: result <= 0.  */
	palignr	$13, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 13 bytes.  */
	movaps	%xmm2, (%rdx)
	movaps	19(%rcx), %xmm2		/* movaps requires a 16-byte aligned address.  */
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit13)	/* Unsigned check: borrow or zero.  */
	palignr	$13, %xmm3, %xmm2	/* Merge the new block with the saved one.  */
	movaps	%xmm2, 16(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit13)
	movaps	%xmm4, 32(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit13)
	movaps	%xmm5, 48(%rdx)
	leaq	-16(%r8), %r8		/* lea adjusts the count without touching flags.  */
	leaq	16(%rsi), %rsi

/* Advance both pointers by %rsi + 3, copy the 4 bytes that end at the new
   pointers, clear %rsi and finish in L(CopyFrom1To16BytesCase3).  */
L(StrncpyExit13):
	leaq	3(%rcx, %rsi), %rcx
	leaq	3(%rdx, %rsi), %rdx
	movl	-4(%rcx), %eax
	movl	%eax, -4(%rdx)
	xorq	%rsi, %rsi
	jmp	L(CopyFrom1To16BytesCase3)
|
|
|
|
/* Tail of the palignr-by-14 strncpy path; falls through into
   L(StrncpyExit14).  Adds 48 to %r8 and, for as long as the checks on %r8
   pass, issues up to four more aligned 16-byte stores at
   (%rdx)..48(%rdx), adding 16 to %rsi after each one.
   NOTE(review): %xmm4 and %xmm5 are stored unmodified, so the code that
   jumps here presumably prepared them already; confirm at the call
   site.  */
	.p2align 4
L(StrncpyLeave14):
	movaps	%xmm2, %xmm3		/* Keep %xmm2; the first palignr overwrites it.  */
	addq	$48, %r8
	jle	L(StrncpyExit14)	/* Signed check: result <= 0.  */
	palignr	$14, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 14 bytes.  */
	movaps	%xmm2, (%rdx)
	movaps	18(%rcx), %xmm2		/* movaps requires a 16-byte aligned address.  */
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit14)	/* Unsigned check: borrow or zero.  */
	palignr	$14, %xmm3, %xmm2	/* Merge the new block with the saved one.  */
	movaps	%xmm2, 16(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit14)
	movaps	%xmm4, 32(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit14)
	movaps	%xmm5, 48(%rdx)
	leaq	-16(%r8), %r8		/* lea adjusts the count without touching flags.  */
	leaq	16(%rsi), %rsi

/* Advance both pointers by %rsi + 2, copy the 2 bytes just below the new
   pointers, clear %rsi and finish in L(CopyFrom1To16BytesCase3).  */
L(StrncpyExit14):
	leaq	2(%rcx, %rsi), %rcx
	leaq	2(%rdx, %rsi), %rdx
	movw	-2(%rcx), %ax
	movw	%ax, -2(%rdx)
	xorq	%rsi, %rsi
	jmp	L(CopyFrom1To16BytesCase3)
|
|
|
|
/* Tail of the palignr-by-15 strncpy path; falls through into
   L(StrncpyExit15).  Adds 48 to %r8 and, for as long as the checks on %r8
   pass, issues up to four more aligned 16-byte stores at
   (%rdx)..48(%rdx), adding 16 to %rsi after each one.
   NOTE(review): %xmm4 and %xmm5 are stored unmodified, so the code that
   jumps here presumably prepared them already; confirm at the call
   site.  */
	.p2align 4
L(StrncpyLeave15):
	movaps	%xmm2, %xmm3		/* Keep %xmm2; the first palignr overwrites it.  */
	addq	$48, %r8
	jle	L(StrncpyExit15)	/* Signed check: result <= 0.  */
	palignr	$15, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 15 bytes.  */
	movaps	%xmm2, (%rdx)
	movaps	17(%rcx), %xmm2		/* movaps requires a 16-byte aligned address.  */
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit15)	/* Unsigned check: borrow or zero.  */
	palignr	$15, %xmm3, %xmm2	/* Merge the new block with the saved one.  */
	movaps	%xmm2, 16(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit15)
	movaps	%xmm4, 32(%rdx)
	leaq	16(%rsi), %rsi
	subq	$16, %r8
	jbe	L(StrncpyExit15)
	movaps	%xmm5, 48(%rdx)
	leaq	-16(%r8), %r8		/* lea adjusts the count without touching flags.  */
	leaq	16(%rsi), %rsi

/* Advance both pointers by %rsi + 1, copy the single byte just below the
   new pointers, clear %rsi and finish in L(CopyFrom1To16BytesCase3).  */
L(StrncpyExit15):
	leaq	1(%rcx, %rsi), %rcx
	leaq	1(%rdx, %rsi), %rdx
	movb	-1(%rcx), %ah		/* Byte travels via %ah: only bits 8-15 of %rax change.  */
	movb	%ah, -1(%rdx)
	xorq	%rsi, %rsi
	jmp	L(CopyFrom1To16BytesCase3)
|
|
|
|
# endif
|
|
# ifndef USE_AS_STRCAT
|
|
END (STRCPY)
|
|
# endif
|
|
#endif
|