Remove alignments on jump targets in memset

The x86-64 memset-vec-unaligned-erms.S aligns many jump targets, which
increases code size but does not necessarily improve performance.
Memset benchtest data comparing aligned vs. unaligned jump targets on
various Intel and AMD processors

https://sourceware.org/bugzilla/attachment.cgi?id=9277

shows that aligning jump targets isn't necessary.

	[BZ #20115]
	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S (__memset):
	Remove alignments on jump targets.
H.J. Lu 2016-05-19 08:49:45 -07:00
parent bb37c73778
commit eb2c88c7c8
2 changed files with 11 additions and 32 deletions

ChangeLog

@@ -1,3 +1,9 @@
+2016-05-19  H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #20115]
+	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S (__memset):
+	Remove alignments on jump targets.
+
 2016-05-19  Joseph Myers  <joseph@codesourcery.com>
 
 	* conform/data/pwd.h-data (endpwent): Do not expect for [XPG3].

sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S

@@ -18,12 +18,10 @@
 
 /* memset is implemented as:
    1. Use overlapping store to avoid branch.
-   2. Force 32-bit displacement for branches to avoid long nop between
-      instructions.
-   3. If size is less than VEC, use integer register stores.
-   4. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
-   5. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
-   6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
+   2. If size is less than VEC, use integer register stores.
+   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
+   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
+   5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
       4 VEC stores and store 4 * VEC at a time until done.  */
 
 #include <sysdep.h>
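
The rewritten comment now reads as a pure size dispatch.  Below is a minimal
C sketch of those five steps, assuming a 32-byte VEC_SIZE (the AVX2 case);
memset_sketch is a made-up name, and the memcpy/memset calls merely stand in
for the integer-register and VEC stores of the real assembly.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define VEC_SIZE 32	/* illustrative assumption: AVX2 vector width */

/* Rough shape of the size dispatch described in the comment above.  */
static void *
memset_sketch (void *dst, int c, size_t n)
{
  unsigned char *d = dst;

  if (n < VEC_SIZE)
    {
      /* Step 2: small sizes via integer register stores, using a pair of
	 overlapping stores (step 1) once at least 8 bytes are requested.  */
      if (n >= 8)
	{
	  uint64_t v = (unsigned char) c * 0x0101010101010101ULL;
	  memcpy (d, &v, 8);
	  memcpy (d + n - 8, &v, 8);	/* overlaps the first store */
	}
      else
	while (n--)
	  *d++ = (unsigned char) c;
      return dst;
    }
  if (n <= 2 * VEC_SIZE)
    {
      /* Step 3: one VEC store from each end; they may overlap.  */
      memset (d, c, VEC_SIZE);
      memset (d + n - VEC_SIZE, c, VEC_SIZE);
      return dst;
    }
  if (n <= 4 * VEC_SIZE)
    {
      /* Step 4: two VEC stores from each end.  */
      memset (d, c, 2 * VEC_SIZE);
      memset (d + n - 2 * VEC_SIZE, c, 2 * VEC_SIZE);
      return dst;
    }
  /* Step 5: align to 4 * VEC_SIZE and loop; see the sketch after the
     L(loop_start) hunks below.  */
  memset (d, c, n);
  return dst;
}
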
@@ -143,14 +141,10 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 	VZEROUPPER
 	ret
 
-	.p2align 4
 L(stosb_more_2x_vec):
 	cmpq	$REP_STOSB_THRESHOLD, %rdx
-	/* Force 32-bit displacement to avoid long nop between
-	   instructions.  */
-	ja.d32	L(stosb)
+	ja	L(stosb)
 #endif
-	.p2align 4
 L(more_2x_vec):
 	cmpq  $(VEC_SIZE * 4), %rdx
 	ja	L(loop_start)
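
The ERMS path itself is unchanged apart from the branch encoding: sizes above
REP_STOSB_THRESHOLD still go to L(stosb), i.e. a plain rep stosb.  A hedged C
sketch of that dispatch follows; the threshold value and function name are
assumptions for illustration, and the inline asm is simply the ERMS string
store.

#include <stddef.h>

#define REP_STOSB_THRESHOLD 2048	/* illustrative value, not the real tunable */

/* Above the threshold, hand the whole fill to the microcoded "rep stosb"
   (ERMS); below it, the VEC store paths are used.  */
static void *
memset_erms_sketch (void *dst, int c, size_t n)
{
  if (n > REP_STOSB_THRESHOLD)
    {
      void *d = dst;
      size_t cnt = n;
      /* rep stosb writes AL to [RDI], RCX times.  */
      __asm__ __volatile__ ("rep stosb"
			    : "+D" (d), "+c" (cnt)
			    : "a" (c)
			    : "memory");
      return dst;
    }
  /* ... otherwise fall through to the L(more_2x_vec)-style VEC stores
     (elided in this sketch) ...  */
  return dst;
}
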
@@ -162,26 +156,12 @@ L(return):
 	VZEROUPPER
 	ret
 
-	.p2align 4
 L(loop_start):
 	leaq	(VEC_SIZE * 4)(%rdi), %rcx
-# if VEC_SIZE == 32 || VEC_SIZE == 64
-	/* Force 32-bit displacement to avoid long nop between
-	   instructions.  */
-	VMOVU.d32 %VEC(0), (%rdi)
-# else
 	VMOVU	%VEC(0), (%rdi)
-# endif
 	andq	$-(VEC_SIZE * 4), %rcx
-# if VEC_SIZE == 32
-	/* Force 32-bit displacement to avoid long nop between
-	   instructions.  */
-	VMOVU.d32 %VEC(0), -VEC_SIZE(%rdi,%rdx)
-	VMOVU.d32 %VEC(0), VEC_SIZE(%rdi)
-# else
 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 	VMOVU	%VEC(0), VEC_SIZE(%rdi)
-# endif
 	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
@@ -190,14 +170,7 @@ L(loop_start):
 	addq	%rdi, %rdx
 	andq	$-(VEC_SIZE * 4), %rdx
 	cmpq	%rdx, %rcx
-# if VEC_SIZE == 32 || VEC_SIZE == 64
-	/* Force 32-bit displacement to avoid long nop between
-	   instructions.  */
-	je.d32	L(return)
-# else
 	je	L(return)
-# endif
-	.p2align 4
 L(loop):
 	VMOVA	%VEC(0), (%rcx)
 	VMOVA	%VEC(0), VEC_SIZE(%rcx)
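
Taken together, L(loop_start) and L(loop) implement the "align to
4 * VEC_SIZE and store 4 * VEC at a time" step.  Here is a rough C sketch
under the same assumptions as above (illustrative names, memset standing in
for the VMOVU/VMOVA stores).

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define VEC_SIZE 32	/* illustrative assumption: AVX2 vector width */

/* Sketch of L(loop_start)/L(loop) for n > 4 * VEC_SIZE: unaligned stores
   cover the first and last 4 * VEC_SIZE bytes, then an aligned pointer
   walks the middle, 4 vectors per iteration.  */
static void
memset_loop_sketch (unsigned char *dst, int c, size_t n)
{
  /* Head and tail, 4 vectors each (the VMOVU stores); they overlap
     whatever the aligned loop below will rewrite.  */
  memset (dst, c, 4 * VEC_SIZE);
  memset (dst + n - 4 * VEC_SIZE, c, 4 * VEC_SIZE);

  /* leaq/andq: cur = (dst + 4 * VEC_SIZE) rounded down to a 4 * VEC_SIZE
     boundary; addq/andq: end = (dst + n) rounded down likewise.  */
  uintptr_t mask = ~(uintptr_t) (4 * VEC_SIZE - 1);
  unsigned char *cur = (unsigned char *) (((uintptr_t) dst + 4 * VEC_SIZE) & mask);
  unsigned char *end = (unsigned char *) (((uintptr_t) dst + n) & mask);

  /* cmpq/je: skip the loop when the head and tail stores already
     covered everything.  */
  while (cur != end)
    {
      memset (cur, c, 4 * VEC_SIZE);	/* stands in for 4 aligned VMOVAs */
      cur += 4 * VEC_SIZE;
    }
}
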