Remove alignments on jump targets in memset

The x86-64 memset-vec-unaligned-erms.S aligns many jump targets, which
increases code size but does not necessarily improve performance.
Memset benchtest data comparing aligned vs. unaligned jump targets on
various Intel and AMD processors

https://sourceware.org/bugzilla/attachment.cgi?id=9277

shows that aligning jump targets isn't necessary.

	[BZ #20115]
	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S (__memset):
	Remove alignments on jump targets.
H.J. Lu 2016-05-19 08:49:45 -07:00
parent bb37c73778
commit eb2c88c7c8
2 changed files with 11 additions and 32 deletions

ChangeLog

@@ -1,3 +1,9 @@
+2016-05-19  H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #20115]
+	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S (__memset):
+	Remove alignments on jump targets.
+
 2016-05-19  Joseph Myers  <joseph@codesourcery.com>
 
 	* conform/data/pwd.h-data (endpwent): Do not expect for [XPG3].

sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S

@@ -18,12 +18,10 @@
 
 /* memset is implemented as:
    1. Use overlapping store to avoid branch.
-   2. Force 32-bit displacement for branches to avoid long nop between
-      instructions.
-   3. If size is less than VEC, use integer register stores.
-   4. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
-   5. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
-   6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
+   2. If size is less than VEC, use integer register stores.
+   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
+   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
+   5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
       4 VEC stores and store 4 * VEC at a time until done.  */
 
 #include <sysdep.h>
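
The rewritten comment now reads as a pure size dispatch.  Below is a minimal
C sketch of those five steps, assuming a 32-byte VEC_SIZE (the AVX2 case);
memset_sketch is a made-up name, and the memcpy/memset calls merely stand in
for the integer-register and VEC stores of the real assembly.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define VEC_SIZE 32	/* illustrative assumption: AVX2 vector width */

/* Rough shape of the size dispatch described in the comment above.  */
static void *
memset_sketch (void *dst, int c, size_t n)
{
  unsigned char *d = dst;

  if (n < VEC_SIZE)
    {
      /* Step 2: small sizes via integer register stores, using a pair of
	 overlapping stores (step 1) once at least 8 bytes are requested.  */
      if (n >= 8)
	{
	  uint64_t v = (unsigned char) c * 0x0101010101010101ULL;
	  memcpy (d, &v, 8);
	  memcpy (d + n - 8, &v, 8);	/* overlaps the first store */
	}
      else
	while (n--)
	  *d++ = (unsigned char) c;
      return dst;
    }
  if (n <= 2 * VEC_SIZE)
    {
      /* Step 3: one VEC store from each end; they may overlap.  */
      memset (d, c, VEC_SIZE);
      memset (d + n - VEC_SIZE, c, VEC_SIZE);
      return dst;
    }
  if (n <= 4 * VEC_SIZE)
    {
      /* Step 4: two VEC stores from each end.  */
      memset (d, c, 2 * VEC_SIZE);
      memset (d + n - 2 * VEC_SIZE, c, 2 * VEC_SIZE);
      return dst;
    }
  /* Step 5: align to 4 * VEC_SIZE and loop; see the sketch after the
     L(loop_start) hunks below.  */
  memset (d, c, n);
  return dst;
}
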
@@ -143,14 +141,10 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 	VZEROUPPER
 	ret
 
-	.p2align 4
 L(stosb_more_2x_vec):
 	cmpq	$REP_STOSB_THRESHOLD, %rdx
-	/* Force 32-bit displacement to avoid long nop between
-	   instructions.  */
-	ja.d32	L(stosb)
+	ja	L(stosb)
 #endif
-	.p2align 4
 L(more_2x_vec):
 	cmpq  $(VEC_SIZE * 4), %rdx
 	ja	L(loop_start)
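
The ERMS path itself is unchanged apart from the branch encoding: sizes above
REP_STOSB_THRESHOLD still go to L(stosb), i.e. a plain rep stosb.  A hedged C
sketch of that dispatch follows; the threshold value and function name are
assumptions for illustration, and the inline asm is simply the ERMS string
store.

#include <stddef.h>

#define REP_STOSB_THRESHOLD 2048	/* illustrative value, not the real tunable */

/* Above the threshold, hand the whole fill to the microcoded "rep stosb"
   (ERMS); below it, the VEC store paths are used.  */
static void *
memset_erms_sketch (void *dst, int c, size_t n)
{
  if (n > REP_STOSB_THRESHOLD)
    {
      void *d = dst;
      size_t cnt = n;
      /* rep stosb writes AL to [RDI], RCX times.  */
      __asm__ __volatile__ ("rep stosb"
			    : "+D" (d), "+c" (cnt)
			    : "a" (c)
			    : "memory");
      return dst;
    }
  /* ... otherwise fall through to the L(more_2x_vec)-style VEC stores
     (elided in this sketch) ...  */
  return dst;
}
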
@@ -162,26 +156,12 @@ L(return):
 	VZEROUPPER
 	ret
 
-	.p2align 4
 L(loop_start):
 	leaq	(VEC_SIZE * 4)(%rdi), %rcx
-# if VEC_SIZE == 32 || VEC_SIZE == 64
-	/* Force 32-bit displacement to avoid long nop between
-	   instructions.  */
-	VMOVU.d32 %VEC(0), (%rdi)
-# else
 	VMOVU	%VEC(0), (%rdi)
-# endif
 	andq	$-(VEC_SIZE * 4), %rcx
-# if VEC_SIZE == 32
-	/* Force 32-bit displacement to avoid long nop between
-	   instructions.  */
-	VMOVU.d32 %VEC(0), -VEC_SIZE(%rdi,%rdx)
-	VMOVU.d32 %VEC(0), VEC_SIZE(%rdi)
-# else
 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 	VMOVU	%VEC(0), VEC_SIZE(%rdi)
-# endif
 	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
@@ -190,14 +170,7 @@ L(loop_start):
 	addq	%rdi, %rdx
 	andq	$-(VEC_SIZE * 4), %rdx
 	cmpq	%rdx, %rcx
-# if VEC_SIZE == 32 || VEC_SIZE == 64
-	/* Force 32-bit displacement to avoid long nop between
-	   instructions.  */
-	je.d32	L(return)
-# else
 	je	L(return)
-# endif
-	.p2align 4
 L(loop):
 	VMOVA	%VEC(0), (%rcx)
 	VMOVA	%VEC(0), VEC_SIZE(%rcx)
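
Taken together, L(loop_start) and L(loop) implement the "align to
4 * VEC_SIZE and store 4 * VEC at a time" step.  Here is a rough C sketch
under the same assumptions as above (illustrative names, memset standing in
for the VMOVU/VMOVA stores).

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define VEC_SIZE 32	/* illustrative assumption: AVX2 vector width */

/* Sketch of L(loop_start)/L(loop) for n > 4 * VEC_SIZE: unaligned stores
   cover the first and last 4 * VEC_SIZE bytes, then an aligned pointer
   walks the middle, 4 vectors per iteration.  */
static void
memset_loop_sketch (unsigned char *dst, int c, size_t n)
{
  /* Head and tail, 4 vectors each (the VMOVU stores); they overlap
     whatever the aligned loop below will rewrite.  */
  memset (dst, c, 4 * VEC_SIZE);
  memset (dst + n - 4 * VEC_SIZE, c, 4 * VEC_SIZE);

  /* leaq/andq: cur = (dst + 4 * VEC_SIZE) rounded down to a 4 * VEC_SIZE
     boundary; addq/andq: end = (dst + n) rounded down likewise.  */
  uintptr_t mask = ~(uintptr_t) (4 * VEC_SIZE - 1);
  unsigned char *cur = (unsigned char *) (((uintptr_t) dst + 4 * VEC_SIZE) & mask);
  unsigned char *end = (unsigned char *) (((uintptr_t) dst + n) & mask);

  /* cmpq/je: skip the loop when the head and tail stores already
     covered everything.  */
  while (cur != end)
    {
      memset (cur, c, 4 * VEC_SIZE);	/* stands in for 4 aligned VMOVAs */
      cur += 4 * VEC_SIZE;
    }
}
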