Save and restore vector registers in x86-64 ld.so

This patch adds SSE, AVX and AVX512 versions of _dl_runtime_resolve
and _dl_runtime_profile, which save and restore the first 8 vector
registers used for parameter passing.  elf_machine_runtime_setup
selects the proper _dl_runtime_resolve or _dl_runtime_profile based
on _dl_x86_cpu_features.  It avoids the race condition caused by the
FOREIGN_CALL macros, which are used only on x86-64.

The performance impact of saving and restoring 8 vector registers is
negligible on Nehalem, Sandy Bridge, Ivy Bridge and Haswell when
ld.so is optimized with SSE2.
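As a rough stand-alone illustration of the selection described above (not glibc code: it uses GCC's __builtin_cpu_supports where the patch uses _dl_x86_cpu_features via HAS_ARCH_FEATURE, which additionally verifies OS support for the wider registers), the dispatch amounts to picking the widest trampoline variant once and installing it:

/* Stand-alone sketch, not part of the patch: a model of the one-time,
   CPU-feature-based choice that elf_machine_runtime_setup makes when
   it fills in the GOT slot used for lazy binding.  */
#include <stdio.h>

static void resolve_avx512 (void) { puts ("_dl_runtime_resolve_avx512: saves zmm0-zmm7"); }
static void resolve_avx (void)    { puts ("_dl_runtime_resolve_avx: saves ymm0-ymm7"); }
static void resolve_sse (void)    { puts ("_dl_runtime_resolve_sse: saves xmm0-xmm7"); }

int
main (void)
{
  void (*resolver) (void);

  /* The real code tests AVX512F_Usable / AVX_Usable from
     _dl_x86_cpu_features; __builtin_cpu_supports (GCC >= 5 for
     "avx512f") stands in for it in this sketch.  */
  if (__builtin_cpu_supports ("avx512f"))
    resolver = resolve_avx512;
  else if (__builtin_cpu_supports ("avx"))
    resolver = resolve_avx;
  else
    resolver = resolve_sse;

  resolver ();	/* In ld.so, this address is written into GOT[2].  */
  return 0;
}

The profiling path in the dl-machine.h hunk below follows the same pattern with the _dl_runtime_profile_* variants.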

	[BZ #15128]
	* sysdeps/x86_64/Makefile [$(subdir) == elf] (tests): Add
	ifuncmain8.
	(modules-names): Add ifuncmod8.
	($(objpfx)ifuncmain8): New rule.
	* sysdeps/x86_64/dl-machine.h: Include <dl-procinfo.h> and
	<cpuid.h>.
	(elf_machine_runtime_setup): Use _dl_runtime_resolve_sse,
	_dl_runtime_resolve_avx, or _dl_runtime_resolve_avx512,
	_dl_runtime_profile_sse, _dl_runtime_profile_avx, or
	_dl_runtime_profile_avx512, based on HAS_ARCH_FEATURE.
	* sysdeps/x86_64/dl-trampoline.S: Rewrite.
	* sysdeps/x86_64/dl-trampoline.h: Likewise.
	* sysdeps/x86_64/ifuncmain8.c: New file.
	* sysdeps/x86_64/ifuncmod8.c: Likewise.
	* sysdeps/x86_64/nptl/tcb-offsets.sym (RTLD_SAVESPACE_SSE):
	Removed.
	* sysdeps/x86_64/nptl/tls.h (__128bits): Removed.
	(tcbhead_t): Change rtld_must_xmm_save to __glibc_unused1.
	Change rtld_savespace_sse to __glibc_unused2.
	(RTLD_CHECK_FOREIGN_CALL): Removed.
	(RTLD_ENABLE_FOREIGN_CALL): Likewise.
	(RTLD_PREPARE_FOREIGN_CALL): Likewise.
	(RTLD_FINALIZE_FOREIGN_CALL): Likewise.
H.J. Lu 2015-08-25 04:33:54 -07:00
parent 2d02fd0737
commit f3dcae82d5
9 changed files with 498 additions and 500 deletions

ChangeLog

@@ -1,3 +1,30 @@
2015-08-25 H.J. Lu <hongjiu.lu@intel.com>
[BZ #15128]
* sysdeps/x86_64/Makefile [$(subdir) == elf] (tests): Add
ifuncmain8.
(modules-names): Add ifuncmod8.
($(objpfx)ifuncmain8): New rule.
* sysdeps/x86_64/dl-machine.h: Include <dl-procinfo.h> and
<cpuid.h>.
(elf_machine_runtime_setup): Use _dl_runtime_resolve_sse,
_dl_runtime_resolve_avx, or _dl_runtime_resolve_avx512,
_dl_runtime_profile_sse, _dl_runtime_profile_avx, or
_dl_runtime_profile_avx512, based on HAS_ARCH_FEATURE.
* sysdeps/x86_64/dl-trampoline.S: Rewrite.
* sysdeps/x86_64/dl-trampoline.h: Likewise.
* sysdeps/x86_64/ifuncmain8.c: New file.
* sysdeps/x86_64/ifuncmod8.c: Likewise.
* sysdeps/x86_64/nptl/tcb-offsets.sym (RTLD_SAVESPACE_SSE):
Removed.
* sysdeps/x86_64/nptl/tls.h (__128bits): Removed.
(tcbhead_t): Change rtld_must_xmm_save to __glibc_unused1.
Change rtld_savespace_sse to __glibc_unused2.
(RTLD_CHECK_FOREIGN_CALL): Removed.
(RTLD_ENABLE_FOREIGN_CALL): Likewise.
(RTLD_PREPARE_FOREIGN_CALL): Likewise.
(RTLD_FINALIZE_FOREIGN_CALL): Likewise.
2015-08-24 Wilco Dijkstra <wdijkstr@arm.com>
* sysdeps/aarch64/bzero.S (__bzero): Remove.

sysdeps/x86_64/Makefile

@@ -21,6 +21,11 @@ endif
ifeq ($(subdir),elf)
sysdep-dl-routines += tlsdesc dl-tlsdesc
tests += ifuncmain8
modules-names += ifuncmod8
$(objpfx)ifuncmain8: $(objpfx)ifuncmod8.so
tests += tst-quad1 tst-quad2
modules-names += tst-quadmod1 tst-quadmod2

sysdeps/x86_64/dl-machine.h

@@ -66,8 +66,12 @@ static inline int __attribute__ ((unused, always_inline))
elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
{
Elf64_Addr *got;
extern void _dl_runtime_resolve (ElfW(Word)) attribute_hidden;
extern void _dl_runtime_profile (ElfW(Word)) attribute_hidden;
extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden;
extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden;
extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden;
extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
if (l->l_info[DT_JMPREL] && lazy)
{
@@ -95,7 +99,12 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
end in this function. */
if (__glibc_unlikely (profile))
{
*(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile;
if (HAS_ARCH_FEATURE (AVX512F_Usable))
*(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
else if (HAS_ARCH_FEATURE (AVX_Usable))
*(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
else
*(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
if (GLRO(dl_profile) != NULL
&& _dl_name_match_p (GLRO(dl_profile), l))
@@ -104,9 +113,17 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
GL(dl_profile_map) = l;
}
else
/* This function will get called to fix up the GOT entry indicated by
the offset on the stack, and then jump to the resolved address. */
*(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve;
{
/* This function will get called to fix up the GOT entry
indicated by the offset on the stack, and then jump to
the resolved address. */
if (HAS_ARCH_FEATURE (AVX512F_Usable))
*(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx512;
else if (HAS_ARCH_FEATURE (AVX_Usable))
*(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx;
else
*(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse;
}
}
if (l->l_info[ADDRIDX (DT_TLSDESC_GOT)] && lazy)

sysdeps/x86_64/dl-trampoline.S

@@ -20,23 +20,40 @@
#include <sysdep.h>
#include <link-defines.h>
#if (RTLD_SAVESPACE_SSE % 32) != 0
# error RTLD_SAVESPACE_SSE must be aligned to 32 bytes
#ifndef DL_STACK_ALIGNMENT
/* Due to GCC bug:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
__tls_get_addr may be called with 8-byte stack alignment.  Although
this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
that the stack will always be aligned to 16 bytes.  We use unaligned
16-byte moves to load and store SSE registers, which have no penalty
on modern processors if the stack is 16-byte aligned. */
# define DL_STACK_ALIGNMENT 8
#endif
#ifndef DL_RUNIME_UNALIGNED_VEC_SIZE
/* The maximum size of unaligned vector load and store. */
# define DL_RUNIME_UNALIGNED_VEC_SIZE 16
#endif
/* True if _dl_runtime_resolve should align stack to VEC_SIZE bytes. */
#define DL_RUNIME_RESOLVE_REALIGN_STACK \
(VEC_SIZE > DL_STACK_ALIGNMENT \
&& VEC_SIZE > DL_RUNIME_UNALIGNED_VEC_SIZE)
/* Align vector register save area to 16 bytes. */
#define REGISTER_SAVE_VEC_OFF 0
/* Area on stack to save and restore registers used for parameter
passing when calling _dl_fixup. */
#ifdef __ILP32__
/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX. */
# define REGISTER_SAVE_AREA (8 * 7)
# define REGISTER_SAVE_RAX 0
# define REGISTER_SAVE_RAX (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
# define PRESERVE_BND_REGS_PREFIX
#else
/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as BND0,
BND1, BND2, BND3. */
# define REGISTER_SAVE_AREA (8 * 7 + 16 * 4)
/* Align bound register save area to 16 bytes. */
# define REGISTER_SAVE_BND0 0
# define REGISTER_SAVE_BND0 (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
# define REGISTER_SAVE_BND1 (REGISTER_SAVE_BND0 + 16)
# define REGISTER_SAVE_BND2 (REGISTER_SAVE_BND1 + 16)
# define REGISTER_SAVE_BND3 (REGISTER_SAVE_BND2 + 16)
@@ -54,386 +71,53 @@
#define REGISTER_SAVE_R8 (REGISTER_SAVE_RDI + 8)
#define REGISTER_SAVE_R9 (REGISTER_SAVE_R8 + 8)
.text
.globl _dl_runtime_resolve
.type _dl_runtime_resolve, @function
.align 16
cfi_startproc
_dl_runtime_resolve:
cfi_adjust_cfa_offset(16) # Incorporate PLT
subq $REGISTER_SAVE_AREA,%rsp
cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
# Preserve registers otherwise clobbered.
movq %rax, REGISTER_SAVE_RAX(%rsp)
movq %rcx, REGISTER_SAVE_RCX(%rsp)
movq %rdx, REGISTER_SAVE_RDX(%rsp)
movq %rsi, REGISTER_SAVE_RSI(%rsp)
movq %rdi, REGISTER_SAVE_RDI(%rsp)
movq %r8, REGISTER_SAVE_R8(%rsp)
movq %r9, REGISTER_SAVE_R9(%rsp)
#ifndef __ILP32__
# We also have to preserve bound registers. These are nops if
# Intel MPX isn't available or disabled.
# ifdef HAVE_MPX_SUPPORT
bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
# else
# if REGISTER_SAVE_BND0 == 0
.byte 0x66,0x0f,0x1b,0x04,0x24
# else
.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
# endif
.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
# endif
#define VEC_SIZE 64
#define VMOVA vmovdqa64
#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
# define VMOV vmovdqa64
#else
# define VMOV vmovdqu64
#endif
# Copy args pushed by PLT in register.
# %rdi: link_map, %rsi: reloc_index
movq (REGISTER_SAVE_AREA + 8)(%rsp), %rsi
movq REGISTER_SAVE_AREA(%rsp), %rdi
call _dl_fixup # Call resolver.
movq %rax, %r11 # Save return value
#ifndef __ILP32__
# Restore bound registers. These are nops if Intel MPX isn't
# avaiable or disabled.
# ifdef HAVE_MPX_SUPPORT
bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
# else
.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
# if REGISTER_SAVE_BND0 == 0
.byte 0x66,0x0f,0x1a,0x04,0x24
# else
.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
# endif
# endif
#define VEC(i) zmm##i
#define _dl_runtime_resolve _dl_runtime_resolve_avx512
#define _dl_runtime_profile _dl_runtime_profile_avx512
#define RESTORE_AVX
#include "dl-trampoline.h"
#undef _dl_runtime_resolve
#undef _dl_runtime_profile
#undef VEC
#undef VMOV
#undef VMOVA
#undef VEC_SIZE
#define VEC_SIZE 32
#define VMOVA vmovdqa
#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
# define VMOV vmovdqa
#else
# define VMOV vmovdqu
#endif
# Get register content back.
movq REGISTER_SAVE_R9(%rsp), %r9
movq REGISTER_SAVE_R8(%rsp), %r8
movq REGISTER_SAVE_RDI(%rsp), %rdi
movq REGISTER_SAVE_RSI(%rsp), %rsi
movq REGISTER_SAVE_RDX(%rsp), %rdx
movq REGISTER_SAVE_RCX(%rsp), %rcx
movq REGISTER_SAVE_RAX(%rsp), %rax
# Adjust stack(PLT did 2 pushes)
addq $(REGISTER_SAVE_AREA + 16), %rsp
cfi_adjust_cfa_offset(-(REGISTER_SAVE_AREA + 16))
# Preserve bound registers.
PRESERVE_BND_REGS_PREFIX
jmp *%r11 # Jump to function address.
cfi_endproc
.size _dl_runtime_resolve, .-_dl_runtime_resolve
#define VEC(i) ymm##i
#define _dl_runtime_resolve _dl_runtime_resolve_avx
#define _dl_runtime_profile _dl_runtime_profile_avx
#include "dl-trampoline.h"
#undef _dl_runtime_resolve
#undef _dl_runtime_profile
#undef VEC
#undef VMOV
#undef VMOVA
#undef VEC_SIZE
#ifndef PROF
.globl _dl_runtime_profile
.type _dl_runtime_profile, @function
.align 16
cfi_startproc
_dl_runtime_profile:
cfi_adjust_cfa_offset(16) # Incorporate PLT
/* The La_x86_64_regs data structure pointed to by the
fourth paramater must be 16-byte aligned. This must
be explicitly enforced. We have the set up a dynamically
sized stack frame. %rbx points to the top half which
has a fixed size and preserves the original stack pointer. */
subq $32, %rsp # Allocate the local storage.
cfi_adjust_cfa_offset(32)
movq %rbx, (%rsp)
cfi_rel_offset(%rbx, 0)
/* On the stack:
56(%rbx) parameter #1
48(%rbx) return address
40(%rbx) reloc index
32(%rbx) link_map
24(%rbx) La_x86_64_regs pointer
16(%rbx) framesize
8(%rbx) rax
(%rbx) rbx
*/
movq %rax, 8(%rsp)
movq %rsp, %rbx
cfi_def_cfa_register(%rbx)
/* Actively align the La_x86_64_regs structure. */
andq $0xfffffffffffffff0, %rsp
# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
/* sizeof(La_x86_64_regs). Need extra space for 8 SSE registers
to detect if any xmm0-xmm7 registers are changed by audit
module. */
subq $(LR_SIZE + XMM_SIZE*8), %rsp
# else
subq $LR_SIZE, %rsp # sizeof(La_x86_64_regs)
# endif
movq %rsp, 24(%rbx)
/* Fill the La_x86_64_regs structure. */
movq %rdx, LR_RDX_OFFSET(%rsp)
movq %r8, LR_R8_OFFSET(%rsp)
movq %r9, LR_R9_OFFSET(%rsp)
movq %rcx, LR_RCX_OFFSET(%rsp)
movq %rsi, LR_RSI_OFFSET(%rsp)
movq %rdi, LR_RDI_OFFSET(%rsp)
movq %rbp, LR_RBP_OFFSET(%rsp)
leaq 48(%rbx), %rax
movq %rax, LR_RSP_OFFSET(%rsp)
/* We always store the XMM registers even if AVX is available.
This is to provide backward binary compatibility for existing
audit modules. */
movaps %xmm0, (LR_XMM_OFFSET)(%rsp)
movaps %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp)
movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
# ifndef __ILP32__
# ifdef HAVE_MPX_SUPPORT
bndmov %bnd0, (LR_BND_OFFSET)(%rsp) # Preserve bound
bndmov %bnd1, (LR_BND_OFFSET + BND_SIZE)(%rsp) # registers. Nops if
bndmov %bnd2, (LR_BND_OFFSET + BND_SIZE*2)(%rsp) # MPX not available
bndmov %bnd3, (LR_BND_OFFSET + BND_SIZE*3)(%rsp) # or disabled.
# else
.byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET)
.byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
.byte 0x66,0x0f,0x1b,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
.byte 0x66,0x0f,0x1b,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
# endif
# endif
# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
.data
L(have_avx):
.zero 4
.size L(have_avx), 4
.previous
cmpl $0, L(have_avx)(%rip)
jne L(defined)
movq %rbx, %r11 # Save rbx
movl $1, %eax
cpuid
movq %r11,%rbx # Restore rbx
xorl %eax, %eax
// AVX and XSAVE supported?
andl $((1 << 28) | (1 << 27)), %ecx
cmpl $((1 << 28) | (1 << 27)), %ecx
jne 10f
# ifdef HAVE_AVX512_ASM_SUPPORT
// AVX512 supported in processor?
movq %rbx, %r11 # Save rbx
xorl %ecx, %ecx
mov $0x7, %eax
cpuid
andl $(1 << 16), %ebx
# endif
xorl %ecx, %ecx
// Get XFEATURE_ENABLED_MASK
xgetbv
# ifdef HAVE_AVX512_ASM_SUPPORT
test %ebx, %ebx
movq %r11, %rbx # Restore rbx
je 20f
// Verify that XCR0[7:5] = '111b' and
// XCR0[2:1] = '11b' which means
// that zmm state is enabled
andl $0xe6, %eax
cmpl $0xe6, %eax
jne 20f
movl %eax, L(have_avx)(%rip)
L(avx512):
# define RESTORE_AVX
# define VMOV vmovdqu64
# define VEC(i) zmm##i
# define MORE_CODE
# include "dl-trampoline.h"
# undef VMOV
# undef VEC
# undef RESTORE_AVX
# endif
20: andl $0x6, %eax
10: subl $0x5, %eax
movl %eax, L(have_avx)(%rip)
cmpl $0, %eax
L(defined):
js L(no_avx)
# ifdef HAVE_AVX512_ASM_SUPPORT
cmpl $0xe6, L(have_avx)(%rip)
je L(avx512)
# endif
# define RESTORE_AVX
# define VMOV vmovdqu
# define VEC(i) ymm##i
# define MORE_CODE
# include "dl-trampoline.h"
.align 16
L(no_avx):
# endif
# undef RESTORE_AVX
# include "dl-trampoline.h"
cfi_endproc
.size _dl_runtime_profile, .-_dl_runtime_profile
#endif
#ifdef SHARED
.globl _dl_x86_64_save_sse
.type _dl_x86_64_save_sse, @function
.align 16
cfi_startproc
_dl_x86_64_save_sse:
# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
cmpl $0, L(have_avx)(%rip)
jne L(defined_5)
movq %rbx, %r11 # Save rbx
movl $1, %eax
cpuid
movq %r11,%rbx # Restore rbx
xorl %eax, %eax
// AVX and XSAVE supported?
andl $((1 << 28) | (1 << 27)), %ecx
cmpl $((1 << 28) | (1 << 27)), %ecx
jne 1f
# ifdef HAVE_AVX512_ASM_SUPPORT
// AVX512 supported in a processor?
movq %rbx, %r11 # Save rbx
xorl %ecx,%ecx
mov $0x7,%eax
cpuid
andl $(1 << 16), %ebx
# endif
xorl %ecx, %ecx
// Get XFEATURE_ENABLED_MASK
xgetbv
# ifdef HAVE_AVX512_ASM_SUPPORT
test %ebx, %ebx
movq %r11, %rbx # Restore rbx
je 2f
// Verify that XCR0[7:5] = '111b' and
// XCR0[2:1] = '11b' which means
// that zmm state is enabled
andl $0xe6, %eax
movl %eax, L(have_avx)(%rip)
cmpl $0xe6, %eax
je L(avx512_5)
# endif
2: andl $0x6, %eax
1: subl $0x5, %eax
movl %eax, L(have_avx)(%rip)
cmpl $0, %eax
L(defined_5):
js L(no_avx5)
# ifdef HAVE_AVX512_ASM_SUPPORT
cmpl $0xe6, L(have_avx)(%rip)
je L(avx512_5)
# endif
vmovdqa %ymm0, %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE
vmovdqa %ymm1, %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE
vmovdqa %ymm2, %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE
vmovdqa %ymm3, %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE
vmovdqa %ymm4, %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE
vmovdqa %ymm5, %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE
vmovdqa %ymm6, %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE
vmovdqa %ymm7, %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE
ret
# ifdef HAVE_AVX512_ASM_SUPPORT
L(avx512_5):
vmovdqu64 %zmm0, %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE
vmovdqu64 %zmm1, %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE
vmovdqu64 %zmm2, %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE
vmovdqu64 %zmm3, %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE
vmovdqu64 %zmm4, %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE
vmovdqu64 %zmm5, %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE
vmovdqu64 %zmm6, %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE
vmovdqu64 %zmm7, %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE
ret
# endif
L(no_avx5):
# endif
movdqa %xmm0, %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE
movdqa %xmm1, %fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE
movdqa %xmm2, %fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE
movdqa %xmm3, %fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE
movdqa %xmm4, %fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE
movdqa %xmm5, %fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE
movdqa %xmm6, %fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE
movdqa %xmm7, %fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE
ret
cfi_endproc
.size _dl_x86_64_save_sse, .-_dl_x86_64_save_sse
.globl _dl_x86_64_restore_sse
.type _dl_x86_64_restore_sse, @function
.align 16
cfi_startproc
_dl_x86_64_restore_sse:
# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
cmpl $0, L(have_avx)(%rip)
js L(no_avx6)
# ifdef HAVE_AVX512_ASM_SUPPORT
cmpl $0xe6, L(have_avx)(%rip)
je L(avx512_6)
# endif
vmovdqa %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE, %ymm0
vmovdqa %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE, %ymm1
vmovdqa %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE, %ymm2
vmovdqa %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE, %ymm3
vmovdqa %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE, %ymm4
vmovdqa %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE, %ymm5
vmovdqa %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE, %ymm6
vmovdqa %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE, %ymm7
ret
# ifdef HAVE_AVX512_ASM_SUPPORT
L(avx512_6):
vmovdqu64 %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE, %zmm0
vmovdqu64 %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE, %zmm1
vmovdqu64 %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE, %zmm2
vmovdqu64 %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE, %zmm3
vmovdqu64 %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE, %zmm4
vmovdqu64 %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE, %zmm5
vmovdqu64 %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE, %zmm6
vmovdqu64 %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE, %zmm7
ret
# endif
L(no_avx6):
# endif
movdqa %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE, %xmm0
movdqa %fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE, %xmm1
movdqa %fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE, %xmm2
movdqa %fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE, %xmm3
movdqa %fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE, %xmm4
movdqa %fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE, %xmm5
movdqa %fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE, %xmm6
movdqa %fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE, %xmm7
ret
cfi_endproc
.size _dl_x86_64_restore_sse, .-_dl_x86_64_restore_sse
/* movaps/movups is 1-byte shorter. */
#define VEC_SIZE 16
#define VMOVA movaps
#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
# define VMOV movaps
#else
# define VMOV movups
#endif
#define VEC(i) xmm##i
#define _dl_runtime_resolve _dl_runtime_resolve_sse
#define _dl_runtime_profile _dl_runtime_profile_sse
#undef RESTORE_AVX
#include "dl-trampoline.h"

sysdeps/x86_64/dl-trampoline.h

@@ -1,5 +1,4 @@
/* Partial PLT profile trampoline to save and restore x86-64 vector
registers.
/* PLT trampolines. x86-64 version.
Copyright (C) 2009-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -17,16 +16,252 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#ifdef RESTORE_AVX
#undef REGISTER_SAVE_AREA_RAW
#ifdef __ILP32__
/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as VEC0 to
VEC7. */
# define REGISTER_SAVE_AREA_RAW (8 * 7 + VEC_SIZE * 8)
#else
/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
BND0, BND1, BND2, BND3 and VEC0 to VEC7. */
# define REGISTER_SAVE_AREA_RAW (8 * 7 + 16 * 4 + VEC_SIZE * 8)
#endif
#undef REGISTER_SAVE_AREA
#undef LOCAL_STORAGE_AREA
#undef BASE
#if DL_RUNIME_RESOLVE_REALIGN_STACK
# define REGISTER_SAVE_AREA (REGISTER_SAVE_AREA_RAW + 8)
/* Local stack area before jumping to function address: RBX. */
# define LOCAL_STORAGE_AREA 8
# define BASE rbx
# if (REGISTER_SAVE_AREA % VEC_SIZE) != 0
# error REGISTER_SAVE_AREA must be a multiple of VEC_SIZE
# endif
#else
# define REGISTER_SAVE_AREA REGISTER_SAVE_AREA_RAW
/* Local stack area before jumping to function address: All saved
registers. */
# define LOCAL_STORAGE_AREA REGISTER_SAVE_AREA
# define BASE rsp
# if (REGISTER_SAVE_AREA % 16) != 8
# error REGISTER_SAVE_AREA must be an odd multiple of 8
# endif
#endif
.text
.globl _dl_runtime_resolve
.hidden _dl_runtime_resolve
.type _dl_runtime_resolve, @function
.align 16
cfi_startproc
_dl_runtime_resolve:
cfi_adjust_cfa_offset(16) # Incorporate PLT
#if DL_RUNIME_RESOLVE_REALIGN_STACK
# if LOCAL_STORAGE_AREA != 8
# error LOCAL_STORAGE_AREA must be 8
# endif
pushq %rbx # push decrements the stack pointer by 8.
cfi_adjust_cfa_offset(8)
cfi_rel_offset(%rbx, 0)
mov %RSP_LP, %RBX_LP
cfi_def_cfa_register(%rbx)
and $-VEC_SIZE, %RSP_LP
#endif
sub $REGISTER_SAVE_AREA, %RSP_LP
cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
# Preserve registers otherwise clobbered.
movq %rax, REGISTER_SAVE_RAX(%rsp)
movq %rcx, REGISTER_SAVE_RCX(%rsp)
movq %rdx, REGISTER_SAVE_RDX(%rsp)
movq %rsi, REGISTER_SAVE_RSI(%rsp)
movq %rdi, REGISTER_SAVE_RDI(%rsp)
movq %r8, REGISTER_SAVE_R8(%rsp)
movq %r9, REGISTER_SAVE_R9(%rsp)
VMOV %VEC(0), (REGISTER_SAVE_VEC_OFF)(%rsp)
VMOV %VEC(1), (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp)
VMOV %VEC(2), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp)
VMOV %VEC(3), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp)
VMOV %VEC(4), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp)
VMOV %VEC(5), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp)
VMOV %VEC(6), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp)
VMOV %VEC(7), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp)
#ifndef __ILP32__
# We also have to preserve bound registers. These are nops if
# Intel MPX isn't available or disabled.
# ifdef HAVE_MPX_SUPPORT
bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
# else
# if REGISTER_SAVE_BND0 == 0
.byte 0x66,0x0f,0x1b,0x04,0x24
# else
.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
# endif
.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
# endif
#endif
# Copy the args pushed by the PLT into registers.
# %rdi: link_map, %rsi: reloc_index
mov (LOCAL_STORAGE_AREA + 8)(%BASE), %RSI_LP
mov LOCAL_STORAGE_AREA(%BASE), %RDI_LP
call _dl_fixup # Call resolver.
mov %RAX_LP, %R11_LP # Save return value
#ifndef __ILP32__
# Restore bound registers. These are nops if Intel MPX isn't
# available or disabled.
# ifdef HAVE_MPX_SUPPORT
bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
# else
.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
# if REGISTER_SAVE_BND0 == 0
.byte 0x66,0x0f,0x1a,0x04,0x24
# else
.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
# endif
# endif
#endif
# Get register content back.
movq REGISTER_SAVE_R9(%rsp), %r9
movq REGISTER_SAVE_R8(%rsp), %r8
movq REGISTER_SAVE_RDI(%rsp), %rdi
movq REGISTER_SAVE_RSI(%rsp), %rsi
movq REGISTER_SAVE_RDX(%rsp), %rdx
movq REGISTER_SAVE_RCX(%rsp), %rcx
movq REGISTER_SAVE_RAX(%rsp), %rax
VMOV (REGISTER_SAVE_VEC_OFF)(%rsp), %VEC(0)
VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp), %VEC(1)
VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp), %VEC(2)
VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp), %VEC(3)
VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp), %VEC(4)
VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp), %VEC(5)
VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp), %VEC(6)
VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp), %VEC(7)
#if DL_RUNIME_RESOLVE_REALIGN_STACK
mov %RBX_LP, %RSP_LP
cfi_def_cfa_register(%rsp)
movq (%rsp), %rbx
cfi_restore(%rbx)
#endif
# Adjust stack (PLT did 2 pushes)
add $(LOCAL_STORAGE_AREA + 16), %RSP_LP
cfi_adjust_cfa_offset(-(LOCAL_STORAGE_AREA + 16))
# Preserve bound registers.
PRESERVE_BND_REGS_PREFIX
jmp *%r11 # Jump to function address.
cfi_endproc
.size _dl_runtime_resolve, .-_dl_runtime_resolve
#ifndef PROF
# if (LR_VECTOR_OFFSET % VEC_SIZE) != 0
# error LR_VECTOR_OFFSET must be a multiple of VEC_SIZE
# endif
.globl _dl_runtime_profile
.hidden _dl_runtime_profile
.type _dl_runtime_profile, @function
.align 16
_dl_runtime_profile:
cfi_startproc
cfi_adjust_cfa_offset(16) # Incorporate PLT
/* The La_x86_64_regs data structure pointed to by the
fourth parameter must be VEC_SIZE-byte aligned.  This must
be explicitly enforced.  We have to set up a dynamically
sized stack frame. %rbx points to the top half which
has a fixed size and preserves the original stack pointer. */
sub $32, %RSP_LP # Allocate the local storage.
cfi_adjust_cfa_offset(32)
movq %rbx, (%rsp)
cfi_rel_offset(%rbx, 0)
/* On the stack:
56(%rbx) parameter #1
48(%rbx) return address
40(%rbx) reloc index
32(%rbx) link_map
24(%rbx) La_x86_64_regs pointer
16(%rbx) framesize
8(%rbx) rax
(%rbx) rbx
*/
movq %rax, 8(%rsp)
mov %RSP_LP, %RBX_LP
cfi_def_cfa_register(%rbx)
/* Actively align the La_x86_64_regs structure. */
and $-VEC_SIZE, %RSP_LP
# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
/* sizeof(La_x86_64_regs). Need extra space for 8 SSE registers
to detect if any xmm0-xmm7 registers are changed by audit
module. */
sub $(LR_SIZE + XMM_SIZE*8), %RSP_LP
# else
sub $LR_SIZE, %RSP_LP # sizeof(La_x86_64_regs)
# endif
movq %rsp, 24(%rbx)
/* Fill the La_x86_64_regs structure. */
movq %rdx, LR_RDX_OFFSET(%rsp)
movq %r8, LR_R8_OFFSET(%rsp)
movq %r9, LR_R9_OFFSET(%rsp)
movq %rcx, LR_RCX_OFFSET(%rsp)
movq %rsi, LR_RSI_OFFSET(%rsp)
movq %rdi, LR_RDI_OFFSET(%rsp)
movq %rbp, LR_RBP_OFFSET(%rsp)
lea 48(%rbx), %RAX_LP
movq %rax, LR_RSP_OFFSET(%rsp)
/* We always store the XMM registers even if AVX is available.
This is to provide backward binary compatibility for existing
audit modules. */
movaps %xmm0, (LR_XMM_OFFSET)(%rsp)
movaps %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp)
movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
# ifndef __ILP32__
# ifdef HAVE_MPX_SUPPORT
bndmov %bnd0, (LR_BND_OFFSET)(%rsp) # Preserve bound
bndmov %bnd1, (LR_BND_OFFSET + BND_SIZE)(%rsp) # registers. Nops if
bndmov %bnd2, (LR_BND_OFFSET + BND_SIZE*2)(%rsp) # MPX not available
bndmov %bnd3, (LR_BND_OFFSET + BND_SIZE*3)(%rsp) # or disabled.
# else
.byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET)
.byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
.byte 0x66,0x0f,0x1b,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
.byte 0x66,0x0f,0x1b,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
# endif
# endif
# ifdef RESTORE_AVX
/* This is to support AVX audit modules. */
VMOV %VEC(0), (LR_VECTOR_OFFSET)(%rsp)
VMOV %VEC(1), (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
VMOV %VEC(2), (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
VMOV %VEC(3), (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
VMOV %VEC(4), (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
VMOV %VEC(5), (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
VMOV %VEC(6), (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
VMOV %VEC(7), (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
VMOVA %VEC(0), (LR_VECTOR_OFFSET)(%rsp)
VMOVA %VEC(1), (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
VMOVA %VEC(2), (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
VMOVA %VEC(3), (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
VMOVA %VEC(4), (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
VMOVA %VEC(5), (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
VMOVA %VEC(6), (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
VMOVA %VEC(7), (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
/* Save xmm0-xmm7 registers to detect if any of them are
changed by audit module. */
@@ -38,7 +273,7 @@
vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp)
vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp)
vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp)
#endif
# endif
mov %RSP_LP, %RCX_LP # La_x86_64_regs pointer to %rcx.
mov 48(%rbx), %RDX_LP # Load return address if needed.
@@ -63,7 +298,7 @@
movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
#ifdef RESTORE_AVX
# ifdef RESTORE_AVX
/* Check if any xmm0-xmm7 registers are changed by audit
module. */
vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
@@ -72,7 +307,7 @@
je 2f
vmovdqa %xmm0, (LR_VECTOR_OFFSET)(%rsp)
jmp 1f
2: VMOV (LR_VECTOR_OFFSET)(%rsp), %VEC(0)
2: VMOVA (LR_VECTOR_OFFSET)(%rsp), %VEC(0)
vmovdqa %xmm0, (LR_XMM_OFFSET)(%rsp)
1: vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
@@ -81,7 +316,7 @@
je 2f
vmovdqa %xmm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
jmp 1f
2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %VEC(1)
2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %VEC(1)
vmovdqa %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp)
1: vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
@@ -90,7 +325,7 @@
je 2f
vmovdqa %xmm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
jmp 1f
2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %VEC(2)
2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %VEC(2)
vmovdqa %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
1: vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
@@ -99,7 +334,7 @@
je 2f
vmovdqa %xmm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
jmp 1f
2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %VEC(3)
2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %VEC(3)
vmovdqa %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
1: vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
@@ -108,7 +343,7 @@
je 2f
vmovdqa %xmm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
jmp 1f
2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %VEC(4)
2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %VEC(4)
vmovdqa %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
1: vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
@@ -117,7 +352,7 @@
je 2f
vmovdqa %xmm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
jmp 1f
2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %VEC(5)
2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %VEC(5)
vmovdqa %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
1: vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
@@ -126,7 +361,7 @@
je 2f
vmovdqa %xmm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
jmp 1f
2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %VEC(6)
2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %VEC(6)
vmovdqa %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
1: vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
@@ -135,25 +370,25 @@
je 2f
vmovdqa %xmm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
jmp 1f
2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %VEC(7)
2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %VEC(7)
vmovdqa %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
1:
#endif
# endif
#ifndef __ILP32__
# ifdef HAVE_MPX_SUPPORT
# ifndef __ILP32__
# ifdef HAVE_MPX_SUPPORT
bndmov (LR_BND_OFFSET)(%rsp), %bnd0 # Restore bound
bndmov (LR_BND_OFFSET + BND_SIZE)(%rsp), %bnd1 # registers.
bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2
bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3
# else
# else
.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET)
.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
.byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
.byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
# endif
# endif
#endif
mov 16(%rbx), %R10_LP # Anything in framesize?
test %R10_LP, %R10_LP
@@ -168,12 +403,12 @@
movq LR_RSI_OFFSET(%rsp), %rsi
movq LR_RDI_OFFSET(%rsp), %rdi
movq %rbx, %rsp
mov %RBX_LP, %RSP_LP
movq (%rsp), %rbx
cfi_restore(rbx)
cfi_restore(%rbx)
cfi_def_cfa_register(%rsp)
addq $48, %rsp # Adjust the stack to the return value
add $48, %RSP_LP # Adjust the stack to the return value
# (eats the reloc index and link_map)
cfi_adjust_cfa_offset(-48)
PRESERVE_BND_REGS_PREFIX
@@ -189,13 +424,13 @@
temporary buffer of the size specified by the 'framesize'
returned from _dl_profile_fixup */
leaq LR_RSP_OFFSET(%rbx), %rsi # stack
addq $8, %r10
andq $0xfffffffffffffff0, %r10
movq %r10, %rcx
subq %r10, %rsp
movq %rsp, %rdi
shrq $3, %rcx
lea LR_RSP_OFFSET(%rbx), %RSI_LP # stack
add $8, %R10_LP
and $-16, %R10_LP
mov %R10_LP, %RCX_LP
sub %R10_LP, %RSP_LP
mov %RSP_LP, %RDI_LP
shr $3, %RCX_LP
rep
movsq
@@ -206,21 +441,21 @@
PRESERVE_BND_REGS_PREFIX
call *%r11
mov 24(%rbx), %rsp # Drop the copied stack content
mov 24(%rbx), %RSP_LP # Drop the copied stack content
/* Now we have to prepare the La_x86_64_retval structure for the
_dl_call_pltexit. The La_x86_64_regs is being pointed by rsp now,
so we just need to allocate the sizeof(La_x86_64_retval) space on
the stack, since the alignment has already been taken care of. */
#ifdef RESTORE_AVX
# ifdef RESTORE_AVX
/* sizeof(La_x86_64_retval). Need extra space for 2 SSE
registers to detect if xmm0/xmm1 registers are changed
by audit module. */
subq $(LRV_SIZE + XMM_SIZE*2), %rsp
#else
subq $LRV_SIZE, %rsp # sizeof(La_x86_64_retval)
#endif
movq %rsp, %rcx # La_x86_64_retval argument to %rcx.
sub $(LRV_SIZE + XMM_SIZE*2), %RSP_LP
# else
sub $LRV_SIZE, %RSP_LP # sizeof(La_x86_64_retval)
# endif
mov %RSP_LP, %RCX_LP # La_x86_64_retval argument to %rcx.
/* Fill in the La_x86_64_retval structure. */
movq %rax, LRV_RAX_OFFSET(%rcx)
@@ -229,26 +464,26 @@
movaps %xmm0, LRV_XMM0_OFFSET(%rcx)
movaps %xmm1, LRV_XMM1_OFFSET(%rcx)
#ifdef RESTORE_AVX
# ifdef RESTORE_AVX
/* This is to support AVX audit modules. */
VMOV %VEC(0), LRV_VECTOR0_OFFSET(%rcx)
VMOV %VEC(1), LRV_VECTOR1_OFFSET(%rcx)
VMOVA %VEC(0), LRV_VECTOR0_OFFSET(%rcx)
VMOVA %VEC(1), LRV_VECTOR1_OFFSET(%rcx)
/* Save xmm0/xmm1 registers to detect if they are changed
by audit module. */
vmovdqa %xmm0, (LRV_SIZE)(%rcx)
vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx)
#endif
# endif
#ifndef __ILP32__
# ifdef HAVE_MPX_SUPPORT
# ifndef __ILP32__
# ifdef HAVE_MPX_SUPPORT
bndmov %bnd0, LRV_BND0_OFFSET(%rcx) # Preserve returned bounds.
bndmov %bnd1, LRV_BND1_OFFSET(%rcx)
# else
# else
.byte 0x66,0x0f,0x1b,0x81;.long (LRV_BND0_OFFSET)
.byte 0x66,0x0f,0x1b,0x89;.long (LRV_BND1_OFFSET)
# endif
# endif
#endif
fstpt LRV_ST0_OFFSET(%rcx)
fstpt LRV_ST1_OFFSET(%rcx)
@@ -265,50 +500,47 @@
movaps LRV_XMM0_OFFSET(%rsp), %xmm0
movaps LRV_XMM1_OFFSET(%rsp), %xmm1
#ifdef RESTORE_AVX
# ifdef RESTORE_AVX
/* Check if xmm0/xmm1 registers are changed by audit module. */
vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
vpmovmskb %xmm2, %esi
cmpl $0xffff, %esi
jne 1f
VMOV LRV_VECTOR0_OFFSET(%rsp), %VEC(0)
VMOVA LRV_VECTOR0_OFFSET(%rsp), %VEC(0)
1: vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
vpmovmskb %xmm2, %esi
cmpl $0xffff, %esi
jne 1f
VMOV LRV_VECTOR1_OFFSET(%rsp), %VEC(1)
VMOVA LRV_VECTOR1_OFFSET(%rsp), %VEC(1)
1:
#endif
# endif
#ifndef __ILP32__
# ifdef HAVE_MPX_SUPPORT
# ifndef __ILP32__
# ifdef HAVE_MPX_SUPPORT
bndmov LRV_BND0_OFFSET(%rsp), %bnd0 # Restore bound registers.
bndmov LRV_BND1_OFFSET(%rsp), %bnd1
# else
# else
.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LRV_BND0_OFFSET)
.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LRV_BND1_OFFSET)
# endif
# endif
#endif
fldt LRV_ST1_OFFSET(%rsp)
fldt LRV_ST0_OFFSET(%rsp)
movq %rbx, %rsp
mov %RBX_LP, %RSP_LP
movq (%rsp), %rbx
cfi_restore(rbx)
cfi_restore(%rbx)
cfi_def_cfa_register(%rsp)
addq $48, %rsp # Adjust the stack to the return value
add $48, %RSP_LP # Adjust the stack to the return value
# (eats the reloc index and link_map)
cfi_adjust_cfa_offset(-48)
PRESERVE_BND_REGS_PREFIX
retq
#ifdef MORE_CODE
cfi_adjust_cfa_offset(48)
cfi_rel_offset(%rbx, 0)
cfi_def_cfa_register(%rbx)
# undef MORE_CODE
cfi_endproc
.size _dl_runtime_profile, .-_dl_runtime_profile
#endif

sysdeps/x86_64/ifuncmain8.c

@@ -0,0 +1,32 @@
/* Test IFUNC selector with floating-point parameters.
Copyright (C) 2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <stdlib.h>
extern float foo (float);
static int
do_test (void)
{
if (foo (2) != 3)
abort ();
return 0;
}
#define TEST_FUNCTION do_test ()
#include "../test-skeleton.c"

sysdeps/x86_64/ifuncmod8.c

@@ -0,0 +1,36 @@
/* Test IFUNC selector with floating-point parameters.
Copyright (C) 2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <emmintrin.h>
void * foo_ifunc (void) __asm__ ("foo");
__asm__(".type foo, %gnu_indirect_function");
static float
foo_impl (float x)
{
return x + 1;
}
void *
foo_ifunc (void)
{
__m128i xmm = _mm_set1_epi32 (-1);
asm volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" );
return foo_impl;
}

sysdeps/x86_64/nptl/tcb-offsets.sym

@@ -16,7 +16,6 @@ VGETCPU_CACHE_OFFSET offsetof (tcbhead_t, vgetcpu_cache)
#ifndef __ASSUME_PRIVATE_FUTEX
PRIVATE_FUTEX offsetof (tcbhead_t, private_futex)
#endif
RTLD_SAVESPACE_SSE offsetof (tcbhead_t, rtld_savespace_sse)
-- Not strictly offsets, but these values are also used in the TCB.
TCB_CANCELSTATE_BITMASK CANCELSTATE_BITMASK

sysdeps/x86_64/nptl/tls.h

@@ -67,14 +67,15 @@ typedef struct
# else
int __glibc_reserved1;
# endif
int rtld_must_xmm_save;
int __glibc_unused1;
/* Reservation of some values for the TM ABI. */
void *__private_tm[4];
/* GCC split stack support. */
void *__private_ss;
long int __glibc_reserved2;
/* Have space for the post-AVX register size. */
__128bits rtld_savespace_sse[8][4] __attribute__ ((aligned (32)));
/* Must be kept even if it is no longer used by glibc since programs,
like AddressSanitizer, depend on the size of tcbhead_t. */
__128bits __glibc_unused2[8][4] __attribute__ ((aligned (32)));
void *__padding[8];
} tcbhead_t;
@@ -384,41 +385,6 @@ typedef struct
# define THREAD_GSCOPE_WAIT() \
GL(dl_wait_lookup_done) ()
# ifdef SHARED
/* Defined in dl-trampoline.S. */
extern void _dl_x86_64_save_sse (void);
extern void _dl_x86_64_restore_sse (void);
# define RTLD_CHECK_FOREIGN_CALL \
(THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) != 0)
/* NB: Don't use the xchg operation because that would imply a lock
prefix which is expensive and unnecessary. The cache line is also
not contested at all. */
# define RTLD_ENABLE_FOREIGN_CALL \
int old_rtld_must_xmm_save = THREAD_GETMEM (THREAD_SELF, \
header.rtld_must_xmm_save); \
THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 1)
# define RTLD_PREPARE_FOREIGN_CALL \
do if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save)) \
{ \
_dl_x86_64_save_sse (); \
THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 0); \
} \
while (0)
# define RTLD_FINALIZE_FOREIGN_CALL \
do { \
if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) == 0) \
_dl_x86_64_restore_sse (); \
THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, \
old_rtld_must_xmm_save); \
} while (0)
# endif
#endif /* __ASSEMBLER__ */
#endif /* tls.h */