libitm: Remove unused code.

In particular, unused code that's presenting portability problems.

From-SVN: r181241

commit 79b1edb6b5 (parent cb8010f922)
--- a/libitm/ChangeLog
+++ b/libitm/ChangeLog
@@ -1,3 +1,26 @@
+2011-11-09  Richard Henderson  <rth@redhat.com>
+
+	* barrier.tpl, memcpy.cc, memset.cc, method-wbetl.cc: Remove file.
+	* config/alpha/unaligned.h: Remove file.
+	* config/generic/unaligned.h: Remove file.
+	* config/x86/unaligned.h: Remove file.
+	* config/generic/cachepage.h: Remove file.
+	* config/posix/cachepage.cc: Remove file.
+	* config/generic/cacheline.cc: Remove file.
+	* config/x86/cacheline.cc: Remove file.
+	* config/generic/cacheline.h (gtm_cacheline): Remove the
+	store_mask, copy_mask, copy_mask_wb methods.
+	* config/x86/cacheline.h: Likewise.
+	* config/alpha/cacheline.h: Fall back to generic after setting size.
+	* config/generic/tls.cc (gtm_mask_stack): Remove.
+	* config/x86/x86_avx.cc (GTM_vpperm_shift): Remove.
+	(GTM_vpalignr_table): Remove.
+	* config/x86/x86_sse.cc (GTM_palignr_table): Remove.
+	(GTM_pshift_table): Remove.
+	* config/libitm_i.h: Don't include cachepage.h.
+	* Makefile.am (libitm_la_SOURCES): Remove cacheline.cc, cachepage.cc
+	* Makefile.in, testsuite/Makefile.in: Rebuild.
+
 2011-11-09  Richard Henderson  <rth@redhat.com>
 
 	* config/x86/cacheline.h (gtm_cacheline::store_mask): Use .byte
--- a/libitm/Makefile.am
+++ b/libitm/Makefile.am
@@ -41,7 +41,7 @@ libitm_la_LDFLAGS = $(libitm_version_info) $(libitm_version_script) \
 
 libitm_la_SOURCES = \
 	aatree.cc alloc.cc alloc_c.cc alloc_cpp.cc barrier.cc beginend.cc \
-	clone.cc cacheline.cc cachepage.cc eh_cpp.cc local.cc \
+	clone.cc eh_cpp.cc local.cc \
 	query.cc retry.cc rwlock.cc useraction.cc util.cc \
 	sjlj.S tls.cc method-serial.cc method-gl.cc
 
--- a/libitm/Makefile.in
+++ b/libitm/Makefile.in
@@ -48,6 +48,7 @@ DIST_COMMON = $(am__configure_deps) $(srcdir)/../config.guess \
 	$(top_srcdir)/configure ChangeLog
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/../config/acx.m4 \
+	$(top_srcdir)/../config/asmcfi.m4 \
 	$(top_srcdir)/../config/depstand.m4 \
 	$(top_srcdir)/../config/enable.m4 \
 	$(top_srcdir)/../config/futex.m4 \
@@ -94,17 +95,17 @@ am__installdirs = "$(DESTDIR)$(toolexeclibdir)" "$(DESTDIR)$(infodir)" \
 LTLIBRARIES = $(toolexeclib_LTLIBRARIES)
 libitm_la_LIBADD =
 am__libitm_la_SOURCES_DIST = aatree.cc alloc.cc alloc_c.cc \
-	alloc_cpp.cc barrier.cc beginend.cc clone.cc cacheline.cc \
-	cachepage.cc eh_cpp.cc local.cc query.cc retry.cc rwlock.cc \
-	useraction.cc util.cc sjlj.S tls.cc method-serial.cc \
-	method-gl.cc x86_sse.cc x86_avx.cc futex.cc
+	alloc_cpp.cc barrier.cc beginend.cc clone.cc eh_cpp.cc \
+	local.cc query.cc retry.cc rwlock.cc useraction.cc util.cc \
+	sjlj.S tls.cc method-serial.cc method-gl.cc x86_sse.cc \
+	x86_avx.cc futex.cc
 @ARCH_X86_TRUE@am__objects_1 = x86_sse.lo x86_avx.lo
 @ARCH_FUTEX_TRUE@am__objects_2 = futex.lo
 am_libitm_la_OBJECTS = aatree.lo alloc.lo alloc_c.lo alloc_cpp.lo \
-	barrier.lo beginend.lo clone.lo cacheline.lo cachepage.lo \
-	eh_cpp.lo local.lo query.lo retry.lo rwlock.lo useraction.lo \
-	util.lo sjlj.lo tls.lo method-serial.lo method-gl.lo \
-	$(am__objects_1) $(am__objects_2)
+	barrier.lo beginend.lo clone.lo eh_cpp.lo local.lo query.lo \
+	retry.lo rwlock.lo useraction.lo util.lo sjlj.lo tls.lo \
+	method-serial.lo method-gl.lo $(am__objects_1) \
+	$(am__objects_2)
 libitm_la_OBJECTS = $(am_libitm_la_OBJECTS)
 DEFAULT_INCLUDES = -I.@am__isrc@
 depcomp = $(SHELL) $(top_srcdir)/../depcomp
@@ -234,8 +235,6 @@ ECHO_N = @ECHO_N@
 ECHO_T = @ECHO_T@
 EGREP = @EGREP@
 EXEEXT = @EXEEXT@
-FC = @FC@
-FCFLAGS = @FCFLAGS@
 FGREP = @FGREP@
 GREP = @GREP@
 INSTALL = @INSTALL@
@@ -286,7 +285,6 @@ abs_top_srcdir = @abs_top_srcdir@
 ac_ct_CC = @ac_ct_CC@
 ac_ct_CXX = @ac_ct_CXX@
 ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
-ac_ct_FC = @ac_ct_FC@
 am__include = @am__include@
 am__leading_dot = @am__leading_dot@
 am__quote = @am__quote@
@@ -371,10 +369,9 @@ libitm_la_LDFLAGS = $(libitm_version_info) $(libitm_version_script) \
 	-no-undefined
 
 libitm_la_SOURCES = aatree.cc alloc.cc alloc_c.cc alloc_cpp.cc \
-	barrier.cc beginend.cc clone.cc cacheline.cc cachepage.cc \
-	eh_cpp.cc local.cc query.cc retry.cc rwlock.cc useraction.cc \
-	util.cc sjlj.S tls.cc method-serial.cc method-gl.cc \
-	$(am__append_1) $(am__append_2)
+	barrier.cc beginend.cc clone.cc eh_cpp.cc local.cc query.cc \
+	retry.cc rwlock.cc useraction.cc util.cc sjlj.S tls.cc \
+	method-serial.cc method-gl.cc $(am__append_1) $(am__append_2)
 
 # Automake Documentation:
 # If your package has Texinfo files in many directories, you can use the
@@ -500,8 +497,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/alloc_cpp.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/barrier.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/beginend.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cacheline.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cachepage.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/clone.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/eh_cpp.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/futex.Plo@am__quote@
--- a/libitm/barrier.tpl
+++ /dev/null
@@ -1,170 +0,0 @@
-/* -*- c++ -*- */
-/* Copyright (C) 2008, 2009, 2011 Free Software Foundation, Inc.
-   Contributed by Richard Henderson <rth@redhat.com>.
-
-   This file is part of the GNU Transactional Memory Library (libitm).
-
-   Libitm is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 3 of the License, or
-   (at your option) any later version.
-
-   Libitm is distributed in the hope that it will be useful, but WITHOUT ANY
-   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-   more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include "unaligned.h"
-
-namespace {
-
-using namespace GTM;
-
-template<typename T>
-T do_read (const T *ptr, abi_dispatch::lock_type lock)
-{
-  //
-  // Find the cacheline that holds the current value of *PTR.
-  //
-  abi_dispatch *disp = abi_disp();
-  uintptr_t iptr = reinterpret_cast<uintptr_t>(ptr);
-  // Normalize PTR by chopping off the bottom bits so we can search
-  // for PTR in the cacheline hash.
-  uintptr_t iline = iptr & -CACHELINE_SIZE;
-  // The position in the resulting cacheline where *PTR is actually stored.
-  uintptr_t iofs = iptr & (CACHELINE_SIZE - 1);
-  const gtm_cacheline *pline = reinterpret_cast<const gtm_cacheline *>(iline);
-  // Search for the actual cacheline that holds the current value of *PTR.
-  const gtm_cacheline *line = disp->read_lock(pline, lock);
-
-  // Point to the position in the cacheline where *PTR is stored.
-  ptr = reinterpret_cast<const T *>(&line->b[iofs]);
-
-  // Straight up loads, because we're either aligned, or we don't care
-  // about alignment.
-  //
-  // If we require alignment on type T, do a straight load if we're
-  // aligned.  Otherwise do a straight load IFF the load fits entirely
-  // in this cacheline.  That is, it won't span multiple cachelines.
-  if (__builtin_expect (strict_alignment<T>::value
-                        ? (iofs & (sizeof (T) - 1)) == 0
-                        : iofs + sizeof(T) <= CACHELINE_SIZE, 1))
-    {
-    do_normal_load:
-      return *ptr;
-    }
-  // If alignment on T is necessary, but we're unaligned, yet we fit
-  // entirely in this cacheline... do the unaligned load dance.
-  else if (__builtin_expect (strict_alignment<T>::value
-                             && iofs + sizeof(T) <= CACHELINE_SIZE, 1))
-    {
-    do_unaligned_load:
-      return unaligned_load<T>(ptr);
-    }
-  // Otherwise, this load will span multiple cachelines.
-  else
-    {
-      // Get the following cacheline for the rest of the data.
-      const gtm_cacheline *line2 = disp->read_lock(pline + 1, lock);
-
-      // If the two cachelines are adjacent, just load it all in one
-      // swoop.
-      if (line2 == line + 1)
-        {
-          if (!strict_alignment<T>::value)
-            goto do_normal_load;
-          else
-            goto do_unaligned_load;
-        }
-      else
-        {
-          // Otherwise, ask the backend to load from two different
-          // cachelines.
-          return unaligned_load2<T>(line, line2, iofs);
-        }
-    }
-}
-
-template<typename T>
-void do_write (T *ptr, T val, abi_dispatch::lock_type lock)
-{
-  // Note: See comments for do_read() above for hints on this
-  // function.  Ideally we should abstract out a lot out of these two
-  // functions, and avoid all this duplication.
-
-  abi_dispatch *disp = abi_disp();
-  uintptr_t iptr = reinterpret_cast<uintptr_t>(ptr);
-  uintptr_t iline = iptr & -CACHELINE_SIZE;
-  uintptr_t iofs = iptr & (CACHELINE_SIZE - 1);
-  gtm_cacheline *pline = reinterpret_cast<gtm_cacheline *>(iline);
-  gtm_cacheline_mask m = ((gtm_cacheline_mask)2 << (sizeof(T) - 1)) - 1;
-  abi_dispatch::mask_pair pair = disp->write_lock(pline, lock);
-
-  ptr = reinterpret_cast<T *>(&pair.line->b[iofs]);
-
-  if (__builtin_expect (strict_alignment<T>::value
-                        ? (iofs & (sizeof (val) - 1)) == 0
-                        : iofs + sizeof(val) <= CACHELINE_SIZE, 1))
-    {
-      *pair.mask |= m << iofs;
-    do_normal_store:
-      *ptr = val;
-    }
-  else if (__builtin_expect (strict_alignment<T>::value
-                             && iofs + sizeof(val) <= CACHELINE_SIZE, 1))
-    {
-      *pair.mask |= m << iofs;
-    do_unaligned_store:
-      unaligned_store<T>(ptr, val);
-    }
-  else
-    {
-      *pair.mask |= m << iofs;
-      abi_dispatch::mask_pair pair2 = disp->write_lock(pline + 1, lock);
-
-      uintptr_t ileft = CACHELINE_SIZE - iofs;
-      *pair2.mask |= m >> ileft;
-
-      if (pair2.line == pair.line + 1)
-        {
-          if (!strict_alignment<T>::value)
-            goto do_normal_store;
-          else
-            goto do_unaligned_store;
-        }
-      else
-        unaligned_store2<T>(pair.line, pair2.line, iofs, val);
-    }
-}
-
-} /* anonymous namespace */
-
-#define ITM_READ(T, LOCK) \
-_ITM_TYPE_##T ITM_REGPARM _ITM_##LOCK##T (const _ITM_TYPE_##T *ptr) \
-{ \
-  return do_read (ptr, abi_dispatch::LOCK); \
-}
-
-#define ITM_WRITE(T, LOCK) \
-void ITM_REGPARM _ITM_##LOCK##T (_ITM_TYPE_##T *ptr, _ITM_TYPE_##T val) \
-{ \
-  do_write (ptr, val, abi_dispatch::LOCK); \
-}
-
-#define ITM_BARRIERS(T) \
-  ITM_READ(T, R) \
-  ITM_READ(T, RaR) \
-  ITM_READ(T, RaW) \
-  ITM_READ(T, RfW) \
-  ITM_WRITE(T, W) \
-  ITM_WRITE(T, WaR) \
-  ITM_WRITE(T, WaW)
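A note on the do_read/do_write logic in the removed barrier template above: when an access straddles a cacheline boundary, the value is reassembled from the tail of one line and the head of the next. Below is a minimal, self-contained sketch of that recombination; the `Cacheline` type and 64-byte size are hypothetical stand-ins, and the offset bookkeeping is written so the self-check passes rather than copied verbatim from the deleted code.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Hypothetical stand-in for the library's cacheline type.
constexpr size_t kCachelineSize = 64;
struct Cacheline { unsigned char b[kCachelineSize]; };

// Reassemble a T whose bytes start at offset `ofs` in c1 and spill into c2:
// the first (kCachelineSize - ofs) bytes come from c1, the rest from c2.
template <typename T>
T load_spanning(const Cacheline *c1, const Cacheline *c2, size_t ofs)
{
  size_t left = kCachelineSize - ofs;
  T ret;
  std::memcpy(&ret, &c1->b[ofs], left);
  std::memcpy(reinterpret_cast<char *>(&ret) + left, c2->b, sizeof(T) - left);
  return ret;
}

int main()
{
  // Two adjacent "cachelines" backed by one buffer holding 0, 1, 2, ..., 127.
  unsigned char buf[2 * kCachelineSize];
  for (size_t i = 0; i < sizeof buf; ++i) buf[i] = (unsigned char)i;
  const Cacheline *c1 = reinterpret_cast<const Cacheline *>(buf);

  // A 4-byte load starting 2 bytes before the line boundary (ofs = 62).
  uint32_t v = load_spanning<uint32_t>(c1, c1 + 1, 62);
  uint32_t expect;
  std::memcpy(&expect, buf + 62, sizeof expect);  // straight unaligned load
  std::printf("spanning=%08x direct=%08x\n", v, expect);
  return v == expect ? 0 : 1;
}
```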
--- a/libitm/config/alpha/cacheline.h
+++ b/libitm/config/alpha/cacheline.h
@@ -33,90 +33,6 @@
 // modification mask, below.
 #define CACHELINE_SIZE 64
 
-#ifdef __alpha_bwx__
-# include "config/generic/cacheline.h"
-#else
-// If we don't have byte-word stores, then we'll never be able to
-// adjust *all* of the byte loads/stores to be truely atomic.  So
-// only guarantee 4-byte aligned values atomicly stored, exactly
-// like the native system.  Use byte zap instructions to accelerate
-// sub-word masked stores.
-
-namespace GTM HIDDEN {
-
-// A gtm_cacheline_mask stores a modified bit for every modified byte
-// in the cacheline with which it is associated.
-typedef sized_integral<CACHELINE_SIZE / 8>::type gtm_cacheline_mask;
-
-union gtm_cacheline
-{
-  // Byte access to the cacheline.
-  unsigned char b[CACHELINE_SIZE] __attribute__((aligned(CACHELINE_SIZE)));
-
-  // Larger sized access to the cacheline.
-  uint16_t u16[CACHELINE_SIZE / sizeof(uint16_t)];
-  uint32_t u32[CACHELINE_SIZE / sizeof(uint32_t)];
-  uint64_t u64[CACHELINE_SIZE / sizeof(uint64_t)];
-  gtm_word w[CACHELINE_SIZE / sizeof(gtm_word)];
-
-  // Store S into D, but only the bytes specified by M.
-  static void store_mask(uint32_t *d, uint32_t s, uint8_t m);
-  static void store_mask(uint64_t *d, uint64_t s, uint8_t m);
-
-  // Copy S to D, but only the bytes specified by M.
-  static void copy_mask (gtm_cacheline * __restrict d,
-                         const gtm_cacheline * __restrict s,
-                         gtm_cacheline_mask m);
-
-  // A write barrier to emit after (a series of) copy_mask.
-  static void copy_mask_wb () { atomic_write_barrier(); }
-};
-
-inline void ALWAYS_INLINE
-gtm_cacheline::store_mask (uint32_t *d, uint32_t s, uint8_t m)
-{
-  const uint8_t tm = (1 << sizeof(uint32_t)) - 1;
-
-  m &= tm;
-  if (__builtin_expect (m, tm))
-    {
-      if (__builtin_expect (m == tm, 1))
-        *d = s;
-      else
-        *d = __builtin_alpha_zap (*d, m) | __builtin_alpha_zapnot (s, m);
-    }
-}
-
-inline void ALWAYS_INLINE
-gtm_cacheline::store_mask (uint64_t *d, uint64_t s, uint8_t m)
-{
-  if (__builtin_expect (m, 0xff))
-    {
-      if (__builtin_expect (m == 0xff, 1))
-        *d = s;
-      else
-        {
-          typedef uint32_t *p32 __attribute__((may_alias));
-          p32 d32 = reinterpret_cast<p32>(d);
-
-          if ((m & 0x0f) == 0x0f)
-            {
-              d32[0] = s;
-              m &= 0xf0;
-            }
-          else if ((m & 0xf0) == 0xf0)
-            {
-              d32[1] = s >> 32;
-              m &= 0x0f;
-            }
-
-          if (m)
-            *d = __builtin_alpha_zap (*d, m) | __builtin_alpha_zapnot (s, m);
-        }
-    }
-}
-
-} // namespace GTM
-
-#endif // __alpha_bwx__
+#include "config/generic/cacheline.h"
 
 #endif // LIBITM_ALPHA_CACHELINE_H
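The Alpha store_mask deleted above leans on __builtin_alpha_zap/zapnot, which clear or keep whole bytes selected by a bit mask. For context, here is a portable C++ sketch of the same byte-merge with the builtins replaced by an explicit byte-mask expansion; little-endian layout and one mask bit per byte are assumed, as on Alpha.

```cpp
#include <cstdint>
#include <cstdio>

// Expand 8 mask bits into a 64-bit byte mask (bit i -> byte i = 0xff).
static uint64_t byte_mask(uint8_t m)
{
  uint64_t bm = 0;
  for (int i = 0; i < 8; ++i)
    if (m & (1u << i))
      bm |= (uint64_t)0xff << (8 * i);
  return bm;
}

// Merge only the selected bytes of s over d:
// equivalent to zap(d, m) | zapnot(s, m).
static void store_mask_u64(uint64_t *d, uint64_t s, uint8_t m)
{
  uint64_t bm = byte_mask(m);
  *d = (*d & ~bm) | (s & bm);
}

int main()
{
  uint64_t d = 0x1111111111111111ull;
  store_mask_u64(&d, 0xaabbccddeeff0099ull, 0x0f);  // low four bytes only
  std::printf("%016llx\n", (unsigned long long)d);  // 11111111eeff0099
  return 0;
}
```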
--- a/libitm/config/alpha/unaligned.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/* Copyright (C) 2009, 2011 Free Software Foundation, Inc.
-   Contributed by Richard Henderson <rth@redhat.com>.
-
-   This file is part of the GNU Transactional Memory Library (libitm).
-
-   Libitm is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 3 of the License, or
-   (at your option) any later version.
-
-   Libitm is distributed in the hope that it will be useful, but WITHOUT ANY
-   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-   more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#ifndef LIBITM_ALPHA_UNALIGNED_H
-#define LIBITM_ALPHA_UNALIGNED_H 1
-
-#define HAVE_ARCH_UNALIGNED_LOAD2_U2 1
-#define HAVE_ARCH_UNALIGNED_LOAD2_U4 1
-#define HAVE_ARCH_UNALIGNED_LOAD2_U8 1
-
-#ifndef __alpha_bwx__
-#define HAVE_ARCH_UNALIGNED_STORE2_U2 1
-#endif
-#define HAVE_ARCH_UNALIGNED_STORE2_U4 1
-#define HAVE_ARCH_UNALIGNED_STORE2_U8 1
-
-#include "config/generic/unaligned.h"
-
-namespace GTM HIDDEN {
-
-template<>
-inline uint16_t ALWAYS_INLINE
-unaligned_load2<uint16_t>(const gtm_cacheline *c1,
-                          const gtm_cacheline *c2, size_t ofs)
-{
-  uint64_t v1 = c1->u64[CACHELINE_SIZE / sizeof(uint64_t) - 1];
-  uint64_t v2 = c2->u64[0];
-
-  return __builtin_alpha_extwl (v1, ofs) | __builtin_alpha_extwh (v2, ofs);
-}
-
-template<>
-inline uint32_t ALWAYS_INLINE
-unaligned_load2<uint32_t>(const gtm_cacheline *c1,
-                          const gtm_cacheline *c2, size_t ofs)
-{
-  uint64_t v1 = c1->u64[CACHELINE_SIZE / sizeof(uint64_t) - 1];
-  uint64_t v2 = c2->u64[0];
-
-  return __builtin_alpha_extll (v1, ofs) + __builtin_alpha_extlh (v2, ofs);
-}
-
-template<>
-inline uint64_t ALWAYS_INLINE
-unaligned_load2<uint64_t>(const gtm_cacheline *c1,
-                          const gtm_cacheline *c2, size_t ofs)
-{
-  uint64_t v1 = c1->u64[CACHELINE_SIZE / sizeof(uint64_t) - 1];
-  uint64_t v2 = c2->u64[0];
-
-  return __builtin_alpha_extql (v1, ofs) | __builtin_alpha_extqh (v2, ofs);
-}
-
-#ifndef __alpha_bwx__
-template<>
-inline void
-unaligned_store2<uint16_t>(gtm_cacheline *c1, gtm_cacheline *c2,
-                           size_t ofs, uint16_t val)
-{
-  uint32_t vl = (uint32_t)val << 24, vh = val >> 8;
-
-  gtm_cacheline::store_mask (&c1->u32[CACHELINE_SIZE / 4 - 1], vl, 4);
-  gtm_cacheline::store_mask (&c2->u32[0], vh, 1);
-}
-#endif
-
-template<>
-inline void
-unaligned_store2<uint32_t>(gtm_cacheline *c1, gtm_cacheline *c2,
-                           size_t ofs, uint32_t val)
-{
-  uint64_t vl = __builtin_alpha_insll (val, ofs);
-  uint64_t ml = __builtin_alpha_insll (~0u, ofs);
-  uint64_t vh = __builtin_alpha_inslh (val, ofs);
-  uint64_t mh = __builtin_alpha_inslh (~0u, ofs);
-
-  gtm_cacheline::store_mask (&c1->u64[CACHELINE_SIZE / 8 - 1], vl, ml);
-  gtm_cacheline::store_mask (&c2->u64[0], vh, mh);
-}
-
-template<>
-inline void
-unaligned_store2<uint64_t>(gtm_cacheline *c1, gtm_cacheline *c2,
-                           size_t ofs, uint64_t val)
-{
-  uint64_t vl = __builtin_alpha_insql (val, ofs);
-  uint64_t ml = __builtin_alpha_insql (~0u, ofs);
-  uint64_t vh = __builtin_alpha_insqh (val, ofs);
-  uint64_t mh = __builtin_alpha_insqh (~0u, ofs);
-
-  gtm_cacheline::store_mask (&c1->u64[CACHELINE_SIZE / 8 - 1], vl, ml);
-  gtm_cacheline::store_mask (&c2->u64[0], vh, mh);
-}
-
-} // namespace GTM
-
-#endif // LIBITM_ALPHA_UNALIGNED_H
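For reference, the ext*l/ext*h pairs used above extract the low and high fragments of a value that straddles two 64-bit words. A hedged portable sketch of that recombination using plain shifts (little-endian; `ofs` is a byte offset, and the shift-by-64 case is guarded explicitly since it would be undefined in C++):

```cpp
#include <cstdint>
#include <cstdio>

// Paste together the 8 bytes starting at byte offset `ofs` across two
// adjacent 64-bit words, in the spirit of extql/extqh.
static uint64_t load2_u64(uint64_t lo_word, uint64_t hi_word, unsigned ofs)
{
  unsigned s = (ofs & 7) * 8;
  if (s == 0)
    return lo_word;                       // already word-aligned
  return (lo_word >> s) | (hi_word << (64 - s));
}

int main()
{
  // Bytes 0..15 laid out across two adjacent 64-bit words.
  uint64_t w0 = 0x0706050403020100ull, w1 = 0x0f0e0d0c0b0a0908ull;
  std::printf("%016llx\n", (unsigned long long)load2_u64(w0, w1, 3));
  // -> 0a09080706050403: the 8 bytes starting at offset 3.
  return 0;
}
```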
--- a/libitm/config/generic/cacheline.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (C) 2009, 2011 Free Software Foundation, Inc.
-   Contributed by Richard Henderson <rth@redhat.com>.
-
-   This file is part of the GNU Transactional Memory Library (libitm).
-
-   Libitm is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 3 of the License, or
-   (at your option) any later version.
-
-   Libitm is distributed in the hope that it will be useful, but WITHOUT ANY
-   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-   more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include "libitm_i.h"
-
-
-namespace GTM HIDDEN {
-
-void
-gtm_cacheline::copy_mask (gtm_cacheline * __restrict d,
-                          const gtm_cacheline * __restrict s,
-                          gtm_cacheline_mask m)
-{
-  const size_t n = sizeof (gtm_word);
-
-  if (m == (gtm_cacheline_mask) -1)
-    {
-      *d = *s;
-      return;
-    }
-  if (__builtin_expect (m == 0, 0))
-    return;
-
-  for (size_t i = 0; i < CACHELINE_SIZE / n; ++i, m >>= n)
-    store_mask (&d->w[i], s->w[i], m);
-}
-
-} // namespace GTM
--- a/libitm/config/generic/cacheline.h
+++ b/libitm/config/generic/cacheline.h
@@ -51,57 +51,8 @@ union gtm_cacheline
   uint32_t u32[CACHELINE_SIZE / sizeof(uint32_t)];
   uint64_t u64[CACHELINE_SIZE / sizeof(uint64_t)];
   gtm_word w[CACHELINE_SIZE / sizeof(gtm_word)];
-
-  // Store S into D, but only the bytes specified by M.
-  template<typename T> static void store_mask (T *d, T s, uint8_t m);
-
-  // Copy S to D, but only the bytes specified by M.
-  static void copy_mask (gtm_cacheline * __restrict d,
-                         const gtm_cacheline * __restrict s,
-                         gtm_cacheline_mask m);
-
-  // A write barrier to emit after (a series of) copy_mask.
-  // When we're emitting non-temporal stores, the normal strong
-  // ordering of the machine doesn't apply.
-  static void copy_mask_wb () { atomic_write_barrier(); }
 };
 
-template<typename T>
-inline void
-gtm_cacheline::store_mask (T *d, T s, uint8_t m)
-{
-  const uint8_t tm = (1 << sizeof(T)) - 1;
-
-  if (__builtin_expect (m & tm, tm))
-    {
-      if (__builtin_expect ((m & tm) == tm, 1))
-        *d = s;
-      else
-        {
-          const int half = sizeof(T) / 2;
-          typedef typename sized_integral<half>::type half_t;
-          half_t *dhalf = reinterpret_cast<half_t *>(d);
-          half_t s1, s2;
-
-          if (WORDS_BIGENDIAN)
-            s1 = s >> half*8, s2 = s;
-          else
-            s1 = s, s2 = s >> half*8;
-
-          store_mask (dhalf, s1, m);
-          store_mask (dhalf + 1, s2, m >> half);
-        }
-    }
-}
-
-template<>
-inline void ALWAYS_INLINE
-gtm_cacheline::store_mask<uint8_t> (uint8_t *d, uint8_t s, uint8_t m)
-{
-  if (m & 1)
-    *d = s;
-}
-
 } // namespace GTM
 
 #endif // LIBITM_CACHELINE_H
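The generic store_mask removed above recurses from a T-sized masked store down to byte stores, halving the type and shifting the mask each step. The following is a compilable sketch of the little-endian case, specialized by hand for uint16_t/uint8_t instead of the original template recursion:

```cpp
#include <cstdint>
#include <cstdio>

// Base case: one mask bit decides whether the byte is written.
static void store_mask_u8(uint8_t *d, uint8_t s, uint8_t m)
{
  if (m & 1)
    *d = s;
}

// Recursive step: a full mask is a plain store; otherwise split into two
// half-sized masked stores, passing (m >> half) to the high half.
static void store_mask_u16(uint16_t *d, uint16_t s, uint8_t m)
{
  const uint8_t tm = (1 << sizeof(uint16_t)) - 1;   // 0b11
  if ((m & tm) == tm)
    *d = s;
  else
    {
      uint8_t *dh = reinterpret_cast<uint8_t *>(d);
      store_mask_u8(dh, (uint8_t)s, m);             // low byte, bit 0
      store_mask_u8(dh + 1, (uint8_t)(s >> 8), m >> 1);  // high byte, bit 1
    }
}

int main()
{
  uint16_t d = 0x1122;
  store_mask_u16(&d, 0xaabb, 0x2);   // keep byte 0, replace byte 1
  std::printf("%04x\n", d);          // aa22 on little-endian
  return 0;
}
```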
--- a/libitm/config/generic/cachepage.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (C) 2009, 2011 Free Software Foundation, Inc.
-   Contributed by Richard Henderson <rth@redhat.com>.
-
-   This file is part of the GNU Transactional Memory Library (libitm).
-
-   Libitm is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 3 of the License, or
-   (at your option) any later version.
-
-   Libitm is distributed in the hope that it will be useful, but WITHOUT ANY
-   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-   more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#ifndef LIBITM_CACHEPAGE_H
-#define LIBITM_CACHEPAGE_H 1
-
-namespace GTM HIDDEN {
-
-// A "page" worth of saved cachelines plus modification masks.  This
-// arrangement is intended to minimize the overhead of alignment.  The
-// PAGE_SIZE defined by the target must be a constant for this to work,
-// which means that this definition may not be the same as the real
-// system page size.  An additional define of FIXED_PAGE_SIZE by the
-// target indicates that PAGE_SIZE exactly matches the system page size.
-
-#ifndef PAGE_SIZE
-#define PAGE_SIZE 4096
-#endif
-
-struct gtm_cacheline_page
-{
-  static const size_t LINES
-    = ((PAGE_SIZE - sizeof(gtm_cacheline_page *))
-       / (CACHELINE_SIZE + sizeof(gtm_cacheline_mask)));
-
-  gtm_cacheline lines[LINES] __attribute__((aligned(PAGE_SIZE)));
-  gtm_cacheline_mask masks[LINES];
-  gtm_cacheline_page *prev;
-
-  static gtm_cacheline_page *
-  page_for_line (gtm_cacheline *c)
-  {
-    return (gtm_cacheline_page *)((uintptr_t)c & -PAGE_SIZE);
-  }
-
-  gtm_cacheline_mask *
-  mask_for_line (gtm_cacheline *c)
-  {
-    size_t index = c - &this->lines[0];
-    return &this->masks[index];
-  }
-
-  static gtm_cacheline_mask *
-  mask_for_page_line (gtm_cacheline *c)
-  {
-    gtm_cacheline_page *p = page_for_line (c);
-    return p->mask_for_line (c);
-  }
-
-  static void *operator new (size_t);
-  static void operator delete (void *);
-};
-
-} // namespace GTM
-
-#endif // LIBITM_CACHEPAGE_H
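The deleted gtm_cacheline_page locates its containing page header by masking the low address bits of a cacheline pointer. A small demo of that arithmetic, assuming the 4096-byte PAGE_SIZE and 64-byte lines from the removed header:

```cpp
#include <cstdint>
#include <cstdio>

constexpr uintptr_t kPageSize = 4096;

// Round an address down to its page boundary; same as addr & -kPageSize
// in the removed page_for_line.
static uintptr_t page_base(uintptr_t addr)
{
  return addr & ~(kPageSize - 1);
}

int main()
{
  uintptr_t line = 0x12345a80;   // hypothetical cacheline address
  uintptr_t base = page_base(line);
  std::printf("line %#lx -> page %#lx, line index %lu\n",
              (unsigned long)line, (unsigned long)base,
              (unsigned long)((line - base) / 64));
  return 0;
}
```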
--- a/libitm/config/generic/tls.cc
+++ b/libitm/config/generic/tls.cc
@@ -30,51 +30,4 @@ namespace GTM HIDDEN {
 __thread gtm_thread_tls _gtm_thr_tls;
 #endif
 
-// Filter out any updates that overlap the libitm stack, as defined by
-// TOP (entry point to library) and BOT (below current function).  This
-// definition should be fine for all stack-grows-down architectures.
-
-gtm_cacheline_mask __attribute__((noinline))
-gtm_mask_stack(gtm_cacheline *line, gtm_cacheline_mask mask)
-{
-  void *top = gtm_thr()->jb.cfa;
-  void *bot = __builtin_dwarf_cfa();
-
-  // We must have come through an entry point that set TOP.
-  assert (top != NULL);
-
-  if (line + 1 < bot)
-    {
-      // Since we don't have the REAL stack boundaries for this thread,
-      // we cannot know if this is a dead write to a stack address below
-      // the current function or if it is write to another VMA.  In either
-      // case allowing the write should not affect correctness.
-    }
-  else if (line >= top)
-    {
-      // A valid write to an address in an outer stack frame, or a write
-      // to another VMA.
-    }
-  else
-    {
-      uintptr_t diff = (uintptr_t)top - (uintptr_t)line;
-      if (diff >= CACHELINE_SIZE)
-        {
-          // The write is either fully within the proscribed area, or the tail
-          // of the cacheline overlaps the proscribed area.  Assume that all
-          // stacks are at least cacheline aligned and declare the head of the
-          // cacheline dead.
-          mask = 0;
-        }
-      else
-        {
-          // The head of the cacheline is within the proscribed area, but the
-          // tail of the cacheline is live.  Eliminate the dead writes.
-          mask &= (gtm_cacheline_mask)-1 << diff;
-        }
-    }
-
-  return mask;
-}
-
 } // namespace GTM
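gtm_mask_stack, removed above, filtered modified-byte masks against the protected stack region between BOT and TOP. A sketch of that filtering policy as a pure function over addresses; the boundary handling is an approximation of the removed code (uintptr_t arithmetic in place of the original pointer comparisons), not a verbatim port:

```cpp
#include <cstdint>
#include <cstdio>

typedef uint64_t mask_t;            // one bit per byte of a 64-byte line
constexpr uintptr_t kLine = 64;

// Drop the modified-byte bits of a cacheline at `line` that fall inside the
// protected region [bot, top). Assumes, like the original, that stacks are
// at least cacheline aligned.
static mask_t mask_stack(uintptr_t line, mask_t mask,
                         uintptr_t bot, uintptr_t top)
{
  if (line + kLine <= bot || line >= top)
    return mask;                    // entirely outside: keep all writes
  uintptr_t diff = top - line;
  if (diff >= kLine)
    return 0;                       // whole line inside: all writes dead
  return mask & ((mask_t)-1 << diff);  // head dead, tail (>= top) live
}

int main()
{
  // A line that straddles the top of the protected region by 16 bytes.
  uintptr_t top = 0x10000, line = top - 16;
  std::printf("%016llx\n",
              (unsigned long long)mask_stack(line, (mask_t)-1, 0xf000, top));
  // -> ffffffffffff0000: the 16 bytes below `top` are filtered out.
  return 0;
}
```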
--- a/libitm/config/generic/unaligned.h
+++ /dev/null
@@ -1,228 +0,0 @@
-/* Copyright (C) 2009, 2011 Free Software Foundation, Inc.
-   Contributed by Richard Henderson <rth@redhat.com>.
-
-   This file is part of the GNU Transactional Memory Library (libitm).
-
-   Libitm is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 3 of the License, or
-   (at your option) any later version.
-
-   Libitm is distributed in the hope that it will be useful, but WITHOUT ANY
-   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-   more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#ifndef LIBITM_UNALIGNED_H
-#define LIBITM_UNALIGNED_H 1
-
-namespace GTM HIDDEN {
-
-#ifndef STRICT_ALIGNMENT
-#define STRICT_ALIGNMENT 1
-#endif
-
-// A type trait for whether type T requires strict alignment.
-// The generic types are assumed to all be the same; specializations
-// for target-specific types should be done in config/cpu/unaligned.h.
-template<typename T>
-  struct strict_alignment
-    : public std::integral_constant<bool, STRICT_ALIGNMENT>
-  { };
-
-// A helper template for accessing an integral type the same size as T
-template<typename T>
-  struct make_integral
-    : public sized_integral<sizeof(T)>
-  { };
-
-// A helper class for accessing T as an unaligned value.
-template<typename T>
-  struct __attribute__((packed)) unaligned_helper
-  { T x; };
-
-// A helper class for view-converting T as an integer.
-template<typename T>
-  union view_convert_helper
-  {
-    typedef T type;
-    typedef make_integral<T> itype;
-
-    type t;
-    itype i;
-  };
-
-// Generate an unaligned load sequence.
-// The compiler knows how to do this for any specific type.
-template<typename T>
-inline T ALWAYS_INLINE
-unaligned_load(const void *t)
-{
-  typedef unaligned_helper<T> UT;
-  const UT *ut = reinterpret_cast<const UT *>(t);
-  return ut->x;
-}
-
-// Generate an unaligned store sequence.
-template<typename T>
-inline void ALWAYS_INLINE
-unaligned_store(void *t, T val)
-{
-  typedef unaligned_helper<T> UT;
-  UT *ut = reinterpret_cast<UT *>(t);
-  ut->x = val;
-}
-
-// Generate an unaligned load from two different cachelines.
-// It is known that OFS + SIZEOF(T) > CACHELINE_SIZE.
-template<typename T>
-inline T ALWAYS_INLINE
-unaligned_load2(const gtm_cacheline *c1, const gtm_cacheline *c2, size_t ofs)
-{
-  size_t left = CACHELINE_SIZE - ofs;
-  T ret;
-
-  memcpy (&ret, &c1->b[ofs], left);
-  memcpy ((char *)&ret + ofs, c2, sizeof(T) - left);
-
-  return ret;
-}
-
-// Generate an unaligned store into two different cachelines.
-// It is known that OFS + SIZEOF(T) > CACHELINE_SIZE.
-template<typename T>
-inline void ALWAYS_INLINE
-unaligned_store2(gtm_cacheline *c1, gtm_cacheline *c2, size_t ofs, T val)
-{
-  size_t left = CACHELINE_SIZE - ofs;
-  memcpy (&c1->b[ofs], &val, left);
-  memcpy (c2, (char *)&val + left, sizeof(T) - left);
-}
-
-#ifndef HAVE_ARCH_UNALIGNED_LOAD2_U2
-template<>
-inline uint16_t ALWAYS_INLINE
-unaligned_load2<uint16_t>(const gtm_cacheline *c1,
-                          const gtm_cacheline *c2, size_t ofs)
-{
-  uint16_t v1 = c1->b[CACHELINE_SIZE - 1];
-  uint16_t v2 = c2->b[0];
-
-  if (WORDS_BIGENDIAN)
-    return v1 << 8 | v2;
-  else
-    return v2 << 8 | v1;
-}
-#endif
-
-#ifndef HAVE_ARCH_UNALIGNED_LOAD2_U4
-template<>
-inline uint32_t ALWAYS_INLINE
-unaligned_load2<uint32_t>(const gtm_cacheline *c1,
-                          const gtm_cacheline *c2, size_t ofs)
-{
-  uint32_t v1 = c1->u32[CACHELINE_SIZE / sizeof(uint32_t) - 1];
-  uint32_t v2 = c2->u32[0];
-  int s2 = (ofs & (sizeof(uint32_t) - 1)) * 8;
-  int s1 = sizeof(uint32_t) * 8 - s2;
-
-  if (WORDS_BIGENDIAN)
-    return v1 << s2 | v2 >> s1;
-  else
-    return v2 << s2 | v1 >> s1;
-}
-#endif
-
-#ifndef HAVE_ARCH_UNALIGNED_LOAD2_U8
-template<>
-inline uint64_t ALWAYS_INLINE
-unaligned_load2<uint64_t>(const gtm_cacheline *c1,
-                          const gtm_cacheline *c2, size_t ofs)
-{
-  uint64_t v1 = c1->u64[CACHELINE_SIZE / sizeof(uint64_t) - 1];
-  uint64_t v2 = c2->u64[0];
-  int s2 = (ofs & (sizeof(uint64_t) - 1)) * 8;
-  int s1 = sizeof(uint64_t) * 8 - s2;
-
-  if (WORDS_BIGENDIAN)
-    return v1 << s2 | v2 >> s1;
-  else
-    return v2 << s2 | v1 >> s1;
-}
-#endif
-
-template<>
-inline float ALWAYS_INLINE
-unaligned_load2<float>(const gtm_cacheline *c1,
-                       const gtm_cacheline *c2, size_t ofs)
-{
-  typedef view_convert_helper<float> VC; VC vc;
-  vc.i = unaligned_load2<VC::itype>(c1, c2, ofs);
-  return vc.t;
-}
-
-template<>
-inline double ALWAYS_INLINE
-unaligned_load2<double>(const gtm_cacheline *c1,
-                        const gtm_cacheline *c2, size_t ofs)
-{
-  typedef view_convert_helper<double> VC; VC vc;
-  vc.i = unaligned_load2<VC::itype>(c1, c2, ofs);
-  return vc.t;
-}
-
-#ifndef HAVE_ARCH_UNALIGNED_STORE2_U2
-template<>
-inline void ALWAYS_INLINE
-unaligned_store2<uint16_t>(gtm_cacheline *c1, gtm_cacheline *c2,
-                           size_t ofs, uint16_t val)
-{
-  uint8_t vl = val, vh = val >> 8;
-
-  if (WORDS_BIGENDIAN)
-    {
-      c1->b[CACHELINE_SIZE - 1] = vh;
-      c2->b[0] = vl;
-    }
-  else
-    {
-      c1->b[CACHELINE_SIZE - 1] = vl;
-      c2->b[0] = vh;
-    }
-}
-#endif
-
-#if 0
-#ifndef HAVE_ARCH_UNALIGNED_STORE2_U4
-template<>
-inline void ALWAYS_INLINE
-unaligned_store2<uint32_t>(gtm_cacheline *c1, gtm_cacheline *c2,
-                           size_t ofs, uint32_t val)
-{
-  // ??? We could reuse the store_mask stuff here.
-}
-#endif
-
-template<>
-inline void ALWAYS_INLINE
-unaligned_store2<float>(gtm_cacheline *c1, gtm_cacheline *c2,
-                        size_t ofs, float val)
-{
-  typedef view_convert_helper<float> VC; VC vc;
-  vc.t = val;
-  unaligned_store2(c1, c2, ofs, vc.i);
-}
-#endif
-
-} // namespace GTM
-
-#endif // LIBITM_UNALIGNED_H
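The packed-struct trick in the removed unaligned_load/unaligned_store remains a standard way to let the compiler emit per-target unaligned access sequences. A self-contained copy of the trick, checked against memcpy (GCC/Clang attribute syntax assumed):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Wrapping T in a packed struct gives it alignment 1, so reading the
// member generates a valid unaligned load on any target.
template <typename T>
struct __attribute__((packed)) unaligned_helper { T x; };

template <typename T>
T unaligned_load(const void *p)
{
  typedef unaligned_helper<T> UT;
  return reinterpret_cast<const UT *>(p)->x;
}

int main()
{
  unsigned char buf[16];
  for (int i = 0; i < 16; ++i) buf[i] = (unsigned char)(i + 1);

  uint32_t a = unaligned_load<uint32_t>(buf + 3);  // deliberately misaligned
  uint32_t b;
  std::memcpy(&b, buf + 3, sizeof b);              // the portable baseline
  std::printf("packed=%08x memcpy=%08x\n", a, b);
  return a == b ? 0 : 1;
}
```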
--- a/libitm/config/posix/cachepage.cc
+++ /dev/null
@@ -1,183 +0,0 @@
-/* Copyright (C) 2009, 2011 Free Software Foundation, Inc.
-   Contributed by Richard Henderson <rth@redhat.com>.
-
-   This file is part of the GNU Transactional Memory Library (libitm).
-
-   Libitm is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 3 of the License, or
-   (at your option) any later version.
-
-   Libitm is distributed in the hope that it will be useful, but WITHOUT ANY
-   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-   more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include "libitm_i.h"
-#include <pthread.h>
-
-//
-// We have three possibilities for alloction: mmap, memalign, posix_memalign
-//
-
-#if defined(HAVE_MMAP_ANON) || defined(HAVE_MMAP_DEV_ZERO)
-#include <sys/mman.h>
-#include <fcntl.h>
-#endif
-#ifdef HAVE_MALLOC_H
-#include <malloc.h>
-#endif
-
-namespace GTM HIDDEN {
-
-#if defined(HAVE_MMAP_ANON)
-# if !defined(MAP_ANONYMOUS) && defined(MAP_ANON)
-#  define MAP_ANONYMOUS MAP_ANON
-# endif
-# define dev_zero -1
-#elif defined(HAVE_MMAP_DEV_ZERO)
-# ifndef MAP_ANONYMOUS
-#  define MAP_ANONYMOUS 0
-# endif
-static int dev_zero = -1;
-#endif
-
-#if defined(HAVE_MMAP_ANON) || defined(HAVE_MMAP_DEV_ZERO)
-/* If we get here, we've already opened /dev/zero and verified that
-   PAGE_SIZE is valid for the system.  */
-static gtm_cacheline_page * alloc_mmap (void) UNUSED;
-static gtm_cacheline_page *
-alloc_mmap (void)
-{
-  gtm_cacheline_page *r;
-  r = (gtm_cacheline_page *) mmap (NULL, PAGE_SIZE, PROT_READ | PROT_WRITE,
-                                   MAP_PRIVATE | MAP_ANONYMOUS, dev_zero, 0);
-  if (r == (gtm_cacheline_page *) MAP_FAILED)
-    abort ();
-  return r;
-}
-#endif /* MMAP_ANON | MMAP_DEV_ZERO */
-
-#ifdef HAVE_MEMALIGN
-static gtm_cacheline_page * alloc_memalign (void) UNUSED;
-static gtm_cacheline_page *
-alloc_memalign (void)
-{
-  gtm_cacheline_page *r;
-  r = (gtm_cacheline_page *) memalign (PAGE_SIZE, PAGE_SIZE);
-  if (r == NULL)
-    abort ();
-  return r;
-}
-#endif /* MEMALIGN */
-
-#ifdef HAVE_POSIX_MEMALIGN
-static gtm_cacheline_page *alloc_posix_memalign (void) UNUSED;
-static gtm_cacheline_page *
-alloc_posix_memalign (void)
-{
-  void *r;
-  if (posix_memalign (&r, PAGE_SIZE, PAGE_SIZE))
-    abort ();
-  return (gtm_cacheline_page *) r;
-}
-#endif /* POSIX_MEMALIGN */
-
-#if defined(HAVE_MMAP_ANON) && defined(FIXED_PAGE_SIZE)
-# define alloc_page alloc_mmap
-#elif defined(HAVE_MMAP_DEV_ZERO) && defined(FIXED_PAGE_SIZE)
-static gtm_cacheline_page *
-alloc_page (void)
-{
-  if (dev_zero < 0)
-    {
-      dev_zero = open ("/dev/zero", O_RDWR);
-      assert (dev_zero >= 0);
-    }
-  return alloc_mmap ();
-}
-#elif defined(HAVE_MMAP_ANON) || defined(HAVE_MMAP_DEV_ZERO)
-static gtm_cacheline_page * (*alloc_page) (void);
-static void __attribute__((constructor))
-init_alloc_page (void)
-{
-  size_t page_size = getpagesize ();
-  if (page_size <= PAGE_SIZE && PAGE_SIZE % page_size == 0)
-    {
-# ifndef HAVE_MMAP_ANON
-      dev_zero = open ("/dev/zero", O_RDWR);
-      assert (dev_zero >= 0);
-# endif
-      alloc_page = alloc_mmap;
-      return;
-    }
-# ifdef HAVE_MEMALIGN
-  alloc_page = alloc_memalign;
-# elif defined(HAVE_POSIX_MEMALIGN)
-  alloc_page = alloc_posix_memalign;
-# else
-#  error "No fallback aligned memory allocation method"
-# endif
-}
-#elif defined(HAVE_MEMALIGN)
-# define alloc_page alloc_memalign
-#elif defined(HAVE_POSIX_MEMALIGN)
-# define alloc_page alloc_posix_memalign
-#else
-# error "No aligned memory allocation method"
-#endif
-
-static gtm_cacheline_page *free_pages;
-static pthread_mutex_t free_page_lock = PTHREAD_MUTEX_INITIALIZER;
-
-void *
-gtm_cacheline_page::operator new (size_t size)
-{
-  assert (size == sizeof (gtm_cacheline_page));
-  assert (size <= PAGE_SIZE);
-
-  pthread_mutex_lock(&free_page_lock);
-
-  gtm_cacheline_page *r = free_pages;
-  free_pages = r ? r->prev : NULL;
-
-  pthread_mutex_unlock(&free_page_lock);
-
-  if (r == NULL)
-    r = alloc_page ();
-
-  return r;
-}
-
-void
-gtm_cacheline_page::operator delete (void *xhead)
-{
-  gtm_cacheline_page *head = static_cast<gtm_cacheline_page *>(xhead);
-  gtm_cacheline_page *tail;
-
-  if (head == 0)
-    return;
-
-  /* ??? We should eventually really free some of these.  */
-
-  for (tail = head; tail->prev != 0; tail = tail->prev)
-    continue;
-
-  pthread_mutex_lock(&free_page_lock);
-
-  tail->prev = free_pages;
-  free_pages = head;
-
-  pthread_mutex_unlock(&free_page_lock);
-}
-
-} // namespace GTM
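The removed cachepage.cc selected among mmap, memalign, and posix_memalign for page-aligned allocations, which is exactly the portability surface this commit eliminates. A reduced sketch of just the posix_memalign branch (assumes a POSIX system; the free-list and locking of the original are omitted):

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

constexpr size_t kPageSize = 4096;

// Allocate one page-aligned, page-sized block, aborting on failure
// like the removed alloc_posix_memalign.
static void *alloc_page(void)
{
  void *r;
  if (posix_memalign(&r, kPageSize, kPageSize))
    abort();
  return r;
}

int main()
{
  void *p = alloc_page();
  assert(((uintptr_t)p & (kPageSize - 1)) == 0);  // verify page alignment
  std::printf("page at %p\n", p);
  free(p);
  return 0;
}
```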
--- a/libitm/config/x86/cacheline.cc
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (C) 2009, 2011 Free Software Foundation, Inc.
-   Contributed by Richard Henderson <rth@redhat.com>.
-
-   This file is part of the GNU Transactional Memory Library (libitm).
-
-   Libitm is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 3 of the License, or
-   (at your option) any later version.
-
-   Libitm is distributed in the hope that it will be useful, but WITHOUT ANY
-   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-   more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include "libitm_i.h"
-
-namespace GTM HIDDEN {
-
-uint32_t const gtm_bit_to_byte_mask[16] =
-{
-  0x00000000,
-  0x000000ff,
-  0x0000ff00,
-  0x0000ffff,
-  0x00ff0000,
-  0x00ff00ff,
-  0x00ffff00,
-  0x00ffffff,
-  0xff000000,
-  0xff0000ff,
-  0xff00ff00,
-  0xff00ffff,
-  0xffff0000,
-  0xffff00ff,
-  0xffffff00,
-  0xffffffff
-};
-
-#ifdef __SSE2__
-# define MEMBER m128i
-#else
-# define MEMBER w
-#endif
-
-void
-gtm_cacheline::copy_mask (gtm_cacheline * __restrict d,
-                          const gtm_cacheline * __restrict s,
-                          gtm_cacheline_mask m)
-{
-  if (m == (gtm_cacheline_mask)-1)
-    {
-      *d = *s;
-      return;
-    }
-  if (__builtin_expect (m == 0, 0))
-    return;
-
-  size_t n = sizeof(d->MEMBER[0]);
-  for (size_t i = 0; i < CACHELINE_SIZE / n; ++i, m >>= n)
-    store_mask (&d->MEMBER[i], s->MEMBER[i], m);
-}
-
-} // namespace GTM
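The gtm_bit_to_byte_mask table deleted above expands 4 mask bits into a 32-bit byte mask. The same expansion computed in a loop, which reproduces the 16 table entries:

```cpp
#include <cstdint>
#include <cstdio>

// Expand 4 mask bits into a 32-bit byte mask: bit i -> byte i = 0xff.
static uint32_t bit_to_byte_mask(unsigned m4)
{
  uint32_t r = 0;
  for (int i = 0; i < 4; ++i)
    if (m4 & (1u << i))
      r |= 0xffu << (8 * i);
  return r;
}

int main()
{
  // Reproduces the table: 0x00000000, 0x000000ff, 0x0000ff00, 0x0000ffff, ...
  for (unsigned m = 0; m < 16; ++m)
    std::printf("m=%2u -> %08x\n", m, bit_to_byte_mask(m));
  return 0;
}
```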
--- a/libitm/config/x86/cacheline.h
+++ b/libitm/config/x86/cacheline.h
@@ -40,8 +40,6 @@ namespace GTM HIDDEN {
 // in the cacheline with which it is associated.
 typedef sized_integral<CACHELINE_SIZE / 8>::type gtm_cacheline_mask;
 
-extern uint32_t const gtm_bit_to_byte_mask[16];
-
 union gtm_cacheline
 {
   // Byte access to the cacheline.
@@ -67,23 +65,6 @@ union gtm_cacheline
   __m256i m256i[CACHELINE_SIZE / sizeof(__m256i)];
 #endif
 
-  // Store S into D, but only the bytes specified by M.
-  static void store_mask (uint32_t *d, uint32_t s, uint8_t m);
-  static void store_mask (uint64_t *d, uint64_t s, uint8_t m);
-#ifdef __SSE2__
-  static void store_mask (__m128i *d, __m128i s, uint16_t m);
-#endif
-
-  // Copy S to D, but only the bytes specified by M.
-  static void copy_mask (gtm_cacheline * __restrict d,
-                         const gtm_cacheline * __restrict s,
-                         gtm_cacheline_mask m);
-
-  // A write barrier to emit after (a series of) copy_mask.
-  // When we're emitting non-temporal stores, the normal strong
-  // ordering of the machine doesn't apply.
-  static void copy_mask_wb ();
-
 #if defined(__SSE__) || defined(__AVX__)
   // Copy S to D; only bother defining if we can do this more efficiently
   // than the compiler-generated default implementation.
@@ -91,14 +72,6 @@ union gtm_cacheline
 #endif // SSE, AVX
 };
 
-inline void
-gtm_cacheline::copy_mask_wb ()
-{
-#ifdef __SSE2__
-  _mm_sfence ();
-#endif
-}
-
 #if defined(__SSE__) || defined(__AVX__)
 inline gtm_cacheline& ALWAYS_INLINE
 gtm_cacheline::operator= (const gtm_cacheline & __restrict s)
@ -141,104 +114,12 @@ gtm_cacheline::operator= (const gtm_cacheline & __restrict s)
|
|||||||
}
|
}
|
||||||
|
|
||||||
return *this;
|
return *this;
|
||||||
|
|
||||||
|
#undef CP
|
||||||
|
#undef TYPE
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Support masked integer stores more efficiently with an unlocked cmpxchg
|
|
||||||
// insn. My reasoning is that while we write to locations that we do not wish
|
|
||||||
// to modify, we do it in an uninterruptable insn, and so we either truely
|
|
||||||
// write back the original data or the insn fails -- unlike with a
|
|
||||||
// load/and/or/write sequence which can be interrupted either by a kernel
|
|
||||||
// task switch or an unlucky cacheline steal by another processor. Avoiding
|
|
||||||
// the LOCK prefix improves performance by a factor of 10, and we don't need
|
|
||||||
// the memory barrier semantics implied by that prefix.
|
|
||||||
|
|
||||||
inline void ALWAYS_INLINE
|
|
||||||
gtm_cacheline::store_mask (uint32_t *d, uint32_t s, uint8_t m)
|
|
||||||
{
|
|
||||||
gtm_cacheline_mask tm = (1 << sizeof (s)) - 1;
|
|
||||||
if (__builtin_expect (m & tm, tm))
|
|
||||||
{
|
|
||||||
if (__builtin_expect ((m & tm) == tm, 1))
|
|
||||||
*d = s;
|
|
||||||
else
|
|
||||||
{
|
|
||||||
gtm_cacheline_mask bm = gtm_bit_to_byte_mask[m & 15];
|
|
||||||
gtm_word n, o = *d;
|
|
||||||
|
|
||||||
__asm("\n0:\t"
|
|
||||||
"mov %[o], %[n]\n\t"
|
|
||||||
"and %[m], %[n]\n\t"
|
|
||||||
"or %[s], %[n]\n\t"
|
|
||||||
"cmpxchg %[n], %[d]\n\t"
|
|
||||||
".byte 0x2e\n\t" // predict not-taken, aka jnz,pn
|
|
||||||
"jnz 0b"
|
|
||||||
: [d] "+m"(*d), [n] "=&r" (n), [o] "+a"(o)
|
|
||||||
: [s] "r" (s & bm), [m] "r" (~bm));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void ALWAYS_INLINE
|
|
||||||
gtm_cacheline::store_mask (uint64_t *d, uint64_t s, uint8_t m)
|
|
||||||
{
|
|
||||||
gtm_cacheline_mask tm = (1 << sizeof (s)) - 1;
|
|
||||||
if (__builtin_expect (m & tm, tm))
|
|
||||||
{
|
|
||||||
if (__builtin_expect ((m & tm) == tm, 1))
|
|
||||||
*d = s;
|
|
||||||
else
|
|
||||||
{
|
|
||||||
#ifdef __x86_64__
|
|
||||||
uint32_t bl = gtm_bit_to_byte_mask[m & 15];
|
|
||||||
uint32_t bh = gtm_bit_to_byte_mask[(m >> 4) & 15];
|
|
||||||
gtm_cacheline_mask bm = bl | ((gtm_cacheline_mask)bh << 31 << 1);
|
|
||||||
uint64_t n, o = *d;
|
|
||||||
__asm("\n0:\t"
|
|
||||||
"mov %[o], %[n]\n\t"
|
|
||||||
"and %[m], %[n]\n\t"
|
|
||||||
"or %[s], %[n]\n\t"
|
|
||||||
"cmpxchg %[n], %[d]\n\t"
|
|
||||||
".byte 0x2e\n\t" // predict not-taken, aka jnz,pn
|
|
||||||
"jnz 0b"
|
|
||||||
: [d] "+m"(*d), [n] "=&r" (n), [o] "+a"(o)
|
|
||||||
: [s] "r" (s & bm), [m] "r" (~bm));
|
|
||||||
#else
|
|
||||||
/* ??? While it's possible to perform this operation with
|
|
||||||
cmpxchg8b, the sequence requires all 7 general registers
|
|
||||||
and thus cannot be performed with -fPIC. Don't even try. */
|
|
||||||
uint32_t *d32 = reinterpret_cast<uint32_t *>(d);
|
|
||||||
store_mask (d32, s, m);
|
|
||||||
store_mask (d32 + 1, s >> 32, m >> 4);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||

#ifdef __SSE2__
inline void ALWAYS_INLINE
gtm_cacheline::store_mask (__m128i *d, __m128i s, uint16_t m)
{
  if (__builtin_expect (m == 0, 0))
    return;
  if (__builtin_expect (m == 0xffff, 1))
    *d = s;
  else
    {
      __m128i bm0, bm1, bm2, bm3;
      bm0 = _mm_set_epi32 (0, 0, 0, gtm_bit_to_byte_mask[m & 15]); m >>= 4;
      bm1 = _mm_set_epi32 (0, 0, 0, gtm_bit_to_byte_mask[m & 15]); m >>= 4;
      bm2 = _mm_set_epi32 (0, 0, 0, gtm_bit_to_byte_mask[m & 15]); m >>= 4;
      bm3 = _mm_set_epi32 (0, 0, 0, gtm_bit_to_byte_mask[m & 15]); m >>= 4;
      bm0 = _mm_unpacklo_epi32 (bm0, bm1);
      bm2 = _mm_unpacklo_epi32 (bm2, bm3);
      bm0 = _mm_unpacklo_epi64 (bm0, bm2);

      _mm_maskmoveu_si128 (s, bm0, (char *)d);
    }
}
#endif // SSE2

} // namespace GTM

#endif // LIBITM_CACHELINE_H
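
A hedged sketch of the same 16-byte mask assembly without the unpack/interleave dance, relying on the documented fact that _mm_maskmoveu_si128 consults only the high bit of each mask byte. The names byte_mask_128 and store_mask_ref are illustrative, not libitm's:

#include <emmintrin.h>
#include <cstdint>

// Expand bit I of M into mask byte I (0xff or 0x00).
static __m128i byte_mask_128 (uint16_t m)
{
  alignas(16) uint8_t b[16];
  for (int i = 0; i < 16; ++i)
    b[i] = (m & (1u << i)) ? 0xff : 0;
  return _mm_load_si128 (reinterpret_cast<const __m128i *>(b));
}

// maskmoveu then stores only the bytes whose mask byte has its high bit set.
void store_mask_ref (__m128i *d, __m128i s, uint16_t m)
{
  _mm_maskmoveu_si128 (s, byte_mask_128 (m), reinterpret_cast<char *>(d));
}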

237
config/x86/unaligned.h
@ -1,237 +0,0 @@
/* Copyright (C) 2009, 2011 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Transactional Memory Library (libitm).

   Libitm is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   Libitm is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef LIBITM_X86_UNALIGNED_H
#define LIBITM_X86_UNALIGNED_H 1

#define HAVE_ARCH_UNALIGNED_LOAD2_U4 1
#define HAVE_ARCH_UNALIGNED_LOAD2_U8 1

#include "config/generic/unaligned.h"

namespace GTM HIDDEN {

template<>
inline uint32_t
unaligned_load2<uint32_t>(const gtm_cacheline *c1,
                          const gtm_cacheline *c2, size_t ofs)
{
  uint32_t r, lo, hi;
  lo = c1->u32[CACHELINE_SIZE / sizeof(uint32_t) - 1];
  hi = c2->u32[0];
  asm("shrd %b2, %1, %0" : "=r"(r) : "r"(hi), "c"((ofs & 3) * 8), "0"(lo));
  return r;
}

template<>
inline uint64_t
unaligned_load2<uint64_t>(const gtm_cacheline *c1,
                          const gtm_cacheline *c2, size_t ofs)
{
#ifdef __x86_64__
  uint64_t r, lo, hi;
  lo = c1->u64[CACHELINE_SIZE / sizeof(uint64_t) - 1];
  hi = c2->u64[0];
  asm("shrd %b2, %1, %0" : "=r"(r) : "r"(hi), "c"((ofs & 3) * 8), "0"(lo));
  return r;
#else
  uint32_t v0, v1, v2;
  uint64_t r;

  if (ofs < CACHELINE_SIZE - 4)
    {
      v0 = c1->u32[CACHELINE_SIZE / sizeof(uint32_t) - 2];
      v1 = c1->u32[CACHELINE_SIZE / sizeof(uint32_t) - 1];
      v2 = c2->u32[0];
    }
  else
    {
      v0 = c1->u32[CACHELINE_SIZE / sizeof(uint32_t) - 1];
      v1 = c2->u32[0];
      v2 = c2->u32[1];
    }
  ofs = (ofs & 3) * 8;
  asm("shrd %%cl, %[v1], %[v0]; shrd %%cl, %[v2], %[v1]"
      : "=A"(r) : "c"(ofs), [v0] "a"(v0), [v1] "d"(v1), [v2] "r"(v2));

  return r;
#endif
}
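
What the shrd combine computes, as a portable sketch: a value straddling two cachelines is stitched from the high bytes of the last word of C1 and the low bytes of the first word of C2. unaligned_load2_ref is an illustrative name, and the zero-shift guard mirrors shrd's no-op behavior while avoiding an undefined full-width shift in C++:

#include <cstdint>
#include <cstddef>

uint32_t unaligned_load2_ref (uint32_t lo, uint32_t hi, size_t ofs)
{
  unsigned shift = (ofs & 3) * 8;   // byte offset within the word, in bits
  if (shift == 0)
    return lo;                      // shifting by 32 would be undefined
  return (lo >> shift) | (hi << (32 - shift));
}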

#if defined(__SSE2__) || defined(__MMX__)
template<>
inline _ITM_TYPE_M64
unaligned_load2<_ITM_TYPE_M64>(const gtm_cacheline *c1,
                               const gtm_cacheline *c2, size_t ofs)
{
# ifdef __x86_64__
  __m128i lo = _mm_movpi64_epi64 (c1->m64[CACHELINE_SIZE / 8 - 1]);
  __m128i hi = _mm_movpi64_epi64 (c2->m64[0]);

  ofs = (ofs & 7) * 8;
  lo = _mm_srli_epi64 (lo, ofs);
  hi = _mm_slli_epi64 (hi, 64 - ofs);
  lo = lo | hi;
  return _mm_movepi64_pi64 (lo);
# else
  // On 32-bit we're about to return the result in an MMX register, so go
  // ahead and do the computation in that unit, even if SSE2 is available.
  __m64 lo = c1->m64[CACHELINE_SIZE / 8 - 1];
  __m64 hi = c2->m64[0];

  ofs = (ofs & 7) * 8;
  lo = _mm_srli_si64 (lo, ofs);
  hi = _mm_slli_si64 (hi, 64 - ofs);
  return lo | hi;
# endif
}
#endif // SSE2 or MMX

// The SSE types are strictly aligned.
#ifdef __SSE__
template<>
  struct strict_alignment<_ITM_TYPE_M128>
    : public std::true_type
  { };

// Expand the unaligned SSE move instructions.
template<>
inline _ITM_TYPE_M128
unaligned_load<_ITM_TYPE_M128>(const void *t)
{
  return _mm_loadu_ps (static_cast<const float *>(t));
}

template<>
inline void
unaligned_store<_ITM_TYPE_M128>(void *t, _ITM_TYPE_M128 val)
{
  _mm_storeu_ps (static_cast<float *>(t), val);
}
#endif // SSE

#ifdef __AVX__
// The AVX types are strictly aligned when it comes to vmovaps vs vmovups.
template<>
  struct strict_alignment<_ITM_TYPE_M256>
    : public std::true_type
  { };

template<>
inline _ITM_TYPE_M256
unaligned_load<_ITM_TYPE_M256>(const void *t)
{
  return _mm256_loadu_ps (static_cast<const float *>(t));
}

template<>
inline void
unaligned_store<_ITM_TYPE_M256>(void *t, _ITM_TYPE_M256 val)
{
  _mm256_storeu_ps (static_cast<float *>(t), val);
}
#endif // AVX

#ifdef __XOP__
# define HAVE_ARCH_REALIGN_M128I 1
extern const __v16qi GTM_vpperm_shift[16];
inline __m128i
realign_m128i (__m128i lo, __m128i hi, unsigned byte_count)
{
  return _mm_perm_epi8 (lo, hi, GTM_vpperm_shift[byte_count]);
}
#elif defined(__AVX__)
# define HAVE_ARCH_REALIGN_M128I 1
extern "C" const uint64_t GTM_vpalignr_table[16];
inline __m128i
realign_m128i (__m128i lo, __m128i hi, unsigned byte_count)
{
  register __m128i xmm0 __asm__("xmm0") = hi;
  register __m128i xmm1 __asm__("xmm1") = lo;
  __asm("call *%2" : "+x"(xmm0) : "x"(xmm1),
        "r"(&GTM_vpalignr_table[byte_count]));
  return xmm0;
}
#elif defined(__SSSE3__)
# define HAVE_ARCH_REALIGN_M128I 1
extern "C" const uint64_t GTM_palignr_table[16];
inline __m128i
realign_m128i (__m128i lo, __m128i hi, unsigned byte_count)
{
  register __m128i xmm0 __asm__("xmm0") = hi;
  register __m128i xmm1 __asm__("xmm1") = lo;
  __asm("call *%2" : "+x"(xmm0) : "x"(xmm1),
        "r"(&GTM_palignr_table[byte_count]));
  return xmm0;
}
#elif defined(__SSE2__)
# define HAVE_ARCH_REALIGN_M128I 1
extern "C" const char GTM_pshift_table[16 * 16];
inline __m128i
realign_m128i (__m128i lo, __m128i hi, unsigned byte_count)
{
  register __m128i xmm0 __asm__("xmm0") = lo;
  register __m128i xmm1 __asm__("xmm1") = hi;
  __asm("call *%2" : "+x"(xmm0), "+x"(xmm1)
        : "r"(GTM_pshift_table + byte_count*16));
  return xmm0;
}
#endif // XOP, AVX, SSSE3, SSE2
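
All of these realign_m128i variants work around the same constraint: palignr, vpalignr, and the psrldq/pslldq pair accept only immediate byte counts, so the runtime count indexes a table of single-instruction, ret-terminated stubs reached by an indirect call (the asm tables appear in the x86_avx.cc and x86_sse.cc hunks below). A portable analogue of the same dispatch on plain uint64_t, with illustrative names (shift_merge, merge_table, realign_u64):

#include <cstdint>

template<unsigned N>
static uint64_t shift_merge (uint64_t lo, uint64_t hi)
{
  // The double shift avoids an undefined 64-bit shift when N == 0,
  // the same idiom as the "<< 31 << 1" in store_mask earlier.
  return (lo >> (N * 8)) | (hi << (63 - N * 8) << 1);
}

typedef uint64_t (*merge_fn) (uint64_t, uint64_t);

// One pre-built routine per byte count, selected at run time.
static const merge_fn merge_table[8] =
  { shift_merge<0>, shift_merge<1>, shift_merge<2>, shift_merge<3>,
    shift_merge<4>, shift_merge<5>, shift_merge<6>, shift_merge<7> };

uint64_t realign_u64 (uint64_t lo, uint64_t hi, unsigned byte_count)
{
  return merge_table[byte_count & 7] (lo, hi);
}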

#ifdef HAVE_ARCH_REALIGN_M128I
template<>
inline _ITM_TYPE_M128
unaligned_load2<_ITM_TYPE_M128>(const gtm_cacheline *c1,
                                const gtm_cacheline *c2, size_t ofs)
{
  return (_ITM_TYPE_M128)
    realign_m128i (c1->m128i[CACHELINE_SIZE / 16 - 1],
                   c2->m128i[0], ofs & 15);
}
#endif // HAVE_ARCH_REALIGN_M128I

#ifdef __AVX__
template<>
inline _ITM_TYPE_M256
unaligned_load2<_ITM_TYPE_M256>(const gtm_cacheline *c1,
                                const gtm_cacheline *c2, size_t ofs)
{
  __m128i v0, v1;
  __m256i r;

  v0 = (__m128i) unaligned_load2<_ITM_TYPE_M128>(c1, c2, ofs);
  if (ofs < CACHELINE_SIZE - 16)
    v1 = v0, v0 = _mm_loadu_si128 ((const __m128i *) &c1->b[ofs]);
  else
    v1 = _mm_loadu_si128((const __m128i *)&c2->b[ofs + 16 - CACHELINE_SIZE]);

  r = _mm256_castsi128_si256 ((__m128i)v0);
  r = _mm256_insertf128_si256 (r, (__m128i)v1, 1);
  return (_ITM_TYPE_M256) r;
}
#endif // AVX

} // namespace GTM

#endif // LIBITM_X86_UNALIGNED_H

config/x86/x86_avx.cc
@ -34,62 +34,3 @@ _ITM_LM256 (const _ITM_TYPE_M256 *ptr)
{
  GTM::GTM_LB (ptr, sizeof (*ptr));
}

// Helpers for re-aligning two 128-bit values.
#ifdef __XOP__
const __v16qi GTM::GTM_vpperm_shift[16] =
{
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
  { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 },
  { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 },
  { 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18 },
  { 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 },
  { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 },
  { 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 },
  { 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22 },
  { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 },
  { 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 },
  { 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 },
  { 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 },
  { 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 },
  { 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28 },
  { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
  { 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 },
};
#else
# define INSN0 "movdqa %xmm1, %xmm0"
# define INSN(N) "vpalignr $" #N ", %xmm0, %xmm1, %xmm0"
# define TABLE_ENT_0 INSN0 "\n\tret\n\t"
# define TABLE_ENT(N) ".balign 8\n\t" INSN(N) "\n\tret\n\t"

asm(".pushsection .text\n\
    .balign 16\n\
    .globl GTM_vpalignr_table\n\
    .hidden GTM_vpalignr_table\n\
    .type GTM_vpalignr_table, @function\n\
GTM_vpalignr_table:\n\t"
    TABLE_ENT_0
    TABLE_ENT(1)
    TABLE_ENT(2)
    TABLE_ENT(3)
    TABLE_ENT(4)
    TABLE_ENT(5)
    TABLE_ENT(6)
    TABLE_ENT(7)
    TABLE_ENT(8)
    TABLE_ENT(9)
    TABLE_ENT(10)
    TABLE_ENT(11)
    TABLE_ENT(12)
    TABLE_ENT(13)
    TABLE_ENT(14)
    TABLE_ENT(15)
    ".balign 8\n\
    .size GTM_vpalignr_table, .-GTM_vpalignr_table\n\
    .popsection");

# undef INSN0
# undef INSN
# undef TABLE_ENT_0
# undef TABLE_ENT
#endif

config/x86/x86_sse.cc
@ -41,82 +41,3 @@ _ITM_LM128 (const _ITM_TYPE_M128 *ptr)
{
  GTM::GTM_LB (ptr, sizeof (*ptr));
}

// Helpers for re-aligning two 128-bit values.
#ifdef __SSSE3__
# define INSN0 "movdqa %xmm1, %xmm0"
# define INSN(N) "palignr $" #N ", %xmm1, %xmm0"
# define TABLE_ENT_0 INSN0 "\n\tret\n\t"
# define TABLE_ENT(N) ".balign 8\n\t" INSN(N) "\n\tret\n\t"

asm(".pushsection .text\n\
    .balign 16\n\
    .globl GTM_palignr_table\n\
    .hidden GTM_palignr_table\n\
    .type GTM_palignr_table, @function\n\
GTM_palignr_table:\n\t"
    TABLE_ENT_0
    TABLE_ENT(1)
    TABLE_ENT(2)
    TABLE_ENT(3)
    TABLE_ENT(4)
    TABLE_ENT(5)
    TABLE_ENT(6)
    TABLE_ENT(7)
    TABLE_ENT(8)
    TABLE_ENT(9)
    TABLE_ENT(10)
    TABLE_ENT(11)
    TABLE_ENT(12)
    TABLE_ENT(13)
    TABLE_ENT(14)
    TABLE_ENT(15)
    ".balign 8\n\
    .size GTM_palignr_table, .-GTM_palignr_table\n\
    .popsection");

# undef INSN0
# undef INSN
# undef TABLE_ENT_0
# undef TABLE_ENT
#elif defined(__SSE2__)
# define INSNS_8 "punpcklqdq %xmm1, %xmm0"
# define INSNS(N) "psrldq $"#N", %xmm0\n\t" \
                  "pslldq $(16-"#N"), %xmm1\n\t" \
                  "por %xmm1, %xmm0"
# define TABLE_ENT_0 "ret\n\t"
# define TABLE_ENT_8 ".balign 16\n\t" INSNS_8 "\n\tret\n\t"
# define TABLE_ENT(N) ".balign 16\n\t" INSNS(N) "\n\tret\n\t"

asm(".pushsection .text\n\
    .balign 16\n\
    .globl GTM_pshift_table\n\
    .hidden GTM_pshift_table\n\
    .type GTM_pshift_table, @function\n\
GTM_pshift_table:\n\t"
    TABLE_ENT_0
    TABLE_ENT(1)
    TABLE_ENT(2)
    TABLE_ENT(3)
    TABLE_ENT(4)
    TABLE_ENT(5)
    TABLE_ENT(6)
    TABLE_ENT(7)
    TABLE_ENT_8
    TABLE_ENT(9)
    TABLE_ENT(10)
    TABLE_ENT(11)
    TABLE_ENT(12)
    TABLE_ENT(13)
    TABLE_ENT(14)
    TABLE_ENT(15)
    ".balign 8\n\
    .size GTM_pshift_table, .-GTM_pshift_table\n\
    .popsection");

# undef INSNS_8
# undef INSNS
# undef TABLE_ENT_0
# undef TABLE_ENT_8
# undef TABLE_ENT
#endif

libitm/libitm_i.h
@ -78,7 +78,6 @@ enum gtm_restart_reason
#include "rwlock.h"
#include "aatree.h"
#include "cacheline.h"
#include "cachepage.h"
#include "stmlock.h"
#include "dispatch.h"
#include "containers.h"

365
libitm/memcpy.cc
@ -1,365 +0,0 @@
/* Copyright (C) 2008, 2009, 2011 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Transactional Memory Library (libitm).

   Libitm is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   Libitm is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include "libitm_i.h"

using namespace GTM;

static void
do_memcpy (uintptr_t idst, uintptr_t isrc, size_t size,
           abi_dispatch::lock_type W, abi_dispatch::lock_type R)
{
  abi_dispatch *disp = abi_disp();
  // The position in the destination cacheline where *IDST starts.
  uintptr_t dofs = idst & (CACHELINE_SIZE - 1);
  // The position in the source cacheline where *ISRC starts.
  uintptr_t sofs = isrc & (CACHELINE_SIZE - 1);
  const gtm_cacheline *src
    = reinterpret_cast<const gtm_cacheline *>(isrc & -CACHELINE_SIZE);
  gtm_cacheline *dst
    = reinterpret_cast<gtm_cacheline *>(idst & -CACHELINE_SIZE);
  const gtm_cacheline *sline;
  abi_dispatch::mask_pair dpair;

  if (size == 0)
    return;

  // If both SRC and DST data start at the same position in the cachelines,
  // we can easily copy the data in tandem, cacheline by cacheline...
  if (dofs == sofs)
    {
      // We copy the data in three stages:

      // (a) Copy stray bytes at the beginning that are smaller than a
      // cacheline.
      if (sofs != 0)
        {
          size_t sleft = CACHELINE_SIZE - sofs;
          size_t min = (size <= sleft ? size : sleft);

          dpair = disp->write_lock(dst, W);
          sline = disp->read_lock(src, R);
          *dpair.mask |= (((gtm_cacheline_mask)1 << min) - 1) << sofs;
          memcpy (&dpair.line->b[sofs], &sline->b[sofs], min);
          dst++;
          src++;
          size -= min;
        }

      // (b) Copy subsequent cacheline sized chunks.
      while (size >= CACHELINE_SIZE)
        {
          dpair = disp->write_lock(dst, W);
          sline = disp->read_lock(src, R);
          *dpair.mask = -1;
          *dpair.line = *sline;
          dst++;
          src++;
          size -= CACHELINE_SIZE;
        }

      // (c) Copy anything left over.
      if (size != 0)
        {
          dpair = disp->write_lock(dst, W);
          sline = disp->read_lock(src, R);
          *dpair.mask |= ((gtm_cacheline_mask)1 << size) - 1;
          memcpy (dpair.line, sline, size);
        }
    }
  // ... otherwise, we must copy the data in disparate hunks using
  // temporary storage.
  else
    {
      gtm_cacheline c;
      size_t sleft = CACHELINE_SIZE - sofs;

      sline = disp->read_lock(src, R);

      // As above, we copy the data in three stages:

      // (a) Copy stray bytes at the beginning that are smaller than a
      // cacheline.
      if (dofs != 0)
        {
          size_t dleft = CACHELINE_SIZE - dofs;
          size_t min = (size <= dleft ? size : dleft);

          dpair = disp->write_lock(dst, W);
          *dpair.mask |= (((gtm_cacheline_mask)1 << min) - 1) << dofs;

          // If what's left in the source cacheline will fit in the
          // rest of the destination cacheline, straight up copy it.
          if (min <= sleft)
            {
              memcpy (&dpair.line->b[dofs], &sline->b[sofs], min);
              sofs += min;
            }
          // Otherwise, we need more bits from the source cacheline
          // than are available.  Piece together what we need from
          // contiguous (source) cachelines, into temp space, and copy
          // it over.
          else
            {
              memcpy (&c, &sline->b[sofs], sleft);
              sline = disp->read_lock(++src, R);
              sofs = min - sleft;
              memcpy (&c.b[sleft], sline, sofs);
              memcpy (&dpair.line->b[dofs], &c, min);
            }
          sleft = CACHELINE_SIZE - sofs;

          dst++;
          size -= min;
        }

      // (b) Copy subsequent cacheline sized chunks.
      while (size >= CACHELINE_SIZE)
        {
          // We have a full (destination) cacheline where to put the
          // data, but to get to the corresponding cacheline sized
          // chunk in the source, we have to piece together two
          // contiguous source cachelines.

          memcpy (&c, &sline->b[sofs], sleft);
          sline = disp->read_lock(++src, R);
          memcpy (&c.b[sleft], sline, sofs);

          dpair = disp->write_lock(dst, W);
          *dpair.mask = -1;
          *dpair.line = c;

          dst++;
          size -= CACHELINE_SIZE;
        }

      // (c) Copy anything left over.
      if (size != 0)
        {
          dpair = disp->write_lock(dst, W);
          *dpair.mask |= ((gtm_cacheline_mask)1 << size) - 1;
          // If what's left to copy is entirely in the remaining
          // source cacheline, do it.
          if (size <= sleft)
            memcpy (dpair.line, &sline->b[sofs], size);
          // Otherwise, piece together the remaining bits, and copy.
          else
            {
              memcpy (&c, &sline->b[sofs], sleft);
              sline = disp->read_lock(++src, R);
              memcpy (&c.b[sleft], sline, size - sleft);
              memcpy (dpair.line, &c, size);
            }
        }
    }
}
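
The same head/body/tail split, reduced to a hedged stand-alone sketch with a hypothetical store_partial primitive standing in for the write_lock + mask + memcpy sequence; this mirrors stages (a)/(b)/(c) above without the mask bookkeeping:

#include <cstddef>
#include <cstdint>
#include <cstring>

static const size_t BLOCK = 64;   // stand-in for CACHELINE_SIZE

// Hypothetical primitive: commit N bytes at offset OFS of an aligned block.
static void store_partial (void *block, size_t ofs, const void *src, size_t n)
{
  memcpy (static_cast<char *> (block) + ofs, src, n);
}

void copy_blocked (void *dst, const void *src, size_t size)
{
  uintptr_t idst = reinterpret_cast<uintptr_t> (dst);
  size_t ofs = idst & (BLOCK - 1);
  const char *s = static_cast<const char *> (src);
  char *base = reinterpret_cast<char *> (idst & ~(uintptr_t)(BLOCK - 1));

  if (size == 0)
    return;
  if (ofs != 0)                     // (a) stray bytes at the head
    {
      size_t n = size <= BLOCK - ofs ? size : BLOCK - ofs;
      store_partial (base, ofs, s, n);
      base += BLOCK, s += n, size -= n;
    }
  while (size >= BLOCK)             // (b) whole aligned blocks
    {
      store_partial (base, 0, s, BLOCK);
      base += BLOCK, s += BLOCK, size -= BLOCK;
    }
  if (size != 0)                    // (c) the tail
    store_partial (base, 0, s, size);
}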

static void
do_memmove (uintptr_t idst, uintptr_t isrc, size_t size,
            abi_dispatch::lock_type W, abi_dispatch::lock_type R)
{
  abi_dispatch *disp = abi_disp();
  uintptr_t dleft, sleft, sofs, dofs;
  const gtm_cacheline *sline;
  abi_dispatch::mask_pair dpair;

  if (size == 0)
    return;

  /* The co-aligned memmove below doesn't work for DST == SRC, so filter
     that out.  It's tempting to just return here, as this is a no-op move.
     However, our caller has the right to expect the locks to be acquired
     as advertized.  */
  if (__builtin_expect (idst == isrc, 0))
    {
      /* If the write lock is already acquired, nothing to do.  */
      if (W == abi_dispatch::WaW)
        return;
      /* If the destination is protected, acquire a write lock.  */
      if (W != abi_dispatch::NOLOCK)
        R = abi_dispatch::RfW;
      /* Notice serial mode, where we don't acquire locks at all.  */
      if (R == abi_dispatch::NOLOCK)
        return;

      idst = isrc + size;
      for (isrc &= -CACHELINE_SIZE; isrc < idst; isrc += CACHELINE_SIZE)
        disp->read_lock(reinterpret_cast<const gtm_cacheline *>(isrc), R);
      return;
    }

  /* Fall back to memcpy if the implementation above can handle it.  */
  if (idst < isrc || isrc + size <= idst)
    {
      do_memcpy (idst, isrc, size, W, R);
      return;
    }

  /* What remains requires a backward copy from the end of the blocks.  */
  idst += size;
  isrc += size;
  dofs = idst & (CACHELINE_SIZE - 1);
  sofs = isrc & (CACHELINE_SIZE - 1);
  dleft = CACHELINE_SIZE - dofs;
  sleft = CACHELINE_SIZE - sofs;

  gtm_cacheline *dst
    = reinterpret_cast<gtm_cacheline *>(idst & -CACHELINE_SIZE);
  const gtm_cacheline *src
    = reinterpret_cast<const gtm_cacheline *>(isrc & -CACHELINE_SIZE);
  if (dofs == 0)
    dst--;
  if (sofs == 0)
    src--;

  if (dofs == sofs)
    {
      /* Since DST and SRC are co-aligned, and we didn't use the memcpy
         optimization above, that implies that SIZE > CACHELINE_SIZE.  */
      if (sofs != 0)
        {
          dpair = disp->write_lock(dst, W);
          sline = disp->read_lock(src, R);
          *dpair.mask |= ((gtm_cacheline_mask)1 << sleft) - 1;
          memcpy (dpair.line, sline, sleft);
          dst--;
          src--;
          size -= sleft;
        }

      while (size >= CACHELINE_SIZE)
        {
          dpair = disp->write_lock(dst, W);
          sline = disp->read_lock(src, R);
          *dpair.mask = -1;
          *dpair.line = *sline;
          dst--;
          src--;
          size -= CACHELINE_SIZE;
        }

      if (size != 0)
        {
          size_t ofs = CACHELINE_SIZE - size;
          dpair = disp->write_lock(dst, W);
          sline = disp->read_lock(src, R);
          *dpair.mask |= (((gtm_cacheline_mask)1 << size) - 1) << ofs;
          memcpy (&dpair.line->b[ofs], &sline->b[ofs], size);
        }
    }
  else
    {
      gtm_cacheline c;

      sline = disp->read_lock(src, R);
      if (dofs != 0)
        {
          size_t min = (size <= dofs ? size : dofs);

          if (min <= sofs)
            {
              sofs -= min;
              memcpy (&c, &sline->b[sofs], min);
            }
          else
            {
              size_t min_ofs = min - sofs;
              memcpy (&c.b[min_ofs], sline, sofs);
              sline = disp->read_lock(--src, R);
              sofs = CACHELINE_SIZE - min_ofs;
              memcpy (&c, &sline->b[sofs], min_ofs);
            }

          dofs = dleft - min;
          dpair = disp->write_lock(dst, W);
          *dpair.mask |= (((gtm_cacheline_mask)1 << min) - 1) << dofs;
          memcpy (&dpair.line->b[dofs], &c, min);

          sleft = CACHELINE_SIZE - sofs;
          dst--;
          size -= min;
        }

      while (size >= CACHELINE_SIZE)
        {
          memcpy (&c.b[sleft], sline, sofs);
          sline = disp->read_lock(--src, R);
          memcpy (&c, &sline->b[sofs], sleft);

          dpair = disp->write_lock(dst, W);
          *dpair.mask = -1;
          *dpair.line = c;

          dst--;
          size -= CACHELINE_SIZE;
        }

      if (size != 0)
        {
          dofs = CACHELINE_SIZE - size;

          memcpy (&c.b[sleft], sline, sofs);
          if (sleft > dofs)
            {
              sline = disp->read_lock(--src, R);
              memcpy (&c, &sline->b[sofs], sleft);
            }

          dpair = disp->write_lock(dst, W);
          *dpair.mask |= (gtm_cacheline_mask)-1 << dofs;
          memcpy (&dpair.line->b[dofs], &c.b[dofs], size);
        }
    }
}

#define ITM_MEM_DEF(NAME, READ, WRITE) \
void ITM_REGPARM _ITM_memcpy##NAME(void *dst, const void *src, size_t size) \
{ \
  do_memcpy ((uintptr_t)dst, (uintptr_t)src, size, \
             abi_dispatch::WRITE, abi_dispatch::READ); \
} \
void ITM_REGPARM _ITM_memmove##NAME(void *dst, const void *src, size_t size) \
{ \
  do_memmove ((uintptr_t)dst, (uintptr_t)src, size, \
              abi_dispatch::WRITE, abi_dispatch::READ); \
}

ITM_MEM_DEF(RnWt, NOLOCK, W)
ITM_MEM_DEF(RnWtaR, NOLOCK, WaR)
ITM_MEM_DEF(RnWtaW, NOLOCK, WaW)

ITM_MEM_DEF(RtWn, R, NOLOCK)
ITM_MEM_DEF(RtWt, R, W)
ITM_MEM_DEF(RtWtaR, R, WaR)
ITM_MEM_DEF(RtWtaW, R, WaW)

ITM_MEM_DEF(RtaRWn, RaR, NOLOCK)
ITM_MEM_DEF(RtaRWt, RaR, W)
ITM_MEM_DEF(RtaRWtaR, RaR, WaR)
ITM_MEM_DEF(RtaRWtaW, RaR, WaW)

ITM_MEM_DEF(RtaWWn, RaW, NOLOCK)
ITM_MEM_DEF(RtaWWt, RaW, W)
ITM_MEM_DEF(RtaWWtaR, RaW, WaR)
ITM_MEM_DEF(RtaWWtaW, RaW, WaW)
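
For concreteness, expanding ITM_MEM_DEF(RtWt, R, W) by hand yields (whitespace added):

void ITM_REGPARM _ITM_memcpyRtWt (void *dst, const void *src, size_t size)
{
  do_memcpy ((uintptr_t)dst, (uintptr_t)src, size,
             abi_dispatch::W, abi_dispatch::R);
}
void ITM_REGPARM _ITM_memmoveRtWt (void *dst, const void *src, size_t size)
{
  do_memmove ((uintptr_t)dst, (uintptr_t)src, size,
              abi_dispatch::W, abi_dispatch::R);
}

The name suffix encodes what is known about each operand: Rn/Wn nontransactional, Rt/Wt transactional, and taR/taW transactional-after-read/-write, matching the RaR/RaW/WaR/WaW lock types passed through.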

78
libitm/memset.cc
@ -1,78 +0,0 @@
/* Copyright (C) 2008, 2009, 2011 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Transactional Memory Library (libitm).

   Libitm is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   Libitm is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include "libitm_i.h"

using namespace GTM;

static void
do_memset(uintptr_t idst, int c, size_t size, abi_dispatch::lock_type W)
{
  abi_dispatch *disp = abi_disp();
  uintptr_t dofs = idst & (CACHELINE_SIZE - 1);
  abi_dispatch::mask_pair dpair;
  gtm_cacheline *dst
    = reinterpret_cast<gtm_cacheline *>(idst & -CACHELINE_SIZE);

  if (size == 0)
    return;

  if (dofs != 0)
    {
      size_t dleft = CACHELINE_SIZE - dofs;
      size_t min = (size <= dleft ? size : dleft);

      dpair = disp->write_lock(dst, W);
      *dpair.mask |= (((gtm_cacheline_mask)1 << min) - 1) << dofs;
      memset (&dpair.line->b[dofs], c, min);
      dst++;
      size -= min;
    }

  while (size >= CACHELINE_SIZE)
    {
      dpair = disp->write_lock(dst, W);
      *dpair.mask = -1;
      memset (dpair.line, c, CACHELINE_SIZE);
      dst++;
      size -= CACHELINE_SIZE;
    }

  if (size != 0)
    {
      dpair = disp->write_lock(dst, W);
      *dpair.mask |= ((gtm_cacheline_mask)1 << size) - 1;
      memset (dpair.line, c, size);
    }
}

#define ITM_MEM_DEF(WRITE) \
void ITM_REGPARM _ITM_memset##WRITE(void *dst, int c, size_t size) \
{ \
  do_memset ((uintptr_t)dst, c, size, abi_dispatch::WRITE); \
}

ITM_MEM_DEF(W)
ITM_MEM_DEF(WaR)
ITM_MEM_DEF(WaW)

628
libitm/method-wbetl.cc
@ -1,628 +0,0 @@
/* Copyright (C) 2009, 2011 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Transactional Memory Library (libitm).

   Libitm is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   Libitm is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include "libitm_i.h"

namespace {

using namespace GTM;

class wbetl_dispatch : public abi_dispatch
{
 private:
  static const size_t RW_SET_SIZE = 4096;

  struct r_entry
  {
    gtm_version version;
    gtm_stmlock *lock;
  };

  r_entry *m_rset_entries;
  size_t m_rset_nb_entries;
  size_t m_rset_size;

  struct w_entry
  {
    /* There's a hashtable where the locks are held, so multiple
       cachelines can hash to a given bucket.  This link points to the
       possible next cacheline that also hashes to this bucket.  */
    struct w_entry *next;

    /* Every entry in this bucket (accessed by NEXT) has the same LOCK
       address below.  */
    gtm_stmlock *lock;

    gtm_cacheline *addr;
    gtm_cacheline *value;
    gtm_version version;
  };

  w_entry *m_wset_entries;
  size_t m_wset_nb_entries;
  size_t m_wset_size;
  bool m_wset_reallocate;

  gtm_version m_start;
  gtm_version m_end;

  gtm_cacheline_page *m_cache_page;
  unsigned m_n_cache_page;

 private:
  bool local_w_entry_p (w_entry *w);
  bool has_read (gtm_stmlock *lock);
  bool validate();
  bool extend();

  gtm_cacheline *do_write_lock(gtm_cacheline *);
  gtm_cacheline *do_after_write_lock(gtm_cacheline *);
  const gtm_cacheline *do_read_lock(const gtm_cacheline *, bool);

 public:
  wbetl_dispatch();

  virtual const gtm_cacheline *read_lock(const gtm_cacheline *, ls_modifier);
  virtual mask_pair write_lock(gtm_cacheline *, ls_modifier);

  virtual bool trycommit();
  virtual void rollback();
  virtual void reinit();
  virtual void fini();
  virtual bool trydropreference (void *, size_t);
};

/* Check if W is one of our write locks.  */

inline bool
wbetl_dispatch::local_w_entry_p (w_entry *w)
{
  return (m_wset_entries <= w && w < m_wset_entries + m_wset_nb_entries);
}

/* Check if stripe has been read previously.  */

inline bool
wbetl_dispatch::has_read (gtm_stmlock *lock)
{
  // ??? Consider using an AA tree to lookup the r_set entries.
  size_t n = m_rset_nb_entries;
  for (size_t i = 0; i < n; ++i)
    if (m_rset_entries[i].lock == lock)
      return true;

  return false;
}

/* Validate read set, i.e. check if all read addresses are still valid now.  */

bool
wbetl_dispatch::validate ()
{
  __sync_synchronize ();

  size_t n = m_rset_nb_entries;
  for (size_t i = 0; i < n; ++i)
    {
      r_entry *r = &m_rset_entries[i];
      gtm_stmlock l = *r->lock;

      if (gtm_stmlock_owned_p (l))
        {
          w_entry *w = (w_entry *) gtm_stmlock_get_addr (l);

          // If someone has locked us, it better be by someone in the
          // current thread.
          if (!local_w_entry_p (w))
            return false;
        }
      else if (gtm_stmlock_get_version (l) != r->version)
        return false;
    }

  return true;
}

/* Extend the snapshot range.  */

bool
wbetl_dispatch::extend ()
{
  gtm_version now = gtm_get_clock ();

  if (validate ())
    {
      m_end = now;
      return true;
    }
  return false;
}
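
A toy model of the validation rule above. The lock-word encoding here (low bit = owned, remaining bits = version number or the owning w_entry's address) is an assumption for illustration only and need not match libitm's actual gtm_stmlock layout:

#include <cstdint>

// Illustrative lock word, NOT libitm's real encoding.
struct lock_word { uintptr_t raw; };

static bool owned_p (lock_word l)         { return (l.raw & 1) != 0; }
static uintptr_t version_of (lock_word l) { return l.raw >> 1; }

// The rule encoded by validate(): a recorded read of a stripe is still
// consistent iff the stripe is locked by this same transaction, or is
// unlocked and still carries the version we recorded.
bool read_still_valid (lock_word l, uintptr_t recorded_version,
                       bool locked_by_us)
{
  if (owned_p (l))
    return locked_by_us;
  return version_of (l) == recorded_version;
}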

/* Acquire a write lock on ADDR.  */

gtm_cacheline *
wbetl_dispatch::do_write_lock(gtm_cacheline *addr)
{
  gtm_stmlock *lock;
  gtm_stmlock l, l2;
  gtm_version version;
  w_entry *w, *prev = NULL;

  lock = gtm_get_stmlock (addr);
  l = *lock;

 restart_no_load:
  if (gtm_stmlock_owned_p (l))
    {
      w = (w_entry *) gtm_stmlock_get_addr (l);

      /* Did we previously write the same address?  */
      if (local_w_entry_p (w))
        {
          prev = w;
          while (1)
            {
              if (addr == prev->addr)
                return prev->value;
              if (prev->next == NULL)
                break;
              prev = prev->next;
            }

          /* Get version from previous entry write set.  */
          version = prev->version;

          /* If there's not enough entries, we must reallocate the array,
             which invalidates all pointers to write set entries, which
             means we have to restart the transaction.  */
          if (m_wset_nb_entries == m_wset_size)
            {
              m_wset_size *= 2;
              m_wset_reallocate = true;
              gtm_tx()->restart (RESTART_REALLOCATE);
            }

          w = &m_wset_entries[m_wset_nb_entries];
          goto do_write;
        }

      gtm_tx()->restart (RESTART_LOCKED_WRITE);
    }
  else
    {
      version = gtm_stmlock_get_version (l);

      /* We might have read an older version previously.  */
      if (version > m_end)
        {
          if (has_read (lock))
            gtm_tx()->restart (RESTART_VALIDATE_WRITE);
        }

      /* Extend write set, aborting to reallocate write set entries.  */
      if (m_wset_nb_entries == m_wset_size)
        {
          m_wset_size *= 2;
          m_wset_reallocate = true;
          gtm_tx()->restart (RESTART_REALLOCATE);
        }

      /* Acquire the lock.  */
      w = &m_wset_entries[m_wset_nb_entries];
      l2 = gtm_stmlock_set_owned (w);
      l = __sync_val_compare_and_swap (lock, l, l2);
      if (l != l2)
        goto restart_no_load;
    }

 do_write:
  m_wset_nb_entries++;
  if (prev != NULL)
    prev->next = w;
  w->next = 0;
  w->lock = lock;
  w->addr = addr;
  w->version = version;

  gtm_cacheline_page *page = m_cache_page;
  unsigned index = m_n_cache_page;

  if (page == NULL || index == gtm_cacheline_page::LINES)
    {
      gtm_cacheline_page *npage = new gtm_cacheline_page;
      npage->prev = page;
      m_cache_page = page = npage;
      m_n_cache_page = 1;
      index = 0;
    }
  else
    m_n_cache_page = index + 1;

  gtm_cacheline *line = &page->lines[index];
  w->value = line;
  page->masks[index] = 0;
  *line = *addr;

  return line;
}

gtm_cacheline *
wbetl_dispatch::do_after_write_lock (gtm_cacheline *addr)
{
  gtm_stmlock *lock;
  gtm_stmlock l;
  w_entry *w;

  lock = gtm_get_stmlock (addr);
  l = *lock;
  assert (gtm_stmlock_owned_p (l));

  w = (w_entry *) gtm_stmlock_get_addr (l);
  assert (local_w_entry_p (w));

  while (1)
    {
      if (addr == w->addr)
        return w->value;
      w = w->next;
    }
}

/* Acquire a read lock on ADDR.  */

const gtm_cacheline *
wbetl_dispatch::do_read_lock (const gtm_cacheline *addr, bool after_read)
{
  gtm_stmlock *lock;
  gtm_stmlock l, l2;
  gtm_version version;
  w_entry *w;

  lock = gtm_get_stmlock (addr);
  l = *lock;

 restart_no_load:
  if (gtm_stmlock_owned_p (l))
    {
      w = (w_entry *) gtm_stmlock_get_addr (l);

      /* Did we previously write the same address?  */
      if (local_w_entry_p (w))
        {
          while (1)
            {
              if (addr == w->addr)
                return w->value;
              if (w->next == NULL)
                return addr;
              w = w->next;
            }
        }

      gtm_tx()->restart (RESTART_LOCKED_READ);
    }

  version = gtm_stmlock_get_version (l);

  /* If version is no longer valid, re-validate the read set.  */
  if (version > m_end)
    {
      if (!extend ())
        gtm_tx()->restart (RESTART_VALIDATE_READ);

      if (!after_read)
        {
          // Verify that the version has not yet been overwritten.  The read
          // value has not yet been added to read set and may not have been
          // checked during the extend.
          //
          // ??? This only makes sense if we're actually reading the value
          // and returning it now -- which I believe the original TinySTM
          // did.  This doesn't make a whole lot of sense when we're
          // manipulating cachelines as we are now.  Do we need some other
          // form of lock verification here, or is the validate call in
          // trycommit sufficient?

          __sync_synchronize ();
          l2 = *lock;
          if (l != l2)
            {
              l = l2;
              goto restart_no_load;
            }
        }
    }

  if (!after_read)
    {
      r_entry *r;

      /* Add the address and version to the read set.  */
      if (m_rset_nb_entries == m_rset_size)
        {
          m_rset_size *= 2;

          m_rset_entries = (r_entry *)
            xrealloc (m_rset_entries, m_rset_size * sizeof(r_entry));
        }
      r = &m_rset_entries[m_rset_nb_entries++];
      r->version = version;
      r->lock = lock;
    }

  return addr;
}

const gtm_cacheline *
wbetl_dispatch::read_lock (const gtm_cacheline *addr, ls_modifier ltype)
{
  switch (ltype)
    {
    case NONTXNAL:
      return addr;
    case R:
      return do_read_lock (addr, false);
    case RaR:
      return do_read_lock (addr, true);
    case RaW:
      return do_after_write_lock (const_cast<gtm_cacheline *>(addr));
    case RfW:
      return do_write_lock (const_cast<gtm_cacheline *>(addr));
    default:
      abort ();
    }
}

abi_dispatch::mask_pair
wbetl_dispatch::write_lock (gtm_cacheline *addr, ls_modifier ltype)
{
  gtm_cacheline *line;

  switch (ltype)
    {
    case NONTXNAL:
      return mask_pair (addr, &mask_sink);
    case W:
    case WaR:
      line = do_write_lock (addr);
      break;
    case WaW:
      line = do_after_write_lock (addr);
      break;
    default:
      abort ();
    }

  return mask_pair (line, gtm_cacheline_page::mask_for_page_line (line));
}

/* Commit the transaction.  */

bool
wbetl_dispatch::trycommit ()
{
  const size_t n = m_wset_nb_entries;
  if (n != 0)
    {
      /* Get commit timestamp.  */
      gtm_version t = gtm_inc_clock ();

      /* Validate only if a concurrent transaction has started since.  */
      if (m_start != t - 1 && !validate ())
        return false;

      /* Install new versions.  */
      for (size_t i = 0; i < n; ++i)
        {
          w_entry *w = &m_wset_entries[i];
          gtm_cacheline_mask mask
            = *gtm_cacheline_page::mask_for_page_line (w->value);

          /* Filter out any updates that overlap the libitm stack.  */
          mask = gtm_mask_stack (w->addr, mask);

          gtm_cacheline::copy_mask (w->addr, w->value, mask);
        }

      /* Only emit barrier after all cachelines are copied.  */
      gtm_cacheline::copy_mask_wb ();

      /* Drop locks.  */
      for (size_t i = 0; i < n; ++i)
        {
          w_entry *w = &m_wset_entries[i];

          /* Every link along the chain has the same lock, but only
             bother dropping the lock once per bucket (at the end).  */
          if (w->next == NULL)
            *w->lock = gtm_stmlock_set_version (t);
        }
    }

  __sync_synchronize ();
  return true;
}

void
wbetl_dispatch::rollback ()
{
  /* Drop locks.  */
  const size_t n = m_wset_nb_entries;
  for (size_t i = 0; i < n; ++i)
    {
      w_entry *w = &m_wset_entries[i];

      /* Every link along the chain has the same lock, but only
         bother dropping the lock once per bucket (at the end).  */
      if (w->next == NULL)
        *w->lock = gtm_stmlock_set_version (w->version);
    }

  __sync_synchronize ();
}

void
wbetl_dispatch::reinit ()
{
  gtm_cacheline_page *page;

  m_rset_nb_entries = 0;
  m_wset_nb_entries = 0;

  if (m_wset_reallocate)
    {
      m_wset_reallocate = 0;
      m_wset_entries = (w_entry *)
        xrealloc (m_wset_entries, m_wset_size * sizeof(w_entry));
    }

  page = m_cache_page;
  if (page)
    {
      /* Release all but one of the pages of cachelines.  */
      gtm_cacheline_page *prev = page->prev;
      if (prev)
        {
          page->prev = 0;
          delete prev;
        }

      /* Start the next cacheline allocation from the beginning.  */
      m_n_cache_page = 0;
    }

  m_start = m_end = gtm_get_clock ();
}

void
wbetl_dispatch::fini ()
{
  delete m_cache_page;
  free (m_rset_entries);
  free (m_wset_entries);
  delete this;
}

/* Attempt to drop any internal references to PTR.  Return TRUE if successful.

   This is an adaptation of the transactional memcpy function.

   What we do here is flush out the current transactional content of
   PTR to real memory, and remove the write mask bits associated with
   it so future commits will ignore this piece of memory.  */

bool
wbetl_dispatch::trydropreference (void *ptr, size_t size)
{
  if (size == 0)
    return true;

  if (!validate ())
    return false;

  uintptr_t isrc = (uintptr_t)ptr;
  // The position in the source cacheline where *PTR starts.
  uintptr_t sofs = isrc & (CACHELINE_SIZE - 1);
  gtm_cacheline *src
    = reinterpret_cast<gtm_cacheline *>(isrc & -CACHELINE_SIZE);
  unsigned char *dst = (unsigned char *)ptr;
  abi_dispatch::mask_pair pair;

  // If we're trying to drop a reference, we should already have a
  // write lock on it.  If we don't have one, there's no work to do.
  if (!gtm_stmlock_owned_p (*gtm_get_stmlock (src)))
    return true;

  // We copy the data in three stages:

  // (a) Copy stray bytes at the beginning that are smaller than a
  // cacheline.
  if (sofs != 0)
    {
      size_t sleft = CACHELINE_SIZE - sofs;
      size_t min = (size <= sleft ? size : sleft);

      // WaW will give us the current locked entry.
      pair = this->write_lock (src, WaW);

      // *jedi mind wave*...these aren't the droids you're looking for.
      *pair.mask &= ~((((gtm_cacheline_mask)1 << min) - 1) << sofs);

      memcpy (dst, &pair.line->b[sofs], min);
      dst += min;
      src++;
      size -= min;
    }

  // (b) Copy subsequent cacheline sized chunks.
  while (size >= CACHELINE_SIZE)
    {
      pair = this->write_lock(src, WaW);
      *pair.mask = 0;
      memcpy (dst, pair.line, CACHELINE_SIZE);
      dst += CACHELINE_SIZE;
      src++;
      size -= CACHELINE_SIZE;
    }

  // (c) Copy anything left over.
  if (size != 0)
    {
      pair = this->write_lock(src, WaW);
      *pair.mask &= ~(((gtm_cacheline_mask)1 << size) - 1);
      memcpy (dst, pair.line, size);
    }

  // No need to drop locks, since we're going to abort the transaction
  // anyhow.

  return true;
}

wbetl_dispatch::wbetl_dispatch ()
  : abi_dispatch (false, false)
{
  m_rset_entries = (r_entry *) xmalloc (RW_SET_SIZE * sizeof(r_entry));
  m_rset_nb_entries = 0;
  m_rset_size = RW_SET_SIZE;

  m_wset_entries = (w_entry *) xmalloc (RW_SET_SIZE * sizeof(w_entry));
  m_wset_nb_entries = 0;
  m_wset_size = RW_SET_SIZE;
  m_wset_reallocate = false;

  m_start = m_end = gtm_get_clock ();

  m_cache_page = 0;
  m_n_cache_page = 0;
}

} // anon namespace

abi_dispatch *
GTM::dispatch_wbetl ()
{
  return new wbetl_dispatch ();
}

libitm/testsuite/Makefile.in
@ -38,6 +38,7 @@ subdir = testsuite
DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/../config/acx.m4 \
	$(top_srcdir)/../config/asmcfi.m4 \
	$(top_srcdir)/../config/depstand.m4 \
	$(top_srcdir)/../config/enable.m4 \
	$(top_srcdir)/../config/futex.m4 \
@ -90,8 +91,6 @@ ECHO_N = @ECHO_N@
ECHO_T = @ECHO_T@
EGREP = @EGREP@
EXEEXT = @EXEEXT@
FC = @FC@
FCFLAGS = @FCFLAGS@
FGREP = @FGREP@
GREP = @GREP@
INSTALL = @INSTALL@
@ -142,7 +141,6 @@ abs_top_srcdir = @abs_top_srcdir@
ac_ct_CC = @ac_ct_CC@
ac_ct_CXX = @ac_ct_CXX@
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
ac_ct_FC = @ac_ct_FC@
am__include = @am__include@
am__leading_dot = @am__leading_dot@
am__quote = @am__quote@