libstdc++: Add std::experimental::simd from the Parallelism TS 2

Adds <experimental/simd>. This implements the simd and simd_mask class templates via [[gnu::vector_size(N)]] data members. It implements overloads for all of <cmath> for simd. Explicit vectorization of the <cmath> functions is not finished. The majority of functions are marked as [[gnu::always_inline]] to enable quasi-ODR-conforming linking of TUs with different -m flags. Performance optimization was done for x86_64. ARM, Aarch64, and POWER rely on the compiler to recognize reduction, conversion, and shuffle patterns. Besides verification using many different machine flags, the code was also verified with different fast-math flags. libstdc++-v3/ChangeLog: * doc/xml/manual/status_cxx2017.xml: Add implementation status of the Parallelism TS 2. Document implementation-defined types and behavior. * include/Makefile.am: Add new headers. * include/Makefile.in: Regenerate. * include/experimental/simd: New file. New header for Parallelism TS 2. * include/experimental/bits/numeric_traits.h: New file. Implementation of P1841R1 using internal naming. Addition of missing IEC559 functionality query. * include/experimental/bits/simd.h: New file. Definition of the public simd interfaces and general implementation helpers. * include/experimental/bits/simd_builtin.h: New file. Implementation of the _VecBuiltin simd_abi. * include/experimental/bits/simd_converter.h: New file. Generic simd conversions. * include/experimental/bits/simd_detail.h: New file. Internal macros for the simd implementation. * include/experimental/bits/simd_fixed_size.h: New file. Simd fixed_size ABI specific implementations. * include/experimental/bits/simd_math.h: New file. Math overloads for simd. * include/experimental/bits/simd_neon.h: New file. Simd NEON specific implementations. * include/experimental/bits/simd_ppc.h: New file. Implement bit shifts to avoid invalid results for integral types smaller than int. * include/experimental/bits/simd_scalar.h: New file. Simd scalar ABI specific implementations. 
* include/experimental/bits/simd_x86.h: New file. Simd x86 specific implementations. * include/experimental/bits/simd_x86_conversions.h: New file. x86 specific conversion optimizations. The conversion patterns work around missing conversion patterns in the compiler and should be removed as soon as PR85048 is resolved. * testsuite/experimental/simd/standard_abi_usable.cc: New file. Test that all (not all fixed_size<N>, though) standard simd and simd_mask types are usable. * testsuite/experimental/simd/standard_abi_usable_2.cc: New file. As above but with -ffast-math. * testsuite/libstdc++-dg/conformance.exp: Don't build simd tests from the standard test loop. Instead use check_vect_support_and_set_flags to build simd tests with the relevant machine flags.
2021-01-21 11:45:15 +00:00 · 2021-01-21 11:45:15 +00:00 · 2bcceb6fc5
commit 2bcceb6fc5
parent c91db798ec
19 changed files with 21802 additions and 1 deletions
--- a/libstdc++-v3/doc/xml/manual/status_cxx2017.xml
+++ b/libstdc++-v3/doc/xml/manual/status_cxx2017.xml
@ -2869,6 +2869,17 @@ since C++14 and the implementation is complete.
      <entry>Library Fundamentals 2 TS</entry>
    </row>

+    <row>
+      <entry>
+	<link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0214r9.pdf">
+	  P0214R9
+	</link>
+      </entry>
+      <entry>Data-Parallel Types</entry>
+      <entry>Y</entry>
+      <entry>Parallelism 2 TS</entry>
+    </row>
+
  </tbody>
 </tgroup>
 </table>
@ -3014,6 +3025,211 @@ since C++14 and the implementation is complete.
      If <code>!is_regular_file(p)</code>, an error is reported.
   </para>

+   <section xml:id="iso.2017.par2ts" xreflabel="Implementation Specific Behavior of the Parallelism 2 TS"><info><title>Parallelism 2 TS</title></info>
+
+     <para>
+        <emphasis>9.3 [parallel.simd.abi]</emphasis>
+        <code>max_fixed_size&lt;T&gt;</code> is 32, except when targeting
+        AVX512BW and <code>sizeof(T)</code> is 1.
+     </para>
+
+     <para>
+        When targeting 32-bit x86,
+        <classname>simd_abi::compatible&lt;T&gt;</classname> is an alias for
+        <classname>simd_abi::scalar</classname>.
+        When targeting 64-bit x86 (including x32) or Aarch64,
+        <classname>simd_abi::compatible&lt;T&gt;</classname> is an alias for
+        <classname>simd_abi::_VecBuiltin&lt;16&gt;</classname>,
+        unless <code>T</code> is <code>long double</code>, in which case it is
+        an alias for <classname>simd_abi::scalar</classname>.
+        When targeting ARM (but not Aarch64) with NEON support,
+        <classname>simd_abi::compatible&lt;T&gt;</classname> is an alias for
+        <classname>simd_abi::_VecBuiltin&lt;16&gt;</classname>,
+        unless <code>sizeof(T) &gt; 4</code>, in which case it is
+        an alias for <classname>simd_abi::scalar</classname>. Additionally,
+        <classname>simd_abi::compatible&lt;float&gt;</classname> is an alias for
+        <classname>simd_abi::scalar</classname> unless compiling with
+        -ffast-math.
+     </para>
+
+     <para>
+        When targeting x86 (both 32-bit and 64-bit),
+        <classname>simd_abi::native&lt;T&gt;</classname> is an alias for one of
+        <classname>simd_abi::scalar</classname>,
+        <classname>simd_abi::_VecBuiltin&lt;16&gt;</classname>,
+        <classname>simd_abi::_VecBuiltin&lt;32&gt;</classname>, or
+        <classname>simd_abi::_VecBltnBtmsk&lt;64&gt;</classname>, depending on
+        <code>T</code> and the machine options the compiler was invoked with.
+     </para>
+
+     <para>
+        When targeting ARM/Aarch64 or POWER,
+        <classname>simd_abi::native&lt;T&gt;</classname> is an alias for
+        <classname>simd_abi::scalar</classname> or
+        <classname>simd_abi::_VecBuiltin&lt;16&gt;</classname>, depending on
+        <code>T</code> and the machine options the compiler was invoked with.
+     </para>
+
+     <para>
+        For any other targeted machine
+        <classname>simd_abi::compatible&lt;T&gt;</classname> and
+        <classname>simd_abi::native&lt;T&gt;</classname> are aliases for
+        <classname>simd_abi::scalar</classname> (subject to change).
+     </para>
+
+     <para>
+        The extended ABI tag types defined in the
+        <code>std::experimental::parallelism_v2::simd_abi</code> namespace are:
+        <classname>simd_abi::_VecBuiltin&lt;Bytes&gt;</classname>, and
+        <classname>simd_abi::_VecBltnBtmsk&lt;Bytes&gt;</classname>.
+     </para>
+
+     <para>
+        <classname>simd_abi::deduce&lt;T, N, Abis...&gt;::type</classname>,
+        with <code>N &gt; 1</code> is an alias for an extended ABI tag, if a
+        supported extended ABI tag exists. Otherwise it is an alias for
+        <classname>simd_abi::fixed_size&lt;N&gt;</classname>. The <classname>
+        simd_abi::_VecBltnBtmsk</classname> ABI tag is preferred over
+        <classname>simd_abi::_VecBuiltin</classname>.
+     </para>
+
+     <para>
+        <emphasis>9.4 [parallel.simd.traits]</emphasis>
+        <classname>memory_alignment&lt;T, U&gt;::value</classname> is
+        <code>sizeof(U) * T::size()</code> rounded up to the next power-of-two
+        value.
+     </para>
+
+     <para>
+        <emphasis>9.6.1 [parallel.simd.overview]</emphasis>
+        On ARM, <classname>simd&lt;T, _VecBuiltin&lt;Bytes&gt;&gt;</classname>
+        is supported if <code>__ARM_NEON</code> is defined and
+        <code>sizeof(T) &lt;= 4</code>. Additionally,
+        <code>sizeof(T) == 8</code> with integral <code>T</code> is supported if
+        <code>__ARM_ARCH &gt;= 8</code>, and <code>double</code> is supported if
+        <code>__aarch64__</code> is defined.
+
+        On POWER, <classname>simd&lt;T, _VecBuiltin&lt;Bytes&gt;&gt;</classname>
+        is supported if <code>__ALTIVEC__</code> is defined and <code>sizeof(T)
+        &lt; 8</code>. Additionally, <code>double</code> is supported if
+        <code>__VSX__</code> is defined, and any <code>T</code> with <code>
+        sizeof(T) ≤ 8</code> is supported if <code>__POWER8_VECTOR__</code>
+        is defined.
+
+        On x86, given an extended ABI tag <code>Abi</code>,
+        <classname>simd&lt;T, Abi&gt;</classname> is supported according to the
+        following table:
+        <table frame="all" xml:id="table.par2ts_simd_support">
+          <title>Support for Extended ABI Tags</title>
+
+          <tgroup cols="4" align="left" colsep="0" rowsep="1">
+          <colspec colname="c1"/>
+          <colspec colname="c2"/>
+          <colspec colname="c3"/>
+          <colspec colname="c4"/>
+            <thead>
+              <row>
+                <entry>ABI tag <code>Abi</code></entry>
+                <entry>value type <code>T</code></entry>
+                <entry>values for <code>Bytes</code></entry>
+                <entry>required machine option</entry>
+              </row>
+            </thead>
+
+            <tbody>
+              <row>
+                <entry morerows="5">
+                  <classname>_VecBuiltin&lt;Bytes&gt;</classname>
+                </entry>
+                <entry morerows="1"><code>float</code></entry>
+                <entry>8, 12, 16</entry>
+                <entry>"-msse"</entry>
+              </row>
+
+              <row>
+                <entry>20, 24, 28, 32</entry>
+                <entry>"-mavx"</entry>
+              </row>
+
+              <row>
+                <entry morerows="1"><code>double</code></entry>
+                <entry>16</entry>
+                <entry>"-msse2"</entry>
+              </row>
+
+              <row>
+                <entry>24, 32</entry>
+                <entry>"-mavx"</entry>
+              </row>
+
+              <row>
+                <entry morerows="1">
+                  integral types other than <code>bool</code>
+                </entry>
+                <entry>
+                  <code>Bytes</code> ≤ 16 and <code>Bytes</code> divisible by
+                  <code>sizeof(T)</code>
+                </entry>
+                <entry>"-msse2"</entry>
+              </row>
+
+              <row>
+                <entry>
+                  16 &lt; <code>Bytes</code> ≤ 32 and <code>Bytes</code>
+                  divisible by <code>sizeof(T)</code>
+                </entry>
+                <entry>"-mavx2"</entry>
+              </row>
+
+              <row>
+                <entry morerows="1">
+                  <classname>_VecBuiltin&lt;Bytes&gt;</classname> and
+                  <classname>_VecBltnBtmsk&lt;Bytes&gt;</classname>
+                </entry>
+                <entry>
+                  vectorizable types with <code>sizeof(T)</code> ≥ 4
+                </entry>
+                <entry morerows="1">
+                  32 &lt; <code>Bytes</code> ≤ 64 and <code>Bytes</code>
+                  divisible by <code>sizeof(T)</code>
+                </entry>
+                <entry>"-mavx512f"</entry>
+              </row>
+
+              <row>
+                <entry>
+                  vectorizable types with <code>sizeof(T)</code> &lt; 4
+                </entry>
+                <entry>"-mavx512bw"</entry>
+              </row>
+
+              <row>
+                <entry morerows="1">
+                  <classname>_VecBltnBtmsk&lt;Bytes&gt;</classname>
+                </entry>
+                <entry>
+                  vectorizable types with <code>sizeof(T)</code> ≥ 4
+                </entry>
+                <entry morerows="1">
+                  <code>Bytes</code> ≤ 32 and <code>Bytes</code> divisible by
+                  <code>sizeof(T)</code>
+                </entry>
+                <entry>"-mavx512vl"</entry>
+              </row>
+
+              <row>
+                <entry>
+                  vectorizable types with <code>sizeof(T)</code> &lt; 4
+                </entry>
+                <entry>"-mavx512bw" and "-mavx512vl"</entry>
+              </row>
+
+            </tbody>
+          </tgroup>
+        </table>
+     </para>
+
+   </section>

 </section>

--- a/libstdc++-v3/include/Makefile.am
+++ b/libstdc++-v3/include/Makefile.am
@ -747,6 +747,7 @@ experimental_headers = \
 	${experimental_srcdir}/ratio \
 	${experimental_srcdir}/regex \
 	${experimental_srcdir}/set \
+	${experimental_srcdir}/simd \
 	${experimental_srcdir}/socket \
 	${experimental_srcdir}/source_location \
 	${experimental_srcdir}/string \
@ -766,7 +767,19 @@ experimental_bits_builddir = ./experimental/bits
 experimental_bits_headers = \
 	${experimental_bits_srcdir}/lfts_config.h \
 	${experimental_bits_srcdir}/net.h \
+	${experimental_bits_srcdir}/numeric_traits.h \
 	${experimental_bits_srcdir}/shared_ptr.h \
+	${experimental_bits_srcdir}/simd.h \
+	${experimental_bits_srcdir}/simd_builtin.h \
+	${experimental_bits_srcdir}/simd_converter.h \
+	${experimental_bits_srcdir}/simd_detail.h \
+	${experimental_bits_srcdir}/simd_fixed_size.h \
+	${experimental_bits_srcdir}/simd_math.h \
+	${experimental_bits_srcdir}/simd_neon.h \
+	${experimental_bits_srcdir}/simd_ppc.h \
+	${experimental_bits_srcdir}/simd_scalar.h \
+	${experimental_bits_srcdir}/simd_x86.h \
+	${experimental_bits_srcdir}/simd_x86_conversions.h \
 	${experimental_bits_srcdir}/string_view.tcc \
 	${experimental_bits_filesystem_headers}

--- a/libstdc++-v3/include/Makefile.in
+++ b/libstdc++-v3/include/Makefile.in
@ -1097,6 +1097,7 @@ experimental_headers = \
 	${experimental_srcdir}/ratio \
 	${experimental_srcdir}/regex \
 	${experimental_srcdir}/set \
+	${experimental_srcdir}/simd \
 	${experimental_srcdir}/socket \
 	${experimental_srcdir}/source_location \
 	${experimental_srcdir}/string \
@ -1116,7 +1117,19 @@ experimental_bits_builddir = ./experimental/bits
 experimental_bits_headers = \
 	${experimental_bits_srcdir}/lfts_config.h \
 	${experimental_bits_srcdir}/net.h \
+	${experimental_bits_srcdir}/numeric_traits.h \
 	${experimental_bits_srcdir}/shared_ptr.h \
+	${experimental_bits_srcdir}/simd.h \
+	${experimental_bits_srcdir}/simd_builtin.h \
+	${experimental_bits_srcdir}/simd_converter.h \
+	${experimental_bits_srcdir}/simd_detail.h \
+	${experimental_bits_srcdir}/simd_fixed_size.h \
+	${experimental_bits_srcdir}/simd_math.h \
+	${experimental_bits_srcdir}/simd_neon.h \
+	${experimental_bits_srcdir}/simd_ppc.h \
+	${experimental_bits_srcdir}/simd_scalar.h \
+	${experimental_bits_srcdir}/simd_x86.h \
+	${experimental_bits_srcdir}/simd_x86_conversions.h \
 	${experimental_bits_srcdir}/string_view.tcc \
 	${experimental_bits_filesystem_headers}

--- a/libstdc++-v3/include/experimental/bits/numeric_traits.h
+++ b/libstdc++-v3/include/experimental/bits/numeric_traits.h
@ -0,0 +1,567 @@
+// Definition of numeric_limits replacement traits P1841R1 -*- C++ -*-
+
+// Copyright (C) 2020 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+// <http://www.gnu.org/licenses/>.
+
+#include <type_traits>
+
+namespace std {
+
+template <template <typename> class _Trait, typename _Tp, typename = void>
+  struct __value_exists_impl : false_type {}; // fallback: _Trait<_Tp>::value is not a valid expression
+
+template <template <typename> class _Trait, typename _Tp>
+  struct __value_exists_impl<_Trait, _Tp, void_t<decltype(_Trait<_Tp>::value)>>
+  : true_type {}; // chosen when _Trait<_Tp>::value is well-formed
+
+template <typename _Tp, bool = is_arithmetic_v<_Tp>>
+  struct __digits_impl {}; // primary template: no `value` unless _Tp is arithmetic
+
+template <typename _Tp>
+  struct __digits_impl<_Tp, true>
+  {
+    static inline constexpr int value // width of _Tp in bits, minus one if _Tp is signed
+      = sizeof(_Tp) * __CHAR_BIT__ - is_signed_v<_Tp>; // NOTE(review): for bool this gives sizeof(bool) * __CHAR_BIT__, whereas numeric_limits<bool>::digits is 1 - confirm intended
+  };
+
+template <>
+  struct __digits_impl<float, true>
+  { static inline constexpr int value = __FLT_MANT_DIG__; }; // overrides the generic formula with the predefined macro
+
+template <>
+  struct __digits_impl<double, true>
+  { static inline constexpr int value = __DBL_MANT_DIG__; }; // overrides the generic formula with the predefined macro
+
+template <>
+  struct __digits_impl<long double, true>
+  { static inline constexpr int value = __LDBL_MANT_DIG__; }; // overrides the generic formula with the predefined macro
+
+template <typename _Tp, bool = is_arithmetic_v<_Tp>>
+  struct __digits10_impl {}; // primary template: no `value` unless _Tp is arithmetic
+
+template <typename _Tp>
+  struct __digits10_impl<_Tp, true>
+  {
+    // The fraction 643/2136 approximates log10(2) to 7 significant digits.
+    static inline constexpr int value = __digits_impl<_Tp>::value * 643L / 2136; // digits * log10(2), truncated by integer division
+  };
+
+template <>
+  struct __digits10_impl<float, true>
+  { static inline constexpr int value = __FLT_DIG__; }; // taken from the predefined macro instead of the formula
+
+template <>
+  struct __digits10_impl<double, true>
+  { static inline constexpr int value = __DBL_DIG__; }; // taken from the predefined macro instead of the formula
+
+template <>
+  struct __digits10_impl<long double, true>
+  { static inline constexpr int value = __LDBL_DIG__; }; // taken from the predefined macro instead of the formula
+
+template <typename _Tp, bool = is_arithmetic_v<_Tp>>
+  struct __max_digits10_impl {}; // primary template: no `value` unless _Tp is arithmetic
+
+template <typename _Tp>
+  struct __max_digits10_impl<_Tp, true>
+  {
+    static inline constexpr int value
+      = is_floating_point_v<_Tp> ? 2 + __digits_impl<_Tp>::value * 643L / 2136 // floating point: 2 + digits * 643/2136 (same log10(2) fraction as __digits10_impl)
+				 : __digits10_impl<_Tp>::value + 1; // other arithmetic types: digits10 + 1
+  };
+
+template <typename _Tp>
+  struct __max_exponent_impl {}; // no `value` except for the three specializations below
+
+template <>
+  struct __max_exponent_impl<float>
+  { static inline constexpr int value = __FLT_MAX_EXP__; }; // predefined macro for float
+
+template <>
+  struct __max_exponent_impl<double>
+  { static inline constexpr int value = __DBL_MAX_EXP__; }; // predefined macro for double
+
+template <>
+  struct __max_exponent_impl<long double>
+  { static inline constexpr int value = __LDBL_MAX_EXP__; }; // predefined macro for long double
+
+template <typename _Tp>
+  struct __max_exponent10_impl {}; // no `value` except for the three specializations below
+
+template <>
+  struct __max_exponent10_impl<float>
+  { static inline constexpr int value = __FLT_MAX_10_EXP__; }; // predefined macro for float
+
+template <>
+  struct __max_exponent10_impl<double>
+  { static inline constexpr int value = __DBL_MAX_10_EXP__; }; // predefined macro for double
+
+template <>
+  struct __max_exponent10_impl<long double>
+  { static inline constexpr int value = __LDBL_MAX_10_EXP__; }; // predefined macro for long double
+
+template <typename _Tp>
+  struct __min_exponent_impl {}; // no `value` except for the three specializations below
+
+template <>
+  struct __min_exponent_impl<float>
+  { static inline constexpr int value = __FLT_MIN_EXP__; }; // predefined macro for float
+
+template <>
+  struct __min_exponent_impl<double>
+  { static inline constexpr int value = __DBL_MIN_EXP__; }; // predefined macro for double
+
+template <>
+  struct __min_exponent_impl<long double>
+  { static inline constexpr int value = __LDBL_MIN_EXP__; }; // predefined macro for long double
+
+template <typename _Tp>
+  struct __min_exponent10_impl {}; // no `value` except for the three specializations below
+
+template <>
+  struct __min_exponent10_impl<float>
+  { static inline constexpr int value = __FLT_MIN_10_EXP__; }; // predefined macro for float
+
+template <>
+  struct __min_exponent10_impl<double>
+  { static inline constexpr int value = __DBL_MIN_10_EXP__; }; // predefined macro for double
+
+template <>
+  struct __min_exponent10_impl<long double>
+  { static inline constexpr int value = __LDBL_MIN_10_EXP__; }; // predefined macro for long double
+
+template <typename _Tp, bool = is_arithmetic_v<_Tp>>
+  struct __radix_impl {}; // primary template: no `value` unless _Tp is arithmetic
+
+template <typename _Tp>
+  struct __radix_impl<_Tp, true>
+  {
+    static inline constexpr int value
+      = is_floating_point_v<_Tp> ? __FLT_RADIX__ : 2; // floating point uses __FLT_RADIX__, every other arithmetic type uses 2
+  };
+
+// [num.traits.util], numeric utility traits
+template <template <typename> class _Trait, typename _Tp>
+  struct __value_exists : __value_exists_impl<_Trait, _Tp> {}; // true_type iff _Trait<_Tp>::value is well-formed
+
+template <template <typename> class _Trait, typename _Tp>
+  inline constexpr bool __value_exists_v = __value_exists<_Trait, _Tp>::value; // bool shorthand for the trait above
+
+template <template <typename> class _Trait, typename _Tp, typename _Up = _Tp>
+  inline constexpr _Up
+  __value_or(_Up __def = _Up()) noexcept // yields _Trait<_Tp>::value if it exists, otherwise the caller-supplied default
+  {
+    if constexpr (__value_exists_v<_Trait, _Tp>)
+      return static_cast<_Up>(_Trait<_Tp>::value); // trait value converted to the requested result type _Up
+    else
+      return __def; // no such trait value: fall back to __def (value-initialized _Up unless given)
+  }
+
+template <typename _Tp, bool = is_arithmetic_v<_Tp>>
+  struct __norm_min_impl {}; // primary template: no `value` unless _Tp is arithmetic
+
+template <typename _Tp>
+  struct __norm_min_impl<_Tp, true>
+  { static inline constexpr _Tp value = 1; }; // generic arithmetic case; the floating-point types are specialized below
+
+template <>
+  struct __norm_min_impl<float, true>
+  { static inline constexpr float value = __FLT_MIN__; }; // predefined macro for float
+
+template <>
+  struct __norm_min_impl<double, true>
+  { static inline constexpr double value = __DBL_MIN__; }; // predefined macro for double
+
+template <>
+  struct __norm_min_impl<long double, true>
+  { static inline constexpr long double value = __LDBL_MIN__; }; // predefined macro for long double
+
+template <typename _Tp>
+  struct __denorm_min_impl : __norm_min_impl<_Tp> {}; // default: reuse norm_min when no specialization below applies
+
+#if __FLT_HAS_DENORM__
+template <>
+  struct __denorm_min_impl<float>
+  { static inline constexpr float value = __FLT_DENORM_MIN__; }; // only compiled when __FLT_HAS_DENORM__ is nonzero
+#endif
+
+#if __DBL_HAS_DENORM__
+template <>
+  struct __denorm_min_impl<double>
+  { static inline constexpr double value = __DBL_DENORM_MIN__; }; // only compiled when __DBL_HAS_DENORM__ is nonzero
+#endif
+
+#if __LDBL_HAS_DENORM__
+template <>
+  struct __denorm_min_impl<long double>
+  { static inline constexpr long double value = __LDBL_DENORM_MIN__; }; // only compiled when __LDBL_HAS_DENORM__ is nonzero
+#endif
+
+template <typename _Tp>
+  struct __epsilon_impl {}; // no `value` except for the three specializations below
+
+template <>
+  struct __epsilon_impl<float>
+  { static inline constexpr float value = __FLT_EPSILON__; }; // predefined macro for float
+
+template <>
+  struct __epsilon_impl<double>
+  { static inline constexpr double value = __DBL_EPSILON__; }; // predefined macro for double
+
+template <>
+  struct __epsilon_impl<long double>
+  { static inline constexpr long double value = __LDBL_EPSILON__; }; // predefined macro for long double
+
+template <typename _Tp, bool = is_arithmetic_v<_Tp>>
+  struct __finite_min_impl {}; // primary template: no `value` unless _Tp is arithmetic
+
+template <typename _Tp>
+  struct __finite_min_impl<_Tp, true>
+  {
+    static inline constexpr _Tp value
+      = is_unsigned_v<_Tp> ? _Tp() // unsigned types: value-initialized _Tp, i.e. zero
+			   : -2 * (_Tp(1) << __digits_impl<_Tp>::value - 1); // otherwise -2 * 2^(digits - 1); '-' binds tighter than '<<', so the shift count is digits - 1
+  };
+
+template <>
+  struct __finite_min_impl<float, true>
+  { static inline constexpr float value = -__FLT_MAX__; }; // negated largest finite float
+
+template <>
+  struct __finite_min_impl<double, true>
+  { static inline constexpr double value = -__DBL_MAX__; }; // negated largest finite double
+
+template <>
+  struct __finite_min_impl<long double, true>
+  { static inline constexpr long double value = -__LDBL_MAX__; }; // negated largest finite long double
+
+template <typename _Tp, bool = is_arithmetic_v<_Tp>>
+  struct __finite_max_impl {}; // primary template: no `value` unless _Tp is arithmetic
+
+template <typename _Tp>
+  struct __finite_max_impl<_Tp, true>
+  { static inline constexpr _Tp value = ~__finite_min_impl<_Tp>::value; }; // generic case: bitwise complement of finite_min
+
+template <>
+  struct __finite_max_impl<float, true>
+  { static inline constexpr float value = __FLT_MAX__; }; // predefined macro for float
+
+template <>
+  struct __finite_max_impl<double, true>
+  { static inline constexpr double value = __DBL_MAX__; }; // predefined macro for double
+
+template <>
+  struct __finite_max_impl<long double, true>
+  { static inline constexpr long double value = __LDBL_MAX__; }; // predefined macro for long double
+
+template <typename _Tp>
+  struct __infinity_impl {}; // no `value` unless one of the guarded specializations below is compiled
+
+#if __FLT_HAS_INFINITY__
+template <>
+  struct __infinity_impl<float>
+  { static inline constexpr float value = __builtin_inff(); }; // only compiled when __FLT_HAS_INFINITY__ is nonzero
+#endif
+
+#if __DBL_HAS_INFINITY__
+template <>
+  struct __infinity_impl<double>
+  { static inline constexpr double value = __builtin_inf(); }; // only compiled when __DBL_HAS_INFINITY__ is nonzero
+#endif
+
+#if __LDBL_HAS_INFINITY__
+template <>
+  struct __infinity_impl<long double>
+  { static inline constexpr long double value = __builtin_infl(); }; // only compiled when __LDBL_HAS_INFINITY__ is nonzero
+#endif
+
+template <typename _Tp>
+  struct __quiet_NaN_impl {}; // no `value` unless one of the guarded specializations below is compiled
+
+#if __FLT_HAS_QUIET_NAN__
+template <>
+  struct __quiet_NaN_impl<float>
+  { static inline constexpr float value = __builtin_nanf(""); }; // only compiled when __FLT_HAS_QUIET_NAN__ is nonzero
+#endif
+
+#if __DBL_HAS_QUIET_NAN__
+template <>
+  struct __quiet_NaN_impl<double>
+  { static inline constexpr double value = __builtin_nan(""); }; // only compiled when __DBL_HAS_QUIET_NAN__ is nonzero
+#endif
+
+#if __LDBL_HAS_QUIET_NAN__
+template <>
+  struct __quiet_NaN_impl<long double>
+  { static inline constexpr long double value = __builtin_nanl(""); }; // only compiled when __LDBL_HAS_QUIET_NAN__ is nonzero
+#endif
+
+template <typename _Tp, bool = is_floating_point_v<_Tp>>
+  struct __reciprocal_overflow_threshold_impl {}; // primary template: no `value` unless _Tp is floating point
+
+template <typename _Tp>
+  struct __reciprocal_overflow_threshold_impl<_Tp, true>
+  {
+    // This typically yields a subnormal value. Is this incorrect for
+    // flush-to-zero configurations?
+    static constexpr _Tp _S_search(_Tp __ok, _Tp __overflows) // recursive bisection between __ok and __overflows
+    {
+      const _Tp __mid = (__ok + __overflows) / 2;
+      // 1/__mid without -ffast-math is not a constant expression if it
+      // overflows. Therefore divide 1 by the radix before division.
+      // Consequently finite_max (the threshold) must be scaled by the
+      // same value.
+      if (__mid == __ok || __mid == __overflows)
+	return __ok; // midpoint coincides with a bound, so the interval cannot be split further
+      else if (_Tp(1) / (__radix_impl<_Tp>::value * __mid)
+	       <= __finite_max_impl<_Tp>::value / __radix_impl<_Tp>::value)
+	return _S_search(__mid, __overflows); // scaled reciprocal of __mid stays within scaled finite_max: __mid is the new __ok
+      else
+	return _S_search(__ok, __mid); // scaled reciprocal of __mid exceeds scaled finite_max: __mid is the new __overflows
+    }
+
+    static inline constexpr _Tp value // search starts from the bracket 1.01 / finite_max and 0.99 / finite_max
+      = _S_search(_Tp(1.01) / __finite_max_impl<_Tp>::value,
+		  _Tp(0.99) / __finite_max_impl<_Tp>::value);
+  };
+
+template <typename _Tp, bool = is_floating_point_v<_Tp>>
+  struct __round_error_impl {}; // primary template: no `value` unless _Tp is floating point
+
+template <typename _Tp>
+  struct __round_error_impl<_Tp, true>
+  { static inline constexpr _Tp value = 0.5; }; // the same constant for every floating-point _Tp
+
+template <typename _Tp>
+  struct __signaling_NaN_impl {}; // no `value` unless a specialization below is compiled; note they are gated on the *_HAS_QUIET_NAN__ macros
+
+#if __FLT_HAS_QUIET_NAN__
+template <>
+  struct __signaling_NaN_impl<float>
+  { static inline constexpr float value = __builtin_nansf(""); }; // signaling NaN builtin for float
+#endif
+
+#if __DBL_HAS_QUIET_NAN__
+template <>
+  struct __signaling_NaN_impl<double>
+  { static inline constexpr double value = __builtin_nans(""); }; // signaling NaN builtin for double
+#endif
+
+#if __LDBL_HAS_QUIET_NAN__
+template <>
+  struct __signaling_NaN_impl<long double>
+  { static inline constexpr long double value = __builtin_nansl(""); }; // signaling NaN builtin for long double
+#endif
+
+// [num.traits.val], numeric distinguished value traits (each wrapper strips cv-qualifiers from _Tp and inherits from the matching _impl template)
+template <typename _Tp>
+  struct __denorm_min : __denorm_min_impl<remove_cv_t<_Tp>> {};
+
+template <typename _Tp>
+  struct __epsilon : __epsilon_impl<remove_cv_t<_Tp>> {};
+
+template <typename _Tp>
+  struct __finite_max : __finite_max_impl<remove_cv_t<_Tp>> {};
+
+template <typename _Tp>
+  struct __finite_min : __finite_min_impl<remove_cv_t<_Tp>> {};
+
+template <typename _Tp>
+  struct __infinity : __infinity_impl<remove_cv_t<_Tp>> {};
+
+template <typename _Tp>
+  struct __norm_min : __norm_min_impl<remove_cv_t<_Tp>> {};
+
+template <typename _Tp>
+  struct __quiet_NaN : __quiet_NaN_impl<remove_cv_t<_Tp>> {};
+
+template <typename _Tp>
+  struct __reciprocal_overflow_threshold
+  : __reciprocal_overflow_threshold_impl<remove_cv_t<_Tp>> {};
+
+template <typename _Tp>
+  struct __round_error : __round_error_impl<remove_cv_t<_Tp>> {};
+
+template <typename _Tp>
+  struct __signaling_NaN : __signaling_NaN_impl<remove_cv_t<_Tp>> {};
+
+template <typename _Tp>
+  inline constexpr auto __denorm_min_v = __denorm_min<_Tp>::value; // the _v variables below are shorthands for ::value and require that member to exist for _Tp
+
+template <typename _Tp>
+  inline constexpr auto __epsilon_v = __epsilon<_Tp>::value;
+
+template <typename _Tp>
+  inline constexpr auto __finite_max_v = __finite_max<_Tp>::value;
+
+template <typename _Tp>
+  inline constexpr auto __finite_min_v = __finite_min<_Tp>::value;
+
+template <typename _Tp>
+  inline constexpr auto __infinity_v = __infinity<_Tp>::value;
+
+template <typename _Tp>
+  inline constexpr auto __norm_min_v = __norm_min<_Tp>::value;
+
+template <typename _Tp>
+  inline constexpr auto __quiet_NaN_v = __quiet_NaN<_Tp>::value;
+
+template <typename _Tp>
+  inline constexpr auto __reciprocal_overflow_threshold_v
+    = __reciprocal_overflow_threshold<_Tp>::value;
+
+template <typename _Tp>
+  inline constexpr auto __round_error_v = __round_error<_Tp>::value;
+
+template <typename _Tp>
+  inline constexpr auto __signaling_NaN_v = __signaling_NaN<_Tp>::value;
+
+// [num.traits.char], numeric characteristics traits (each wrapper strips cv-qualifiers from _Tp and inherits from the matching _impl template)
+template <typename _Tp>
+  struct __digits : __digits_impl<remove_cv_t<_Tp>> {};
+
+template <typename _Tp>
+  struct __digits10 : __digits10_impl<remove_cv_t<_Tp>> {};
+
+template <typename _Tp>
+  struct __max_digits10 : __max_digits10_impl<remove_cv_t<_Tp>> {};
+
+template <typename _Tp>
+  struct __max_exponent : __max_exponent_impl<remove_cv_t<_Tp>> {};
+
+template <typename _Tp>
+  struct __max_exponent10 : __max_exponent10_impl<remove_cv_t<_Tp>> {};
+
+template <typename _Tp>
+  struct __min_exponent : __min_exponent_impl<remove_cv_t<_Tp>> {};
+
+template <typename _Tp>
+  struct __min_exponent10 : __min_exponent10_impl<remove_cv_t<_Tp>> {};
+
+template <typename _Tp>
+  struct __radix : __radix_impl<remove_cv_t<_Tp>> {};
+
+template <typename _Tp>
+  inline constexpr auto __digits_v = __digits<_Tp>::value; // the _v variables below are shorthands for ::value and require that member to exist for _Tp
+
+template <typename _Tp>
+  inline constexpr auto __digits10_v = __digits10<_Tp>::value;
+
+template <typename _Tp>
+  inline constexpr auto __max_digits10_v = __max_digits10<_Tp>::value;
+
+template <typename _Tp>
+  inline constexpr auto __max_exponent_v = __max_exponent<_Tp>::value;
+
+template <typename _Tp>
+  inline constexpr auto __max_exponent10_v = __max_exponent10<_Tp>::value;
+
+template <typename _Tp>
+  inline constexpr auto __min_exponent_v = __min_exponent<_Tp>::value;
+
+template <typename _Tp>
+  inline constexpr auto __min_exponent10_v = __min_exponent10<_Tp>::value;
+
+template <typename _Tp>
+  inline constexpr auto __radix_v = __radix<_Tp>::value;
+
+// mkretz's extensions
+// TODO: does GCC tell me? __GCC_IEC_559 >= 2 is not the right answer
+template <typename _Tp>
+  struct __has_iec559_storage_format : true_type {}; // currently true for every _Tp, with no per-type check (see TODO above)
+
+template <typename _Tp>
+  inline constexpr bool __has_iec559_storage_format_v
+    = __has_iec559_storage_format<_Tp>::value; // bool shorthand for the trait above
+
+/* To propose:
+   If __has_iec559_behavior<__quiet_NaN, T> is true the following holds:
+     - nan == nan is false
+     - isnan(nan) is true
+     - isnan(nan + x) is true
+     - isnan(inf/inf) is true
+     - isnan(0/0) is true
+     - isunordered(nan, x) is true
+
+   If __has_iec559_behavior<__infinity, T> is true the following holds (x is
+   neither nan nor inf):
+     - isinf(inf) is true
+     - isinf(inf + x) is true
+     - isinf(1/0) is true
+ */
+template <template <typename> class _Trait, typename _Tp>
+  struct __has_iec559_behavior : false_type {}; // default: false unless one of the specializations below is compiled
+
+template <template <typename> class _Trait, typename _Tp>
+  inline constexpr bool __has_iec559_behavior_v
+    = __has_iec559_behavior<_Trait, _Tp>::value; // bool shorthand for the trait above
+
+#if !__FINITE_MATH_ONLY__
+#if __FLT_HAS_QUIET_NAN__
+template <>
+  struct __has_iec559_behavior<__quiet_NaN, float> : true_type {};
+#endif
+
+#if __DBL_HAS_QUIET_NAN__
+template <>
+  struct __has_iec559_behavior<__quiet_NaN, double> : true_type {};
+#endif
+
+#if __LDBL_HAS_QUIET_NAN__
+template <>
+  struct __has_iec559_behavior<__quiet_NaN, long double> : true_type {};
+#endif
+
+#if __FLT_HAS_INFINITY__
+template <>
+  struct __has_iec559_behavior<__infinity, float> : true_type {};
+#endif
+
+#if __DBL_HAS_INFINITY__
+template <>
+  struct __has_iec559_behavior<__infinity, double> : true_type {};
+#endif
+
+#if __LDBL_HAS_INFINITY__
+template <>
+  struct __has_iec559_behavior<__infinity, long double> : true_type {};
+#endif
+
+#ifdef __SUPPORT_SNAN__
+#if __FLT_HAS_QUIET_NAN__
+template <>
+  struct __has_iec559_behavior<__signaling_NaN, float> : true_type {}; // signaling-NaN entries additionally require __SUPPORT_SNAN__ to be defined
+#endif
+
+#if __DBL_HAS_QUIET_NAN__
+template <>
+  struct __has_iec559_behavior<__signaling_NaN, double> : true_type {};
+#endif
+
+#if __LDBL_HAS_QUIET_NAN__
+template <>
+  struct __has_iec559_behavior<__signaling_NaN, long double> : true_type {};
+#endif
+
+#endif // __SUPPORT_SNAN__
+#endif // !__FINITE_MATH_ONLY__
+
+} // namespace std
--- a/libstdc++-v3/include/experimental/bits/simd.h
+++ b/libstdc++-v3/include/experimental/bits/simd.h
--- a/libstdc++-v3/include/experimental/bits/simd_builtin.h
+++ b/libstdc++-v3/include/experimental/bits/simd_builtin.h
--- a/libstdc++-v3/include/experimental/bits/simd_converter.h
+++ b/libstdc++-v3/include/experimental/bits/simd_converter.h
@ -0,0 +1,354 @@
+// Generic simd conversions -*- C++ -*-
+
+// Copyright (C) 2020 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_CONVERTER_H_
+#define _GLIBCXX_EXPERIMENTAL_SIMD_CONVERTER_H_
+
+#if __cplusplus >= 201703L
+
+_GLIBCXX_SIMD_BEGIN_NAMESPACE
+// _SimdConverter scalar -> scalar (only for _From != _To): static_cast {{{
+template <typename _From, typename _To>
+  struct _SimdConverter<_From, simd_abi::scalar, _To, simd_abi::scalar,
+			enable_if_t<!is_same_v<_From, _To>>>
+  {
+    _GLIBCXX_SIMD_INTRINSIC constexpr _To operator()(_From __a) const noexcept
+    { return static_cast<_To>(__a); }
+  };
+
+// }}}
+// _SimdConverter scalar -> "native" (one scalar argument per element) {{{
+template <typename _From, typename _To, typename _Abi>
+  struct _SimdConverter<_From, simd_abi::scalar, _To, _Abi,
+			enable_if_t<!is_same_v<_Abi, simd_abi::scalar>>>
+  {
+    using _Ret = typename _Abi::template __traits<_To>::_SimdMember;
+
+    template <typename... _More>
+      _GLIBCXX_SIMD_INTRINSIC constexpr _Ret
+      operator()(_From __a, _More... __more) const noexcept
+      {
+	static_assert(sizeof...(_More) + 1 == _Abi::template _S_size<_To>);
+	static_assert(conjunction_v<is_same<_From, _More>...>); // all _From
+	return __make_vector<_To>(__a, __more...);
+      }
+  };
+
+// }}}
+// _SimdConverter "native 1" -> "native 2" (no scalar or fixed_size ABI) {{{
+template <typename _From, typename _To, typename _AFrom, typename _ATo>
+  struct _SimdConverter<
+    _From, _AFrom, _To, _ATo,
+    enable_if_t<!disjunction_v<
+      __is_fixed_size_abi<_AFrom>, __is_fixed_size_abi<_ATo>,
+      is_same<_AFrom, simd_abi::scalar>, is_same<_ATo, simd_abi::scalar>,
+      conjunction<is_same<_From, _To>, is_same<_AFrom, _ATo>>>>>
+  {
+    using _Arg = typename _AFrom::template __traits<_From>::_SimdMember;
+    using _Ret = typename _ATo::template __traits<_To>::_SimdMember;
+    using _V = __vector_type_t<_To, simd_size_v<_To, _ATo>>;
+
+    template <typename... _More>
+      _GLIBCXX_SIMD_INTRINSIC constexpr _Ret
+      operator()(_Arg __a, _More... __more) const noexcept
+      { return __vector_convert<_V>(__a, __more...); } // all inputs -> one _V
+  };
+
+// }}}
+// _SimdConverter scalar -> fixed_size<1> {{{1
+template <typename _From, typename _To>
+  struct _SimdConverter<_From, simd_abi::scalar, _To, simd_abi::fixed_size<1>,
+			void>
+  {
+    _GLIBCXX_SIMD_INTRINSIC constexpr _SimdTuple<_To, simd_abi::scalar>
+    operator()(_From __x) const noexcept
+    { return {static_cast<_To>(__x)}; } // cast, then wrap in the tuple
+  };
+
+// _SimdConverter fixed_size<1> -> scalar {{{1
+template <typename _From, typename _To>
+  struct _SimdConverter<_From, simd_abi::fixed_size<1>, _To, simd_abi::scalar,
+			void>
+  {
+    _GLIBCXX_SIMD_INTRINSIC constexpr _To
+    operator()(_SimdTuple<_From, simd_abi::scalar> __x) const noexcept
+    { return {static_cast<_To>(__x.first)}; } // cast the tuple's .first
+  };
+
+// _SimdConverter fixed_size<_Np> -> fixed_size<_Np> (only _From != _To) {{{1
+template <typename _From, typename _To, int _Np>
+  struct _SimdConverter<_From, simd_abi::fixed_size<_Np>, _To,
+			simd_abi::fixed_size<_Np>,
+			enable_if_t<!is_same_v<_From, _To>>>
+  {
+    using _Ret = __fixed_size_storage_t<_To, _Np>;
+    using _Arg = __fixed_size_storage_t<_From, _Np>;
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr _Ret
+    operator()(const _Arg& __x) const noexcept
+    {
+      // The enable_if_t constraint above excludes _From == _To, so no identity
+      // conversion branch is needed here.
+
+      // special case (optimize) same-size integer casts (e.g. signedness)
+      if constexpr (sizeof(_From) == sizeof(_To)
+		    && is_integral_v<_From> && is_integral_v<_To>)
+	return __bit_cast<_Ret>(__x);
+
+      // special case if _Ret's first ABI tag is scalar (presumably all are)
+      else if constexpr (__is_scalar_abi<typename _Ret::_FirstAbi>())
+	{
+	  return __call_with_subscripts(
+	    __x, make_index_sequence<_Np>(),
+	    [](auto... __values) constexpr->_Ret {
+	      return __make_simd_tuple<_To, decltype((void) __values,
+						     simd_abi::scalar())...>(
+		static_cast<_To>(__values)...);
+	    });
+	}
+
+      // from one vector to one vector (first chunks hold equally many elements)
+      else if constexpr (_Arg::_S_first_size == _Ret::_S_first_size)
+	{
+	  _SimdConverter<_From, typename _Arg::_FirstAbi, _To,
+			 typename _Ret::_FirstAbi>
+	    __native_cvt;
+	  if constexpr (_Arg::_S_tuple_size == 1)
+	    return {__native_cvt(__x.first)};
+	  else
+	    {
+	      constexpr size_t _NRemain = _Np - _Arg::_S_first_size;
+	      _SimdConverter<_From, simd_abi::fixed_size<_NRemain>, _To,
+			     simd_abi::fixed_size<_NRemain>>
+		__remainder_cvt;
+	      return {__native_cvt(__x.first), __remainder_cvt(__x.second)};
+	    }
+	}
+
+      // from one vector to multiple vectors (_Arg's first chunk is larger)
+      else if constexpr (_Arg::_S_first_size > _Ret::_S_first_size)
+	{
+	  const auto __multiple_return_chunks
+	    = __convert_all<__vector_type_t<_To, _Ret::_S_first_size>>(
+	      __x.first);
+	  constexpr auto __converted = __multiple_return_chunks.size()
+				       * _Ret::_FirstAbi::template _S_size<_To>;
+	  constexpr auto __remaining = _Np - __converted;
+	  if constexpr (_Arg::_S_tuple_size == 1 && __remaining == 0)
+	    return __to_simd_tuple<_To, _Np>(__multiple_return_chunks);
+	  else if constexpr (_Arg::_S_tuple_size == 1)
+	    { // e.g. <int, 3> -> <double, 2, 1> or <short, 7> -> <double, 4, 2,
+	      // 1>
+	      using _RetRem
+		= __remove_cvref_t<decltype(__simd_tuple_pop_front<__converted>(
+		  _Ret()))>;
+	      const auto __return_chunks2
+		= __convert_all<__vector_type_t<_To, _RetRem::_S_first_size>, 0,
+				__converted>(__x.first);
+	      constexpr auto __converted2
+		= __converted
+		  + __return_chunks2.size() * _RetRem::_S_first_size;
+	      if constexpr (__converted2 == _Np)
+		return __to_simd_tuple<_To, _Np>(__multiple_return_chunks,
+						 __return_chunks2);
+	      else
+		{
+		  using _RetRem2 = __remove_cvref_t<
+		    decltype(__simd_tuple_pop_front<__return_chunks2.size()
+						    * _RetRem::_S_first_size>(
+		      _RetRem()))>;
+		  const auto __return_chunks3 = __convert_all<
+		    __vector_type_t<_To, _RetRem2::_S_first_size>, 0,
+		    __converted2>(__x.first);
+		  constexpr auto __converted3
+		    = __converted2
+		      + __return_chunks3.size() * _RetRem2::_S_first_size;
+		  if constexpr (__converted3 == _Np)
+		    return __to_simd_tuple<_To, _Np>(__multiple_return_chunks,
+						     __return_chunks2,
+						     __return_chunks3);
+		  else
+		    {
+		      using _RetRem3
+			= __remove_cvref_t<decltype(__simd_tuple_pop_front<
+						    __return_chunks3.size()
+						    * _RetRem2::_S_first_size>(
+			  _RetRem2()))>;
+		      const auto __return_chunks4 = __convert_all<
+			__vector_type_t<_To, _RetRem3::_S_first_size>, 0,
+			__converted3>(__x.first);
+		      constexpr auto __converted4
+			= __converted3
+			  + __return_chunks4.size() * _RetRem3::_S_first_size;
+		      if constexpr (__converted4 == _Np)
+			return __to_simd_tuple<_To, _Np>(
+			  __multiple_return_chunks, __return_chunks2,
+			  __return_chunks3, __return_chunks4);
+		      else
+			__assert_unreachable<_To>();
+		    }
+		}
+	    }
+	  else
+	    {
+	      constexpr size_t _NRemain = _Np - _Arg::_S_first_size;
+	      _SimdConverter<_From, simd_abi::fixed_size<_NRemain>, _To,
+			     simd_abi::fixed_size<_NRemain>>
+		__remainder_cvt;
+	      return __simd_tuple_concat(
+		__to_simd_tuple<_To, _Arg::_S_first_size>(
+		  __multiple_return_chunks),
+		__remainder_cvt(__x.second));
+	    }
+	}
+
+      // from multiple vectors to one vector
+      // _Arg::_S_first_size < _Ret::_S_first_size
+      // a) heterogeneous input at the end of the tuple (possible with partial
+      //    native registers in _Ret)
+      else if constexpr (_Ret::_S_tuple_size == 1
+			 && _Np % _Arg::_S_first_size != 0)
+	{
+	  static_assert(_Ret::_FirstAbi::template _S_is_partial<_To>);
+	  return _Ret{__generate_from_n_evaluations<
+	    _Np, typename _VectorTraits<typename _Ret::_FirstType>::type>(
+	    [&](auto __i) { return static_cast<_To>(__x[__i]); })};
+	}
+      else
+	{
+	  static_assert(_Arg::_S_tuple_size > 1);
+	  constexpr auto __n
+	    = __div_roundup(_Ret::_S_first_size, _Arg::_S_first_size);
+	  return __call_with_n_evaluations<__n>(
+	    [&__x](auto... __uncvted) {
+	      // assuming _Arg Abi tags for all __i are _Arg::_FirstAbi
+	      _SimdConverter<_From, typename _Arg::_FirstAbi, _To,
+			     typename _Ret::_FirstAbi>
+		__native_cvt;
+	      if constexpr (_Ret::_S_tuple_size == 1)
+		return _Ret{__native_cvt(__uncvted...)};
+	      else
+		return _Ret{
+		  __native_cvt(__uncvted...),
+		  _SimdConverter<
+		    _From, simd_abi::fixed_size<_Np - _Ret::_S_first_size>, _To,
+		    simd_abi::fixed_size<_Np - _Ret::_S_first_size>>()(
+		    __simd_tuple_pop_front<_Ret::_S_first_size>(__x))};
+	    },
+	    [&__x](auto __i) { return __get_tuple_at<__i>(__x); });
+	}
+    }
+  };
+
+// _SimdConverter "native" -> fixed_size<_Np> {{{1
+// i.e. 1 register to ? registers (element counts must be equal)
+template <typename _From, typename _Ap, typename _To, int _Np>
+  struct _SimdConverter<_From, _Ap, _To, simd_abi::fixed_size<_Np>,
+			enable_if_t<!__is_fixed_size_abi_v<_Ap>>>
+  {
+    static_assert(
+      _Np == simd_size_v<_From, _Ap>,
+      "_SimdConverter to fixed_size only works for equal element counts");
+
+    using _Ret = __fixed_size_storage_t<_To, _Np>;
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr _Ret
+    operator()(typename _SimdTraits<_From, _Ap>::_SimdMember __x) const noexcept
+    {
+      if constexpr (_Ret::_S_tuple_size == 1) // result is a single chunk
+	return {__vector_convert<typename _Ret::_FirstType::_BuiltinType>(__x)};
+      else
+	{ // repack __x as fixed_size storage, then defer to fixed -> fixed
+	  using _FixedNp = simd_abi::fixed_size<_Np>;
+	  _SimdConverter<_From, _FixedNp, _To, _FixedNp> __fixed_cvt;
+	  using _FromFixedStorage = __fixed_size_storage_t<_From, _Np>;
+	  if constexpr (_FromFixedStorage::_S_tuple_size == 1)
+	    return __fixed_cvt(_FromFixedStorage{__x});
+	  else if constexpr (_FromFixedStorage::_S_tuple_size == 2)
+	    { // copy the bytes of __x into the two tuple members, in order
+	      _FromFixedStorage __tmp;
+	      static_assert(sizeof(__tmp) <= sizeof(__x));
+	      __builtin_memcpy(&__tmp.first, &__x, sizeof(__tmp.first));
+	      __builtin_memcpy(&__tmp.second.first,
+			       reinterpret_cast<const char*>(&__x)
+				 + sizeof(__tmp.first),
+			       sizeof(__tmp.second.first));
+	      return __fixed_cvt(__tmp);
+	    }
+	  else // source storage with more than two chunks is not handled
+	    __assert_unreachable<_From>();
+	}
+    }
+  };
+
+// _SimdConverter fixed_size<_Np> -> "native" {{{1
+// i.e. ? registers to 1 register (element counts must be equal)
+template <typename _From, int _Np, typename _To, typename _Ap>
+  struct _SimdConverter<_From, simd_abi::fixed_size<_Np>, _To, _Ap,
+			enable_if_t<!__is_fixed_size_abi_v<_Ap>>>
+  {
+    static_assert(
+      _Np == simd_size_v<_To, _Ap>,
+      "_SimdConverter from fixed_size only works for equal element counts");
+
+    using _Arg = __fixed_size_storage_t<_From, _Np>;
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr
+      typename _SimdTraits<_To, _Ap>::_SimdMember
+      operator()(_Arg __x) const noexcept
+    {
+      if constexpr (_Arg::_S_tuple_size == 1) // single chunk: convert directly
+	return __vector_convert<__vector_type_t<_To, _Np>>(__x.first);
+      else if constexpr (_Arg::_S_is_homogeneous)
+	return __call_with_n_evaluations<_Arg::_S_tuple_size>(
+	  [](auto... __members) {
+	    if constexpr ((is_convertible_v<decltype(__members), _To> && ...))
+	      return __vector_type_t<_To, _Np>{static_cast<_To>(__members)...};
+	    else
+	      return __vector_convert<__vector_type_t<_To, _Np>>(__members...);
+	  },
+	  [&](auto __i) { return __get_tuple_at<__i>(__x); });
+      else if constexpr (__fixed_size_storage_t<_To, _Np>::_S_tuple_size == 1)
+	{ // fixed -> fixed yields a single chunk; return its .first
+	  _SimdConverter<_From, simd_abi::fixed_size<_Np>, _To,
+			 simd_abi::fixed_size<_Np>>
+	    __fixed_cvt;
+	  return __fixed_cvt(__x).first;
+	}
+      else
+	{ // fallback: build one _From vector from __x[__i], then convert it
+	  const _SimdWrapper<_From, _Np> __xv
+	    = __generate_from_n_evaluations<_Np, __vector_type_t<_From, _Np>>(
+	      [&](auto __i) { return __x[__i]; });
+	  return __vector_convert<__vector_type_t<_To, _Np>>(__xv);
+	}
+    }
+  };
+
+// }}}1
+_GLIBCXX_SIMD_END_NAMESPACE
+#endif // __cplusplus >= 201703L
+#endif // _GLIBCXX_EXPERIMENTAL_SIMD_CONVERTER_H_
+
+// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80
--- a/libstdc++-v3/include/experimental/bits/simd_detail.h
+++ b/libstdc++-v3/include/experimental/bits/simd_detail.h
@ -0,0 +1,306 @@
+// Internal macros for the simd implementation -*- C++ -*-
+
+// Copyright (C) 2020 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_DETAIL_H_
+#define _GLIBCXX_EXPERIMENTAL_SIMD_DETAIL_H_
+
+#if __cplusplus >= 201703L
+
+#include <cstddef>
+#include <cstdint>
+
+
+#define _GLIBCXX_SIMD_BEGIN_NAMESPACE                                          \
+  namespace std _GLIBCXX_VISIBILITY(default)                                   \
+  {                                                                            \
+    _GLIBCXX_BEGIN_NAMESPACE_VERSION                                           \
+      namespace experimental {                                                 \
+      inline namespace parallelism_v2 { // std::experimental::parallelism_v2
+#define _GLIBCXX_SIMD_END_NAMESPACE                                            \
+  }                                                                            \
+  }                                                                            \
+  _GLIBCXX_END_NAMESPACE_VERSION                                               \
+  } // closes everything opened by _GLIBCXX_SIMD_BEGIN_NAMESPACE
+
+// ISA extension detection. The following defines all the _GLIBCXX_SIMD_HAVE_XXX
+// macros (each is always defined, to either 0 or 1). ARM {{{
+#if defined __ARM_NEON
+#define _GLIBCXX_SIMD_HAVE_NEON 1
+#else
+#define _GLIBCXX_SIMD_HAVE_NEON 0
+#endif // __ARM_NEON
+#if defined __ARM_NEON && (__ARM_ARCH >= 8 || defined __aarch64__)
+#define _GLIBCXX_SIMD_HAVE_NEON_A32 1
+#else
+#define _GLIBCXX_SIMD_HAVE_NEON_A32 0
+#endif // NEON with __ARM_ARCH >= 8 or __aarch64__
+#if defined __ARM_NEON && defined __aarch64__
+#define _GLIBCXX_SIMD_HAVE_NEON_A64 1
+#else
+#define _GLIBCXX_SIMD_HAVE_NEON_A64 0
+#endif // NEON with __aarch64__
+//}}}
+// x86: map the compiler's predefined macros to 0/1 feature macros {{{
+#ifdef __MMX__
+#define _GLIBCXX_SIMD_HAVE_MMX 1
+#else
+#define _GLIBCXX_SIMD_HAVE_MMX 0
+#endif
+#if defined __SSE__ || defined __x86_64__ // SSE is assumed on x86_64
+#define _GLIBCXX_SIMD_HAVE_SSE 1
+#else
+#define _GLIBCXX_SIMD_HAVE_SSE 0
+#endif
+#if defined __SSE2__ || defined __x86_64__ // SSE2 is assumed on x86_64
+#define _GLIBCXX_SIMD_HAVE_SSE2 1
+#else
+#define _GLIBCXX_SIMD_HAVE_SSE2 0
+#endif
+#ifdef __SSE3__
+#define _GLIBCXX_SIMD_HAVE_SSE3 1
+#else
+#define _GLIBCXX_SIMD_HAVE_SSE3 0
+#endif
+#ifdef __SSSE3__
+#define _GLIBCXX_SIMD_HAVE_SSSE3 1
+#else
+#define _GLIBCXX_SIMD_HAVE_SSSE3 0
+#endif
+#ifdef __SSE4_1__
+#define _GLIBCXX_SIMD_HAVE_SSE4_1 1
+#else
+#define _GLIBCXX_SIMD_HAVE_SSE4_1 0
+#endif
+#ifdef __SSE4_2__
+#define _GLIBCXX_SIMD_HAVE_SSE4_2 1
+#else
+#define _GLIBCXX_SIMD_HAVE_SSE4_2 0
+#endif
+#ifdef __XOP__
+#define _GLIBCXX_SIMD_HAVE_XOP 1
+#else
+#define _GLIBCXX_SIMD_HAVE_XOP 0
+#endif
+#ifdef __AVX__
+#define _GLIBCXX_SIMD_HAVE_AVX 1
+#else
+#define _GLIBCXX_SIMD_HAVE_AVX 0
+#endif
+#ifdef __AVX2__
+#define _GLIBCXX_SIMD_HAVE_AVX2 1
+#else
+#define _GLIBCXX_SIMD_HAVE_AVX2 0
+#endif
+#ifdef __BMI__ // note: the macro is named BMI1, the predefine is __BMI__
+#define _GLIBCXX_SIMD_HAVE_BMI1 1
+#else
+#define _GLIBCXX_SIMD_HAVE_BMI1 0
+#endif
+#ifdef __BMI2__
+#define _GLIBCXX_SIMD_HAVE_BMI2 1
+#else
+#define _GLIBCXX_SIMD_HAVE_BMI2 0
+#endif
+#ifdef __LZCNT__
+#define _GLIBCXX_SIMD_HAVE_LZCNT 1
+#else
+#define _GLIBCXX_SIMD_HAVE_LZCNT 0
+#endif
+#ifdef __SSE4A__
+#define _GLIBCXX_SIMD_HAVE_SSE4A 1
+#else
+#define _GLIBCXX_SIMD_HAVE_SSE4A 0
+#endif
+#ifdef __FMA__
+#define _GLIBCXX_SIMD_HAVE_FMA 1
+#else
+#define _GLIBCXX_SIMD_HAVE_FMA 0
+#endif
+#ifdef __FMA4__
+#define _GLIBCXX_SIMD_HAVE_FMA4 1
+#else
+#define _GLIBCXX_SIMD_HAVE_FMA4 0
+#endif
+#ifdef __F16C__
+#define _GLIBCXX_SIMD_HAVE_F16C 1
+#else
+#define _GLIBCXX_SIMD_HAVE_F16C 0
+#endif
+#ifdef __POPCNT__
+#define _GLIBCXX_SIMD_HAVE_POPCNT 1
+#else
+#define _GLIBCXX_SIMD_HAVE_POPCNT 0
+#endif
+#ifdef __AVX512F__
+#define _GLIBCXX_SIMD_HAVE_AVX512F 1
+#else
+#define _GLIBCXX_SIMD_HAVE_AVX512F 0
+#endif
+#ifdef __AVX512DQ__
+#define _GLIBCXX_SIMD_HAVE_AVX512DQ 1
+#else
+#define _GLIBCXX_SIMD_HAVE_AVX512DQ 0
+#endif
+#ifdef __AVX512VL__
+#define _GLIBCXX_SIMD_HAVE_AVX512VL 1
+#else
+#define _GLIBCXX_SIMD_HAVE_AVX512VL 0
+#endif
+#ifdef __AVX512BW__
+#define _GLIBCXX_SIMD_HAVE_AVX512BW 1
+#else
+#define _GLIBCXX_SIMD_HAVE_AVX512BW 0
+#endif
+
+#if _GLIBCXX_SIMD_HAVE_SSE // *_ABI mirrors the base ISA macro
+#define _GLIBCXX_SIMD_HAVE_SSE_ABI 1
+#else
+#define _GLIBCXX_SIMD_HAVE_SSE_ABI 0
+#endif
+#if _GLIBCXX_SIMD_HAVE_SSE2 // FULL_SSE_ABI additionally needs SSE2
+#define _GLIBCXX_SIMD_HAVE_FULL_SSE_ABI 1
+#else
+#define _GLIBCXX_SIMD_HAVE_FULL_SSE_ABI 0
+#endif
+
+#if _GLIBCXX_SIMD_HAVE_AVX
+#define _GLIBCXX_SIMD_HAVE_AVX_ABI 1
+#else
+#define _GLIBCXX_SIMD_HAVE_AVX_ABI 0
+#endif
+#if _GLIBCXX_SIMD_HAVE_AVX2 // FULL_AVX_ABI additionally needs AVX2
+#define _GLIBCXX_SIMD_HAVE_FULL_AVX_ABI 1
+#else
+#define _GLIBCXX_SIMD_HAVE_FULL_AVX_ABI 0
+#endif
+
+#if _GLIBCXX_SIMD_HAVE_AVX512F
+#define _GLIBCXX_SIMD_HAVE_AVX512_ABI 1
+#else
+#define _GLIBCXX_SIMD_HAVE_AVX512_ABI 0
+#endif
+#if _GLIBCXX_SIMD_HAVE_AVX512BW // FULL_AVX512_ABI additionally needs AVX512BW
+#define _GLIBCXX_SIMD_HAVE_FULL_AVX512_ABI 1
+#else
+#define _GLIBCXX_SIMD_HAVE_FULL_AVX512_ABI 0
+#endif
+
+#if defined __x86_64__ && !_GLIBCXX_SIMD_HAVE_SSE2
+#error "Use of SSE2 is required on AMD64"
+#endif // NOTE(review): cannot fire; HAVE_SSE2 is set to 1 for any __x86_64__
+//}}}
+
+#ifdef __clang__
+#define _GLIBCXX_SIMD_NORMAL_MATH // expands to nothing when __clang__ is set
+#else
+#define _GLIBCXX_SIMD_NORMAL_MATH                                              \
+  [[__gnu__::__optimize__("finite-math-only,no-signed-zeros")]]
+#endif // __clang__
+#define _GLIBCXX_SIMD_NEVER_INLINE [[__gnu__::__noinline__]]
+#define _GLIBCXX_SIMD_INTRINSIC                                                \
+  [[__gnu__::__always_inline__, __gnu__::__artificial__]] inline
+#define _GLIBCXX_SIMD_ALWAYS_INLINE [[__gnu__::__always_inline__]] inline
+#define _GLIBCXX_SIMD_IS_UNLIKELY(__x) __builtin_expect(__x, 0)
+#define _GLIBCXX_SIMD_IS_LIKELY(__x) __builtin_expect(__x, 1)
+
+#if defined __STRICT_ANSI__ && __STRICT_ANSI__ // no constexpr in strict mode
+#define _GLIBCXX_SIMD_CONSTEXPR
+#define _GLIBCXX_SIMD_USE_CONSTEXPR_API const
+#else
+#define _GLIBCXX_SIMD_CONSTEXPR constexpr
+#define _GLIBCXX_SIMD_USE_CONSTEXPR_API constexpr
+#endif // __STRICT_ANSI__
+
+#if defined __clang__ // plain const for clang, constexpr otherwise
+#define _GLIBCXX_SIMD_USE_CONSTEXPR const
+#else
+#define _GLIBCXX_SIMD_USE_CONSTEXPR constexpr
+#endif // __clang__
+
+#define _GLIBCXX_SIMD_LIST_BINARY(__macro) __macro(|) __macro(&) __macro(^)
+#define _GLIBCXX_SIMD_LIST_SHIFTS(__macro) __macro(<<) __macro(>>)
+#define _GLIBCXX_SIMD_LIST_ARITHMETICS(__macro)                                \
+  __macro(+) __macro(-) __macro(*) __macro(/) __macro(%)
+
+#define _GLIBCXX_SIMD_ALL_BINARY(__macro)                                      \
+  _GLIBCXX_SIMD_LIST_BINARY(__macro) static_assert(true) // use site adds ';'
+#define _GLIBCXX_SIMD_ALL_SHIFTS(__macro)                                      \
+  _GLIBCXX_SIMD_LIST_SHIFTS(__macro) static_assert(true) // use site adds ';'
+#define _GLIBCXX_SIMD_ALL_ARITHMETICS(__macro)                                 \
+  _GLIBCXX_SIMD_LIST_ARITHMETICS(__macro) static_assert(true) // ditto
+
+#ifdef _GLIBCXX_SIMD_NO_ALWAYS_INLINE // user override: use plain inline
+#undef _GLIBCXX_SIMD_ALWAYS_INLINE
+#define _GLIBCXX_SIMD_ALWAYS_INLINE inline
+#undef _GLIBCXX_SIMD_INTRINSIC
+#define _GLIBCXX_SIMD_INTRINSIC inline
+#endif // _GLIBCXX_SIMD_NO_ALWAYS_INLINE
+
+#if _GLIBCXX_SIMD_HAVE_SSE || _GLIBCXX_SIMD_HAVE_MMX
+#define _GLIBCXX_SIMD_X86INTRIN 1
+#else
+#define _GLIBCXX_SIMD_X86INTRIN 0
+#endif // SSE or MMX available
+
+// workaround macros {{{
+// Use aliasing loads to help GCC understand the data accesses better.
+// This also seems to hide a miscompilation on swap(x[i], x[i + 1]) with
+// fixed_size_simd<float, 16> x.
+#define _GLIBCXX_SIMD_USE_ALIASING_LOADS 1
+
+// vector conversions on x86 are not optimized (undefined on other targets):
+#if _GLIBCXX_SIMD_X86INTRIN
+#define _GLIBCXX_SIMD_WORKAROUND_PR85048 1
+#endif
+
+// integer division is not optimized (enabled on all targets)
+#define _GLIBCXX_SIMD_WORKAROUND_PR90993 1
+
+// very bad codegen for extraction and concatenation of 128/256 "subregisters"
+// with sizeof(element type) < 8: https://godbolt.org/g/mqUsgM
+#if _GLIBCXX_SIMD_X86INTRIN
+#define _GLIBCXX_SIMD_WORKAROUND_XXX_1 1
+#endif
+
+// bad codegen for 8 Byte memcpy to __vector_type_t<char, 16> (all targets)
+#define _GLIBCXX_SIMD_WORKAROUND_PR90424 1
+
+// bad codegen for zero-extend using simple concat(__x, 0) (x86 only)
+#if _GLIBCXX_SIMD_X86INTRIN
+#define _GLIBCXX_SIMD_WORKAROUND_XXX_3 1
+#endif
+
+// https://github.com/cplusplus/parallelism-ts/issues/65 (incorrect return type
+// of static_simd_cast)
+#define _GLIBCXX_SIMD_FIX_P2TS_ISSUE65 1
+
+// https://github.com/cplusplus/parallelism-ts/issues/66 (incorrect SFINAE
+// constraint on (static)_simd_cast)
+#define _GLIBCXX_SIMD_FIX_P2TS_ISSUE66 1
+// }}}
+
+#endif // __cplusplus >= 201703L
+#endif // _GLIBCXX_EXPERIMENTAL_SIMD_DETAIL_H_
+
+// vim: foldmethod=marker
--- a/libstdc++-v3/include/experimental/bits/simd_fixed_size.h
+++ b/libstdc++-v3/include/experimental/bits/simd_fixed_size.h
--- a/libstdc++-v3/include/experimental/bits/simd_math.h
+++ b/libstdc++-v3/include/experimental/bits/simd_math.h
--- a/libstdc++-v3/include/experimental/bits/simd_neon.h
+++ b/libstdc++-v3/include/experimental/bits/simd_neon.h
@ -0,0 +1,519 @@
+// Simd NEON specific implementations -*- C++ -*-
+
+// Copyright (C) 2020 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
+#define _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
+
+#if __cplusplus >= 201703L
+
+#if !_GLIBCXX_SIMD_HAVE_NEON
+#error "simd_neon.h may only be included when NEON on ARM is available"
+#endif
+
+_GLIBCXX_SIMD_BEGIN_NAMESPACE
+
+// _CommonImplNeon: _CommonImplBuiltin without NEON-specific additions {{{
+struct _CommonImplNeon : _CommonImplBuiltin
+{
+  // _S_store {{{
+  using _CommonImplBuiltin::_S_store; // taken over from the base unchanged
+
+  // }}}
+};
+
+// }}}
+// _SimdImplNeon: NEON overrides layered on _SimdImplBuiltin {{{
+template <typename _Abi>
+  struct _SimdImplNeon : _SimdImplBuiltin<_Abi>
+  {
+    using _Base = _SimdImplBuiltin<_Abi>;
+
+    template <typename _Tp>
+      using _MaskMember = typename _Base::template _MaskMember<_Tp>;
+
+    template <typename _Tp>
+      static constexpr size_t _S_max_store_size = 16; // presumably in bytes
+
+    // _S_masked_load: per element, load + convert where __k is nonzero {{{
+    template <typename _Tp, size_t _Np, typename _Up>
+      static inline _SimdWrapper<_Tp, _Np>
+      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
+		     const _Up* __mem) noexcept
+      {
+	__execute_n_times<_Np>([&](auto __i) {
+	  if (__k[__i] != 0)
+	    __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
+	});
+	return __merge; // elements with a zero mask keep their __merge value
+      }
+
+    // }}}
+    // _S_masked_store_nocvt: per element, store where __k is nonzero {{{
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static void
+      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
+			    _MaskMember<_Tp> __k)
+      {
+	__execute_n_times<_Np>([&](auto __i) {
+	  if (__k[__i] != 0)
+	    __mem[__i] = __v[__i];
+	});
+      }
+
+    // }}}
+    // _S_reduce: halve full 16-byte vectors, then permute-and-combine {{{
+    template <typename _Tp, typename _BinaryOperation>
+      _GLIBCXX_SIMD_INTRINSIC static _Tp
+      _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
+      {
+	constexpr size_t _Np = __x.size();
+	if constexpr (sizeof(__x) == 16 && _Np >= 4
+		      && !_Abi::template _S_is_partial<_Tp>)
+	  { // combine the two _Neon<8> halves, then reduce the 8-byte result
+	    const auto __halves = split<simd<_Tp, simd_abi::_Neon<8>>>(__x);
+	    const auto __y = __binary_op(__halves[0], __halves[1]);
+	    return _SimdImplNeon<simd_abi::_Neon<8>>::_S_reduce(
+	      __y, static_cast<_BinaryOperation&&>(__binary_op));
+	  }
+	else if constexpr (_Np == 8)
+	  { // three permute steps; the result is read from element 0
+	    __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
+				     __vector_permute<1, 0, 3, 2, 5, 4, 7, 6>(
+				       __x._M_data)));
+	    __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
+				     __vector_permute<3, 2, 1, 0, 7, 6, 5, 4>(
+				       __x._M_data)));
+	    __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
+				     __vector_permute<7, 6, 5, 4, 3, 2, 1, 0>(
+				       __x._M_data)));
+	    return __x[0];
+	  }
+	else if constexpr (_Np == 4)
+	  { // two permute steps; the result is read from element 0
+	    __x
+	      = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
+				   __vector_permute<1, 0, 3, 2>(__x._M_data)));
+	    __x
+	      = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
+				   __vector_permute<3, 2, 1, 0>(__x._M_data)));
+	    return __x[0];
+	  }
+	else if constexpr (_Np == 2)
+	  { // one swap of the two elements; the result is read from element 0
+	    __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
+				     __vector_permute<1, 0>(__x._M_data)));
+	    return __x[0];
+	  }
+	else
+	  return _Base::_S_reduce(__x,
+				  static_cast<_BinaryOperation&&>(__binary_op));
+      }
+
+    // }}}
+    // math {{{
+    // _S_sqrt: vsqrt* intrinsics if __have_neon_a64, else _Base {{{
+    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
+      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_sqrt(_Tp __x)
+      {
+	if constexpr (__have_neon_a64)
+	  {
+	    const auto __intrin = __to_intrin(__x);
+	    if constexpr (_TVT::template _S_is<float, 2>)
+	      return vsqrt_f32(__intrin);
+	    else if constexpr (_TVT::template _S_is<float, 4>)
+	      return vsqrtq_f32(__intrin);
+	    else if constexpr (_TVT::template _S_is<double, 1>)
+	      return vsqrt_f64(__intrin);
+	    else if constexpr (_TVT::template _S_is<double, 2>)
+	      return vsqrtq_f64(__intrin);
+	    else
+	      __assert_unreachable<_Tp>();
+	  }
+	else
+	  return _Base::_S_sqrt(__x);
+      }
+
+    // }}}
+    // _S_trunc: vrnd*, or float via an int32 round trip, or _Base {{{
+    template <typename _TW, typename _TVT = _VectorTraits<_TW>>
+      _GLIBCXX_SIMD_INTRINSIC static _TW _S_trunc(_TW __x)
+      {
+	using _Tp = typename _TVT::value_type;
+	if constexpr (__have_neon_a32)
+	  {
+	    const auto __intrin = __to_intrin(__x);
+	    if constexpr (_TVT::template _S_is<float, 2>)
+	      return vrnd_f32(__intrin);
+	    else if constexpr (_TVT::template _S_is<float, 4>)
+	      return vrndq_f32(__intrin);
+	    else if constexpr (_TVT::template _S_is<double, 1>)
+	      return vrnd_f64(__intrin);
+	    else if constexpr (_TVT::template _S_is<double, 2>)
+	      return vrndq_f64(__intrin);
+	    else
+	      __assert_unreachable<_Tp>();
+	  }
+	else if constexpr (is_same_v<_Tp, float>) // float -> int32 -> float
+	  {
+	    auto __intrin = __to_intrin(__x);
+	    if constexpr (sizeof(__x) == 16)
+	      __intrin = vcvtq_f32_s32(vcvtq_s32_f32(__intrin));
+	    else
+	      __intrin = vcvt_f32_s32(vcvt_s32_f32(__intrin));
+	    return _Base::_S_abs(__x)._M_data < 0x1p23f
+		     ? __vector_bitcast<float>(__intrin)
+		     : __x._M_data; // otherwise the input element is kept as is
+	  }
+	else
+	  return _Base::_S_trunc(__x);
+      }
+
+    // }}}
+    // _S_round: vrnda* intrinsics if __have_neon_a32, else _Base {{{
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
+      _S_round(_SimdWrapper<_Tp, _Np> __x)
+      {
+	if constexpr (__have_neon_a32)
+	  { // intrinsic chosen by element size and total vector size
+	    const auto __intrin = __to_intrin(__x);
+	    if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 8)
+	      return vrnda_f32(__intrin);
+	    else if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 16)
+	      return vrndaq_f32(__intrin);
+	    else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 8)
+	      return vrnda_f64(__intrin);
+	    else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 16)
+	      return vrndaq_f64(__intrin);
+	    else
+	      __assert_unreachable<_Tp>();
+	  }
+	else
+	  return _Base::_S_round(__x);
+      }
+
+    // }}}
+    // _S_floor: vrndm* intrinsics if __have_neon_a32, else _Base {{{
+    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
+      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_floor(_Tp __x)
+      {
+	if constexpr (__have_neon_a32)
+	  {
+	    const auto __intrin = __to_intrin(__x);
+	    if constexpr (_TVT::template _S_is<float, 2>)
+	      return vrndm_f32(__intrin);
+	    else if constexpr (_TVT::template _S_is<float, 4>)
+	      return vrndmq_f32(__intrin);
+	    else if constexpr (_TVT::template _S_is<double, 1>)
+	      return vrndm_f64(__intrin);
+	    else if constexpr (_TVT::template _S_is<double, 2>)
+	      return vrndmq_f64(__intrin);
+	    else
+	      __assert_unreachable<_Tp>();
+	  }
+	else
+	  return _Base::_S_floor(__x);
+      }
+
+    // }}}
+    // _S_ceil: vrndp* intrinsics if __have_neon_a32, else _Base {{{
+    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
+      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_ceil(_Tp __x)
+      {
+	if constexpr (__have_neon_a32)
+	  {
+	    const auto __intrin = __to_intrin(__x);
+	    if constexpr (_TVT::template _S_is<float, 2>)
+	      return vrndp_f32(__intrin);
+	    else if constexpr (_TVT::template _S_is<float, 4>)
+	      return vrndpq_f32(__intrin);
+	    else if constexpr (_TVT::template _S_is<double, 1>)
+	      return vrndp_f64(__intrin);
+	    else if constexpr (_TVT::template _S_is<double, 2>)
+	      return vrndpq_f64(__intrin);
+	    else
+	      __assert_unreachable<_Tp>();
+	  }
+	else
+	  return _Base::_S_ceil(__x);
+      }
+
+    //}}} }}}
+  }; // }}}
+// _MaskImplNeonMixin {{{
+// NEON-specific conversion of a vector mask into a bit mask.  Constant
+// evaluation and every vector that is neither 16 nor 8 bytes wide are
+// forwarded to _MaskImplBuiltinMixin.
+struct _MaskImplNeonMixin
+{
+  using _Base = _MaskImplBuiltinMixin;
+
+  // Returns a bit mask built from the mask elements of __x.
+  //
+  // Each element is ANDed with a per-lane bit selector (1 << lane, and 0 for
+  // lanes >= _Np); repeated vpadd/vpaddq calls then fold the vector so that
+  // the collected bits can be read from element 0.
+  // NOTE(review): this presumably relies on every mask element being either
+  // all-zeros or all-ones -- confirm against the callers.
+  template <typename _Tp, size_t _Np>
+    _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
+    _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
+    {
+      // Constant evaluation always takes the generic implementation.
+      if (__builtin_is_constant_evaluated())
+	return _Base::_S_to_bits(__x);
+
+      using _I = __int_for_sizeof_t<_Tp>;
+      if constexpr (sizeof(__x) == 16)
+	{
+	  auto __asint = __vector_bitcast<_I>(__x);
+	  // With __aarch64__ the 128-bit vpaddq_* forms are used, so __zero is a
+	  // full 16-byte vector.  Otherwise the 64-bit vpadd_* forms work on the
+	  // two halves of __asint and __zero has the type of one half.
+#ifdef __aarch64__
+	  [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
+#else
+	  [[maybe_unused]] constexpr auto __zero = decltype(__lo64(__asint))();
+#endif
+	  if constexpr (sizeof(_Tp) == 1)
+	    {
+	      // 16 lanes but only 8 bits per lane: lanes 0-7 and lanes 8-15 both
+	      // use the selectors 1 << 0 ... 1 << 7.
+	      constexpr auto __bitsel
+		= __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>(
+		  [&](auto __i) {
+		    return static_cast<_I>(
+		      __i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0);
+		  });
+	      __asint &= __bitsel;
+	      // After folding, the result is read as a single _UShort, i.e. from
+	      // the first two bytes of the folded vector.
+#ifdef __aarch64__
+	      return __vector_bitcast<_UShort>(
+		vpaddq_s8(vpaddq_s8(vpaddq_s8(__asint, __zero), __zero),
+			  __zero))[0];
+#else
+	      return __vector_bitcast<_UShort>(
+		vpadd_s8(vpadd_s8(vpadd_s8(__lo64(__asint), __hi64(__asint)),
+				  __zero),
+			 __zero))[0];
+#endif
+	    }
+	  else if constexpr (sizeof(_Tp) == 2)
+	    {
+	      // 8 lanes, selector 1 << lane; three folds leave the result in
+	      // element 0.
+	      constexpr auto __bitsel
+		= __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
+		  [&](auto __i) {
+		    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
+		  });
+	      __asint &= __bitsel;
+#ifdef __aarch64__
+	      return vpaddq_s16(vpaddq_s16(vpaddq_s16(__asint, __zero), __zero),
+				__zero)[0];
+#else
+	      return vpadd_s16(
+		vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
+		__zero)[0];
+#endif
+	    }
+	  else if constexpr (sizeof(_Tp) == 4)
+	    {
+	      // 4 lanes, selector 1 << lane; two folds suffice.
+	      constexpr auto __bitsel
+		= __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
+		  [&](auto __i) {
+		    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
+		  });
+	      __asint &= __bitsel;
+#ifdef __aarch64__
+	      return vpaddq_s32(vpaddq_s32(__asint, __zero), __zero)[0];
+#else
+	      return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
+			       __zero)[0];
+#endif
+	    }
+	  else if constexpr (sizeof(_Tp) == 8)
+	    // Two lanes only: bit 0 is taken from lane 0, bit 1 from lane 1.
+	    // _Np is not consulted in this branch.
+	    return (__asint[0] & 1) | (__asint[1] & 2);
+	  else
+	    __assert_unreachable<_Tp>();
+	}
+      else if constexpr (sizeof(__x) == 8)
+	{
+	  // 8-byte vectors: same scheme, always with the 64-bit vpadd_* forms.
+	  auto __asint = __vector_bitcast<_I>(__x);
+	  [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
+	  if constexpr (sizeof(_Tp) == 1)
+	    {
+	      constexpr auto __bitsel
+		= __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
+		  [&](auto __i) {
+		    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
+		  });
+	      __asint &= __bitsel;
+	      return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
+			      __zero)[0];
+	    }
+	  else if constexpr (sizeof(_Tp) == 2)
+	    {
+	      constexpr auto __bitsel
+		= __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
+		  [&](auto __i) {
+		    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
+		  });
+	      __asint &= __bitsel;
+	      return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
+	    }
+	  else if constexpr (sizeof(_Tp) == 4)
+	    {
+	      // Two lanes with the fixed selectors 0x1 and 0x2; _Np is not
+	      // consulted in this branch.
+	      __asint &= __make_vector<_I>(0x1, 0x2);
+	      return vpadd_s32(__asint, __zero)[0];
+	    }
+	  else
+	    __assert_unreachable<_Tp>();
+	}
+      else
+	// Any other vector size: generic implementation.
+	return _Base::_S_to_bits(__x);
+    }
+};
+
+// }}}
+// _MaskImplNeon {{{
+// NEON implementation of the simd_mask reductions.  Everything that is not
+// overridden here comes from _MaskImplBuiltin<_Abi>.
+template <typename _Abi>
+  struct _MaskImplNeon : _MaskImplNeonMixin, _MaskImplBuiltin<_Abi>
+  {
+    using _MaskImplBuiltinMixin::_S_to_maskvector;
+    using _MaskImplNeonMixin::_S_to_bits;
+    using _Base = _MaskImplBuiltin<_Abi>;
+    using _Base::_S_convert;
+
+    // _S_all_of {{{
+    // Returns true iff every element of __k is true.
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static bool _S_all_of(simd_mask<_Tp, _Abi> __k)
+      {
+	// OR in the complement of the implicit mask so that everything outside
+	// of it (presumably the padding of a partial mask) compares as true.
+	const auto __kk
+	  = __vector_bitcast<char>(__k._M_data)
+	    | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
+	if constexpr (sizeof(__k) == 16)
+	  {
+	    // Each all-ones 64-bit half reads as -1, hence the sum of -2.
+	    const auto __x = __vector_bitcast<long long>(__kk);
+	    return __x[0] + __x[1] == -2;
+	  }
+	else if constexpr (sizeof(__k) <= 8)
+	  return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == -1;
+	else
+	  __assert_unreachable<_Tp>();
+      }
+
+    // }}}
+    // _S_any_of {{{
+    // Returns true iff at least one element of __k is true.
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static bool _S_any_of(simd_mask<_Tp, _Abi> __k)
+      {
+	// For this test everything outside of the implicit mask has to read as
+	// false.  ORing in the complement of the implicit mask (as _S_all_of
+	// does) would make every partial mask report true, so the value is
+	// passed through _Abi::_S_masked exactly like in _S_none_of.
+	const auto __kk = _Abi::_S_masked(__k._M_data);
+	if constexpr (sizeof(__k) == 16)
+	  {
+	    const auto __x = __vector_bitcast<long long>(__kk);
+	    return (__x[0] | __x[1]) != 0;
+	  }
+	else if constexpr (sizeof(__k) <= 8)
+	  return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) != 0;
+	else
+	  __assert_unreachable<_Tp>();
+      }
+
+    // }}}
+    // _S_none_of {{{
+    // Returns true iff no element of __k is true.
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static bool _S_none_of(simd_mask<_Tp, _Abi> __k)
+      {
+	const auto __kk = _Abi::_S_masked(__k._M_data);
+	if constexpr (sizeof(__k) == 16)
+	  {
+	    const auto __x = __vector_bitcast<long long>(__kk);
+	    return (__x[0] | __x[1]) == 0;
+	  }
+	else if constexpr (sizeof(__k) <= 8)
+	  return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == 0;
+	else
+	  __assert_unreachable<_Tp>();
+      }
+
+    // }}}
+    // _S_some_of {{{
+    // Returns true iff some, but not all, elements of __k are true.  Masks
+    // wider than 8 bytes use the _Base implementation.
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k)
+      {
+	if constexpr (sizeof(__k) <= 8)
+	  {
+	    // The "all set" and the "none set" test need opposite treatment of
+	    // whatever lies outside of the implicit mask, so the two tests are
+	    // done on two differently prepared values (the same ones that
+	    // _S_all_of and _S_none_of use).  Comparing signed integers against
+	    // -1 and 0 also stays correct when the integer is narrower than
+	    // int and gets promoted.
+	    const auto __ones_padded
+	      = __vector_bitcast<char>(__k._M_data)
+		| ~__vector_bitcast<char>(
+		  _Abi::template _S_implicit_mask<_Tp>());
+	    const auto __zero_padded = _Abi::_S_masked(__k._M_data);
+	    const bool __all
+	      = __bit_cast<__int_for_sizeof_t<decltype(__ones_padded)>>(
+		  __ones_padded)
+		== -1;
+	    const bool __none
+	      = __bit_cast<__int_for_sizeof_t<decltype(__zero_padded)>>(
+		  __zero_padded)
+		== 0;
+	    return !__all && !__none;
+	  }
+	else
+	  return _Base::_S_some_of(__k);
+      }
+
+    // }}}
+    // _S_popcount {{{
+    // Returns the number of true elements of __k.  The elements are summed as
+    // signed integers and the sum is negated, which assumes that a true
+    // element reads as -1 and a false element as 0.
+    // NOTE(review): elements outside of the implicit mask are not cleared
+    // here; presumably they are guaranteed to be zero -- confirm.
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static int _S_popcount(simd_mask<_Tp, _Abi> __k)
+      {
+	if constexpr (sizeof(_Tp) == 1)
+	  {
+	    const auto __s8 = __vector_bitcast<_SChar>(__k._M_data);
+	    // Add the two 8-byte halves first, then fold pairwise.
+	    int8x8_t __tmp = __lo64(__s8) + __hi64z(__s8);
+	    return -vpadd_s8(vpadd_s8(vpadd_s8(__tmp, int8x8_t()), int8x8_t()),
+			     int8x8_t())[0];
+	  }
+	else if constexpr (sizeof(_Tp) == 2)
+	  {
+	    const auto __s16 = __vector_bitcast<short>(__k._M_data);
+	    int16x4_t __tmp = __lo64(__s16) + __hi64z(__s16);
+	    return -vpadd_s16(vpadd_s16(__tmp, int16x4_t()), int16x4_t())[0];
+	  }
+	else if constexpr (sizeof(_Tp) == 4)
+	  {
+	    const auto __s32 = __vector_bitcast<int>(__k._M_data);
+	    int32x2_t __tmp = __lo64(__s32) + __hi64z(__s32);
+	    return -vpadd_s32(__tmp, int32x2_t())[0];
+	  }
+	else if constexpr (sizeof(_Tp) == 8)
+	  {
+	    static_assert(sizeof(__k) == 16);
+	    // long long is 64 bits wide on both AArch32 and AArch64; long is
+	    // only 32 bits on AArch32 and would split each element in two.
+	    const auto __s64 = __vector_bitcast<long long>(__k._M_data);
+	    return static_cast<int>(-(__s64[0] + __s64[1]));
+	  }
+	else
+	  __assert_unreachable<_Tp>();
+      }
+
+    // }}}
+    // _S_find_first_set {{{
+    // Forwards to _Base::_S_find_first_set.
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static int
+      _S_find_first_set(simd_mask<_Tp, _Abi> __k)
+      {
+	// TODO: the _Base implementation is not optimal for NEON
+	return _Base::_S_find_first_set(__k);
+      }
+
+    // }}}
+    // _S_find_last_set {{{
+    // Forwards to _Base::_S_find_last_set.
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static int
+      _S_find_last_set(simd_mask<_Tp, _Abi> __k)
+      {
+	// TODO: the _Base implementation is not optimal for NEON
+	return _Base::_S_find_last_set(__k);
+      }
+
+    // }}}
+  }; // }}}
+
+_GLIBCXX_SIMD_END_NAMESPACE
+#endif // __cplusplus >= 201703L
+#endif // _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
+// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80
--- a/libstdc++-v3/include/experimental/bits/simd_ppc.h
+++ b/libstdc++-v3/include/experimental/bits/simd_ppc.h
@ -0,0 +1,123 @@
+// Simd PowerPC specific implementations -*- C++ -*-
+
+// Copyright (C) 2020 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_PPC_H_
+#define _GLIBCXX_EXPERIMENTAL_SIMD_PPC_H_
+
+#if __cplusplus >= 201703L
+
+#ifndef __ALTIVEC__
+#error "simd_ppc.h may only be included when AltiVec/VMX is available"
+#endif
+
+_GLIBCXX_SIMD_BEGIN_NAMESPACE
+
+// _SimdImplPpc {{{
+// PowerPC (AltiVec/VMX) overrides of the generic simd implementation.  Only
+// the bit shifts are replaced; everything else comes from _SimdImplBuiltin.
+template <typename _Abi>
+  struct _SimdImplPpc : _SimdImplBuiltin<_Abi>
+  {
+    using _Base = _SimdImplBuiltin<_Abi>;
+
+    // Byte and halfword shift instructions on PPC only consider the low 3 or 4
+    // bits of the RHS. Consequently, shifting by sizeof(_Tp)*CHAR_BIT (or more)
+    // is UB without extra measures. To match scalar behavior, byte and halfword
+    // shifts need an extra fixup step.
+
+    // _S_bit_shift_left {{{
+    // Shifts every element of __x left by the matching element of __y.  For
+    // element types smaller than int, elements whose shift count is not below
+    // the element width are set to zero afterwards.
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
+      _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
+      {
+	__x = _Base::_S_bit_shift_left(__x, __y);
+	if constexpr (sizeof(_Tp) < sizeof(int))
+	  __x._M_data
+	    = (__y._M_data < sizeof(_Tp) * __CHAR_BIT__) & __x._M_data;
+	return __x;
+      }
+
+    // Shifts every element of __x left by __y.  For element types smaller
+    // than int, a count of at least the element width yields all zeros.  The
+    // range check is done before the shift (like in _S_bit_shift_right below)
+    // so that _Base is never asked to shift by an out-of-range count.
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
+      _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, int __y)
+      {
+	if constexpr (sizeof(_Tp) < sizeof(int))
+	  {
+	    constexpr int __nbits = sizeof(_Tp) * __CHAR_BIT__;
+	    if (__y >= __nbits)
+	      return {};
+	  }
+	return _Base::_S_bit_shift_left(__x, __y);
+      }
+
+    // }}}
+    // _S_bit_shift_right {{{
+    // Shifts every element of __x right by the matching element of __y.  For
+    // element types smaller than int:
+    //  - unsigned: elements with a count >= the element width become zero;
+    //  - signed: counts >= the element width are replaced by width - 1 before
+    //    shifting.
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
+      _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
+      {
+	if constexpr (sizeof(_Tp) < sizeof(int))
+	  {
+	    constexpr int __nbits = sizeof(_Tp) * __CHAR_BIT__;
+	    if constexpr (is_unsigned_v<_Tp>)
+	      return (__y._M_data < __nbits)
+		     & _Base::_S_bit_shift_right(__x, __y)._M_data;
+	    else
+	      {
+		// Clamp the too-large counts in the local copy of __y.
+		_Base::_S_masked_assign(_SimdWrapper<_Tp, _Np>(__y._M_data
+							       >= __nbits),
+					__y, __nbits - 1);
+		return _Base::_S_bit_shift_right(__x, __y);
+	      }
+	  }
+	else
+	  return _Base::_S_bit_shift_right(__x, __y);
+      }
+
+    // Shifts every element of __x right by __y.  For element types smaller
+    // than int and a count >= the element width, unsigned types yield all
+    // zeros and signed types are shifted by width - 1 instead.
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
+      _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, int __y)
+      {
+	if constexpr (sizeof(_Tp) < sizeof(int))
+	  {
+	    constexpr int __nbits = sizeof(_Tp) * __CHAR_BIT__;
+	    if (__y >= __nbits)
+	      {
+		if constexpr (is_unsigned_v<_Tp>)
+		  return {};
+		else
+		  return _Base::_S_bit_shift_right(__x, __nbits - 1);
+	      }
+	  }
+	return _Base::_S_bit_shift_right(__x, __y);
+      }
+
+    // }}}
+  };
+
+// }}}
+
+_GLIBCXX_SIMD_END_NAMESPACE
+#endif // __cplusplus >= 201703L
+#endif // _GLIBCXX_EXPERIMENTAL_SIMD_PPC_H_
+
+// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80
--- a/libstdc++-v3/include/experimental/bits/simd_scalar.h
+++ b/libstdc++-v3/include/experimental/bits/simd_scalar.h
@ -0,0 +1,772 @@
+// Simd scalar ABI specific implementations -*- C++ -*-
+
+// Copyright (C) 2020 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_SCALAR_H_
+#define _GLIBCXX_EXPERIMENTAL_SIMD_SCALAR_H_
+#if __cplusplus >= 201703L
+
+#include <cmath>
+
+_GLIBCXX_SIMD_BEGIN_NAMESPACE
+
+// __promote_preserving_unsigned{{{
+// Work around the surprising semantics of unsigned integers of lower rank than
+// int: before an operator is applied, such operands are promoted to (signed)
+// int, so over- or underflow is UB even though the operand types were unsigned.
+// Returns __x converted to unsigned int when _Tp is an unsigned type whose
+// promoted type (the type of +__x) is signed; in every other case __x itself
+// is returned.
+template <typename _Tp>
+  _GLIBCXX_SIMD_INTRINSIC constexpr decltype(auto)
+  __promote_preserving_unsigned(const _Tp& __x)
+  {
+    if constexpr (!is_unsigned_v<_Tp> || !is_signed_v<decltype(+__x)>)
+      return __x;
+    else
+      return static_cast<unsigned int>(__x);
+  }
+
+// }}}
+
+struct _CommonImplScalar;
+struct _CommonImplBuiltin;
+struct _SimdImplScalar;
+struct _MaskImplScalar;
+
+// simd_abi::_Scalar {{{
+// ABI tag of the scalar ABI: a simd/simd_mask object holds exactly one element,
+// stored as a plain _Tp / bool (see __traits below).
+struct simd_abi::_Scalar
+{
+  // Number of elements: 1 for every value type.
+  template <typename _Tp>
+    static constexpr size_t _S_size = 1;
+
+  // Number of elements including padding: also 1.
+  template <typename _Tp>
+    static constexpr size_t _S_full_size = 1;
+
+  // Never partial, consistent with _S_size == _S_full_size.
+  template <typename _Tp>
+    static constexpr bool _S_is_partial = false;
+
+  // Validity queries: the tag and the size are always valid; a value type is
+  // valid iff __is_vectorizable<_Tp> holds.
+  struct _IsValidAbiTag : true_type {};
+
+  template <typename _Tp>
+    struct _IsValidSizeFor : true_type {};
+
+  template <typename _Tp>
+    struct _IsValid : __is_vectorizable<_Tp> {};
+
+  template <typename _Tp>
+    static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value;
+
+  // Returns the mask unchanged.
+  _GLIBCXX_SIMD_INTRINSIC static constexpr bool _S_masked(bool __x)
+  { return __x; }
+
+  // Implementation classes used with this ABI tag.
+  using _CommonImpl = _CommonImplScalar;
+  using _SimdImpl = _SimdImplScalar;
+  using _MaskImpl = _MaskImplScalar;
+
+  // Traits of simd<_Tp, scalar>; invalid value types get _InvalidTraits.
+  template <typename _Tp, bool = _S_is_valid_v<_Tp>>
+    struct __traits : _InvalidTraits {};
+
+  template <typename _Tp>
+    struct __traits<_Tp, true>
+    {
+      using _IsValid = true_type;
+      using _SimdImpl = _SimdImplScalar;
+      using _MaskImpl = _MaskImplScalar;
+      // The data members are a single value and a single bool.
+      using _SimdMember = _Tp;
+      using _MaskMember = bool;
+
+      static constexpr size_t _S_simd_align = alignof(_SimdMember);
+      static constexpr size_t _S_mask_align = alignof(_MaskMember);
+
+      // nothing the user can spell converts to/from simd/simd_mask
+      struct _SimdCastType { _SimdCastType() = delete; };
+      struct _MaskCastType { _MaskCastType() = delete; };
+      // Empty base classes.
+      struct _SimdBase {};
+      struct _MaskBase {};
+    };
+};
+
+// }}}
+// _CommonImplScalar {{{
+// Helpers shared by the simd and simd_mask implementations of the scalar ABI.
+struct _CommonImplScalar
+{
+  // _S_store {{{
+  // Copies the object representation of __x to __addr.
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static void _S_store(_Tp __x, void* __addr)
+    {
+      const void* const __src = &__x;
+      __builtin_memcpy(__addr, __src, sizeof(__x));
+    }
+
+  // }}}
+  // _S_store_bool_array(_BitMask) {{{
+  // Hands __x and __mem over to _CommonImplBuiltin::_S_store_bool_array.  The
+  // class is named through __make_dependent_t, so it is only looked up when
+  // this template is instantiated.
+  template <size_t _Np, bool _Sanitized>
+    _GLIBCXX_SIMD_INTRINSIC static constexpr void
+    _S_store_bool_array(_BitMask<_Np, _Sanitized> __x, bool* __mem)
+    {
+      using _Bits = _BitMask<_Np, _Sanitized>;
+      using _Impl = __make_dependent_t<_Bits, _CommonImplBuiltin>;
+      _Impl::_S_store_bool_array(__x, __mem);
+    }
+
+  // }}}
+};
+
+// }}}
+// _SimdImplScalar {{{
+// Operations on simd objects of the scalar ABI.  The data member is a single
+// _Tp value, so every function here works on plain scalars and masks are plain
+// bools.
+struct _SimdImplScalar
+{
+  // member types {{{2
+  using abi_type = simd_abi::scalar;
+
+  // Tag used to pass the element type to functions that cannot deduce it from
+  // their other arguments.
+  template <typename _Tp>
+    using _TypeTag = _Tp*;
+
+  // _S_broadcast {{{2
+  // A one-element "broadcast" is the value itself.
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp _S_broadcast(_Tp __x) noexcept
+    { return __x; }
+
+  // _S_generator {{{2
+  // Calls the generator once, for index 0.
+  template <typename _Fp, typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp _S_generator(_Fp&& __gen,
+							      _TypeTag<_Tp>)
+    { return __gen(_SizeConstant<0>()); }
+
+  // _S_load {{{2
+  // Reads __mem[0] and converts it to _Tp.
+  template <typename _Tp, typename _Up>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_load(const _Up* __mem,
+					       _TypeTag<_Tp>) noexcept
+    { return static_cast<_Tp>(__mem[0]); }
+
+  // _S_masked_load {{{2
+  // Returns the converted __mem[0] if __k is set, __merge otherwise.
+  template <typename _Tp, typename _Up>
+    static inline _Tp _S_masked_load(_Tp __merge, bool __k,
+				     const _Up* __mem) noexcept
+    {
+      if (__k)
+	__merge = static_cast<_Tp>(__mem[0]);
+      return __merge;
+    }
+
+  // _S_store {{{2
+  // Converts __v to _Up and writes it to __mem[0].
+  template <typename _Tp, typename _Up>
+    static inline void _S_store(_Tp __v, _Up* __mem, _TypeTag<_Tp>) noexcept
+    { __mem[0] = static_cast<_Up>(__v); }
+
+  // _S_masked_store {{{2
+  // Like _S_store, but only if __k is set.  The conversion is spelled out
+  // exactly as in _S_store and _S_masked_load.
+  template <typename _Tp, typename _Up>
+    static inline void _S_masked_store(const _Tp __v, _Up* __mem,
+				       const bool __k) noexcept
+    {
+      if (__k)
+	__mem[0] = static_cast<_Up>(__v);
+    }
+
+  // _S_negate {{{2
+  // Logical negation; the result is a mask (bool).
+  template <typename _Tp>
+    static constexpr inline bool _S_negate(_Tp __x) noexcept
+    { return !__x; }
+
+  // _S_reduce {{{2
+  // Reducing a single element needs no operation: the element is the result.
+  template <typename _Tp, typename _BinaryOperation>
+    static constexpr inline _Tp
+    _S_reduce(const simd<_Tp, simd_abi::scalar>& __x, _BinaryOperation&)
+    { return __x._M_data; }
+
+  // _S_min, _S_max {{{2
+  template <typename _Tp>
+    static constexpr inline _Tp _S_min(const _Tp __a, const _Tp __b)
+    { return std::min(__a, __b); }
+
+  template <typename _Tp>
+    static constexpr inline _Tp _S_max(const _Tp __a, const _Tp __b)
+    { return std::max(__a, __b); }
+
+  // _S_complement {{{2
+  // The result of ~ has the promoted type and is converted back to _Tp.
+  template <typename _Tp>
+    static constexpr inline _Tp _S_complement(_Tp __x) noexcept
+    { return static_cast<_Tp>(~__x); }
+
+  // _S_unary_minus {{{2
+  template <typename _Tp>
+    static constexpr inline _Tp _S_unary_minus(_Tp __x) noexcept
+    { return static_cast<_Tp>(-__x); }
+
+  // arithmetic operators {{{2
+  // All binary operators pass their operands through
+  // __promote_preserving_unsigned before applying the operator and convert the
+  // result back to _Tp.
+  template <typename _Tp>
+    static constexpr inline _Tp _S_plus(_Tp __x, _Tp __y)
+    {
+      return static_cast<_Tp>(__promote_preserving_unsigned(__x)
+			      + __promote_preserving_unsigned(__y));
+    }
+
+  template <typename _Tp>
+    static constexpr inline _Tp _S_minus(_Tp __x, _Tp __y)
+    {
+      return static_cast<_Tp>(__promote_preserving_unsigned(__x)
+			      - __promote_preserving_unsigned(__y));
+    }
+
+  template <typename _Tp>
+    static constexpr inline _Tp _S_multiplies(_Tp __x, _Tp __y)
+    {
+      return static_cast<_Tp>(__promote_preserving_unsigned(__x)
+			      * __promote_preserving_unsigned(__y));
+    }
+
+  template <typename _Tp>
+    static constexpr inline _Tp _S_divides(_Tp __x, _Tp __y)
+    {
+      return static_cast<_Tp>(__promote_preserving_unsigned(__x)
+			      / __promote_preserving_unsigned(__y));
+    }
+
+  template <typename _Tp>
+    static constexpr inline _Tp _S_modulus(_Tp __x, _Tp __y)
+    {
+      return static_cast<_Tp>(__promote_preserving_unsigned(__x)
+			      % __promote_preserving_unsigned(__y));
+    }
+
+  // The bitwise operators work on the object representation of
+  // floating-point values, via an integer of the same size.
+  template <typename _Tp>
+    static constexpr inline _Tp _S_bit_and(_Tp __x, _Tp __y)
+    {
+      if constexpr (is_floating_point_v<_Tp>)
+	{
+	  using _Ip = __int_for_sizeof_t<_Tp>;
+	  return __bit_cast<_Tp>(__bit_cast<_Ip>(__x) & __bit_cast<_Ip>(__y));
+	}
+      else
+	return static_cast<_Tp>(__promote_preserving_unsigned(__x)
+				& __promote_preserving_unsigned(__y));
+    }
+
+  template <typename _Tp>
+    static constexpr inline _Tp _S_bit_or(_Tp __x, _Tp __y)
+    {
+      if constexpr (is_floating_point_v<_Tp>)
+	{
+	  using _Ip = __int_for_sizeof_t<_Tp>;
+	  return __bit_cast<_Tp>(__bit_cast<_Ip>(__x) | __bit_cast<_Ip>(__y));
+	}
+      else
+	return static_cast<_Tp>(__promote_preserving_unsigned(__x)
+				| __promote_preserving_unsigned(__y));
+    }
+
+  template <typename _Tp>
+    static constexpr inline _Tp _S_bit_xor(_Tp __x, _Tp __y)
+    {
+      if constexpr (is_floating_point_v<_Tp>)
+	{
+	  using _Ip = __int_for_sizeof_t<_Tp>;
+	  return __bit_cast<_Tp>(__bit_cast<_Ip>(__x) ^ __bit_cast<_Ip>(__y));
+	}
+      else
+	return static_cast<_Tp>(__promote_preserving_unsigned(__x)
+				^ __promote_preserving_unsigned(__y));
+    }
+
+  template <typename _Tp>
+    static constexpr inline _Tp _S_bit_shift_left(_Tp __x, int __y)
+    { return static_cast<_Tp>(__promote_preserving_unsigned(__x) << __y); }
+
+  template <typename _Tp>
+    static constexpr inline _Tp _S_bit_shift_right(_Tp __x, int __y)
+    { return static_cast<_Tp>(__promote_preserving_unsigned(__x) >> __y); }
+
+  // math {{{2
+  // frexp, modf and copysign implemented in simd_math.h
+  //
+  // Every function below forwards to the std:: function of the same name.
+  // Integer-valued results and integer arguments travel in a one-element
+  // _SimdTuple (_ST).
+  template <typename _Tp>
+    using _ST = _SimdTuple<_Tp, simd_abi::scalar>;
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_acos(_Tp __x)
+    { return std::acos(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_asin(_Tp __x)
+    { return std::asin(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_atan(_Tp __x)
+    { return std::atan(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_cos(_Tp __x)
+    { return std::cos(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_sin(_Tp __x)
+    { return std::sin(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_tan(_Tp __x)
+    { return std::tan(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_acosh(_Tp __x)
+    { return std::acosh(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_asinh(_Tp __x)
+    { return std::asinh(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_atanh(_Tp __x)
+    { return std::atanh(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_cosh(_Tp __x)
+    { return std::cosh(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_sinh(_Tp __x)
+    { return std::sinh(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_tanh(_Tp __x)
+    { return std::tanh(__x); }
+
+  // The arguments are passed to std::atan2 in the order given.
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_atan2(_Tp __x, _Tp __y)
+    { return std::atan2(__x, __y); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_exp(_Tp __x)
+    { return std::exp(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_exp2(_Tp __x)
+    { return std::exp2(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_expm1(_Tp __x)
+    { return std::expm1(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_log(_Tp __x)
+    { return std::log(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_log10(_Tp __x)
+    { return std::log10(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_log1p(_Tp __x)
+    { return std::log1p(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_log2(_Tp __x)
+    { return std::log2(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_logb(_Tp __x)
+    { return std::logb(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _ST<int> _S_ilogb(_Tp __x)
+    { return {std::ilogb(__x)}; }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_pow(_Tp __x, _Tp __y)
+    { return std::pow(__x, __y); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_abs(_Tp __x)
+    { return std::abs(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_fabs(_Tp __x)
+    { return std::fabs(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_sqrt(_Tp __x)
+    { return std::sqrt(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_cbrt(_Tp __x)
+    { return std::cbrt(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_erf(_Tp __x)
+    { return std::erf(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_erfc(_Tp __x)
+    { return std::erfc(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_lgamma(_Tp __x)
+    { return std::lgamma(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_tgamma(_Tp __x)
+    { return std::tgamma(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_trunc(_Tp __x)
+    { return std::trunc(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_floor(_Tp __x)
+    { return std::floor(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_ceil(_Tp __x)
+    { return std::ceil(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_nearbyint(_Tp __x)
+    { return std::nearbyint(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_rint(_Tp __x)
+    { return std::rint(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _ST<long> _S_lrint(_Tp __x)
+    { return {std::lrint(__x)}; }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _ST<long long> _S_llrint(_Tp __x)
+    { return {std::llrint(__x)}; }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_round(_Tp __x)
+    { return std::round(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _ST<long> _S_lround(_Tp __x)
+    { return {std::lround(__x)}; }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _ST<long long> _S_llround(_Tp __x)
+    { return {std::llround(__x)}; }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_ldexp(_Tp __x, _ST<int> __y)
+    { return std::ldexp(__x, __y.first); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_scalbn(_Tp __x, _ST<int> __y)
+    { return std::scalbn(__x, __y.first); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_scalbln(_Tp __x, _ST<long> __y)
+    { return std::scalbln(__x, __y.first); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_fmod(_Tp __x, _Tp __y)
+    { return std::fmod(__x, __y); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_remainder(_Tp __x, _Tp __y)
+    { return std::remainder(__x, __y); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_nextafter(_Tp __x, _Tp __y)
+    { return std::nextafter(__x, __y); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_fdim(_Tp __x, _Tp __y)
+    { return std::fdim(__x, __y); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_fmax(_Tp __x, _Tp __y)
+    { return std::fmax(__x, __y); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_fmin(_Tp __x, _Tp __y)
+    { return std::fmin(__x, __y); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_fma(_Tp __x, _Tp __y, _Tp __z)
+    { return std::fma(__x, __y, __z); }
+
+  // The quotient is written to __z->first.
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_remquo(_Tp __x, _Tp __y, _ST<int>* __z)
+    { return std::remquo(__x, __y, &__z->first); }
+
+  // Classification and comparison functions; apart from _S_fpclassify they
+  // return a mask (bool).
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static _ST<int> _S_fpclassify(_Tp __x)
+    { return {std::fpclassify(__x)}; }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static bool _S_isfinite(_Tp __x)
+    { return std::isfinite(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static bool _S_isinf(_Tp __x)
+    { return std::isinf(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static bool _S_isnan(_Tp __x)
+    { return std::isnan(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static bool _S_isnormal(_Tp __x)
+    { return std::isnormal(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static bool _S_signbit(_Tp __x)
+    { return std::signbit(__x); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static bool _S_isgreater(_Tp __x, _Tp __y)
+    { return std::isgreater(__x, __y); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static bool _S_isgreaterequal(_Tp __x,
+								    _Tp __y)
+    { return std::isgreaterequal(__x, __y); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static bool _S_isless(_Tp __x, _Tp __y)
+    { return std::isless(__x, __y); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static bool _S_islessequal(_Tp __x, _Tp __y)
+    { return std::islessequal(__x, __y); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static bool _S_islessgreater(_Tp __x,
+								   _Tp __y)
+    { return std::islessgreater(__x, __y); }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static bool _S_isunordered(_Tp __x,
+								 _Tp __y)
+    { return std::isunordered(__x, __y); }
+
+  // _S_increment & _S_decrement{{{2
+  // Both modify their argument in place.
+  template <typename _Tp>
+    constexpr static inline void _S_increment(_Tp& __x)
+    { ++__x; }
+
+  template <typename _Tp>
+    constexpr static inline void _S_decrement(_Tp& __x)
+    { --__x; }
+
+
+  // compares {{{2
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static bool _S_equal_to(_Tp __x, _Tp __y)
+    { return __x == __y; }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static bool _S_not_equal_to(_Tp __x,
+								  _Tp __y)
+    { return __x != __y; }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static bool _S_less(_Tp __x, _Tp __y)
+    { return __x < __y; }
+
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static bool _S_less_equal(_Tp __x,
+								_Tp __y)
+    { return __x <= __y; }
+
+  // smart_reference access {{{2
+  // Assigns __x to the only element; __i is only checked (to be 0) through
+  // _GLIBCXX_DEBUG_ASSERT.
+  template <typename _Tp, typename _Up>
+    constexpr static void _S_set(_Tp& __v, [[maybe_unused]] int __i,
+				 _Up&& __x) noexcept
+    {
+      _GLIBCXX_DEBUG_ASSERT(__i == 0);
+      __v = static_cast<_Up&&>(__x);
+    }
+
+  // _S_masked_assign {{{2
+  // __lhs = __rhs, if __k is set.
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static void
+    _S_masked_assign(bool __k, _Tp& __lhs, _Tp __rhs)
+    { if (__k) __lhs = __rhs; }
+
+  // _S_masked_cassign {{{2
+  // __lhs = __op(impl, __lhs, __rhs), if __k is set.  __op receives a
+  // _SimdImplScalar object as its first argument.
+  template <typename _Op, typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static void
+    _S_masked_cassign(const bool __k, _Tp& __lhs, const _Tp __rhs, _Op __op)
+    { if (__k) __lhs = __op(_SimdImplScalar{}, __lhs, __rhs); }
+
+  // _S_masked_unary {{{2
+  // Returns _Op<_Tp>{}(__v) if __k is set and __v otherwise.
+  template <template <typename> class _Op, typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static _Tp _S_masked_unary(const bool __k,
+								 const _Tp __v)
+    { return static_cast<_Tp>(__k ? _Op<_Tp>{}(__v) : __v); }
+
+  // }}}2
+};
+
+// }}}
+// _MaskImplScalar {{{
+// Operations on simd_mask objects of the scalar ABI.  The data member is a
+// single bool.
+struct _MaskImplScalar
+{
+  // member types {{{
+  // Tag used to pass the element type to functions that cannot deduce it.
+  template <typename _Tp>
+    using _TypeTag = _Tp*;
+
+  // }}}
+  // _S_broadcast {{{
+  template <typename>
+    _GLIBCXX_SIMD_INTRINSIC static constexpr bool _S_broadcast(bool __x)
+    { return __x; }
+
+  // }}}
+  // _S_load {{{
+  template <typename>
+    _GLIBCXX_SIMD_INTRINSIC static constexpr bool _S_load(const bool* __mem)
+    { return __mem[0]; }
+
+  // }}}
+  // _S_to_bits {{{
+  _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<1>
+  _S_to_bits(bool __x)
+  { return __x; }
+
+  // }}}
+  // _S_convert {{{
+  // Both overloads return element 0 of their argument.
+  template <typename, bool _Sanitized>
+    _GLIBCXX_SIMD_INTRINSIC static constexpr bool
+    _S_convert(_BitMask<1, _Sanitized> __x)
+    { return __x[0]; }
+
+  template <typename, typename _Up, typename _UAbi>
+    _GLIBCXX_SIMD_INTRINSIC static constexpr bool
+    _S_convert(simd_mask<_Up, _UAbi> __x)
+    { return __x[0]; }
+
+  // }}}
+  // _S_from_bitmask {{{2
+  template <typename _Tp>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static bool
+    _S_from_bitmask(_SanitizedBitMask<1> __bits, _TypeTag<_Tp>) noexcept
+    { return __bits[0]; }
+
+  // _S_masked_load {{{2
+  // Returns __mem[0] if __mask is set, __merge otherwise.
+  _GLIBCXX_SIMD_INTRINSIC constexpr static bool
+  _S_masked_load(bool __merge, bool __mask, const bool* __mem) noexcept
+  {
+    if (__mask)
+      __merge = __mem[0];
+    return __merge;
+  }
+
+  // _S_store {{{2
+  // constexpr like the load functions above.
+  _GLIBCXX_SIMD_INTRINSIC static constexpr void
+  _S_store(bool __v, bool* __mem) noexcept
+  { __mem[0] = __v; }
+
+  // _S_masked_store {{{2
+  // Writes __v to __mem[0] if __k is set.
+  _GLIBCXX_SIMD_INTRINSIC static constexpr void
+  _S_masked_store(const bool __v, bool* __mem, const bool __k) noexcept
+  {
+    if (__k)
+      __mem[0] = __v;
+  }
+
+  // logical and bitwise operators {{{2
+  // For a single bool the logical and the bitwise forms are the same.
+  static constexpr bool _S_logical_and(bool __x, bool __y)
+  { return __x && __y; }
+
+  static constexpr bool _S_logical_or(bool __x, bool __y)
+  { return __x || __y; }
+
+  static constexpr bool _S_bit_not(bool __x)
+  { return !__x; }
+
+  static constexpr bool _S_bit_and(bool __x, bool __y)
+  { return __x && __y; }
+
+  static constexpr bool _S_bit_or(bool __x, bool __y)
+  { return __x || __y; }
+
+  static constexpr bool _S_bit_xor(bool __x, bool __y)
+  { return __x != __y; }
+
+  // smart_reference access {{{2
+  // Assigns __x to the only element; __i is only checked (to be 0) through
+  // _GLIBCXX_DEBUG_ASSERT.
+  constexpr static void _S_set(bool& __k, [[maybe_unused]] int __i,
+			       bool __x) noexcept
+  {
+    _GLIBCXX_DEBUG_ASSERT(__i == 0);
+    __k = __x;
+  }
+
+  // _S_masked_assign {{{2
+  // __lhs = __rhs, if __k is set.  constexpr like
+  // _SimdImplScalar::_S_masked_assign.
+  _GLIBCXX_SIMD_INTRINSIC static constexpr void
+  _S_masked_assign(bool __k, bool& __lhs, bool __rhs)
+  {
+    if (__k)
+      __lhs = __rhs;
+  }
+
+  // }}}2
+  // Reductions.  With a single element "all" and "any" are both just that
+  // element, and "some" (at least one, but not all) can never hold.
+  // _S_all_of {{{
+  template <typename _Tp, typename _Abi>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static bool
+    _S_all_of(simd_mask<_Tp, _Abi> __k)
+    { return __k._M_data; }
+
+  // }}}
+  // _S_any_of {{{
+  template <typename _Tp, typename _Abi>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static bool
+    _S_any_of(simd_mask<_Tp, _Abi> __k)
+    { return __k._M_data; }
+
+  // }}}
+  // _S_none_of {{{
+  template <typename _Tp, typename _Abi>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static bool
+    _S_none_of(simd_mask<_Tp, _Abi> __k)
+    { return !__k._M_data; }
+
+  // }}}
+  // _S_some_of {{{
+  template <typename _Tp, typename _Abi>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static bool
+    _S_some_of(simd_mask<_Tp, _Abi>)
+    { return false; }
+
+  // }}}
+  // _S_popcount {{{
+  // The bool converts to 0 or 1.
+  template <typename _Tp, typename _Abi>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static int
+    _S_popcount(simd_mask<_Tp, _Abi> __k)
+    { return __k._M_data; }
+
+  // }}}
+  // _S_find_first_set {{{
+  // Always 0; the argument is not inspected.
+  template <typename _Tp, typename _Abi>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static int
+    _S_find_first_set(simd_mask<_Tp, _Abi>)
+    { return 0; }
+
+  // }}}
+  // _S_find_last_set {{{
+  // Always 0; the argument is not inspected.
+  template <typename _Tp, typename _Abi>
+    _GLIBCXX_SIMD_INTRINSIC constexpr static int
+    _S_find_last_set(simd_mask<_Tp, _Abi>)
+    { return 0; }
+
+  // }}}
+};
+
+// }}}
+
+_GLIBCXX_SIMD_END_NAMESPACE
+#endif // __cplusplus >= 201703L
+#endif // _GLIBCXX_EXPERIMENTAL_SIMD_SCALAR_H_
+
+// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80
--- a/libstdc++-v3/include/experimental/bits/simd_x86.h
+++ b/libstdc++-v3/include/experimental/bits/simd_x86.h
--- a/libstdc++-v3/include/experimental/bits/simd_x86_conversions.h
+++ b/libstdc++-v3/include/experimental/bits/simd_x86_conversions.h
--- a/libstdc++-v3/include/experimental/simd
+++ b/libstdc++-v3/include/experimental/simd
@ -0,0 +1,70 @@
+// Components for element-wise operations on data-parallel objects -*- C++ -*-
+
+// Copyright (C) 2020 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+// <http://www.gnu.org/licenses/>.
+
+/** @file experimental/simd
+ *  This is a TS C++ Library header.
+ */
+
+//
+// N4773 §9 data-parallel types library
+//
+
+#ifndef _GLIBCXX_EXPERIMENTAL_SIMD
+#define _GLIBCXX_EXPERIMENTAL_SIMD
+
+#define __cpp_lib_experimental_parallel_simd 201803
+
+#pragma GCC diagnostic push
+// Functions using [[gnu::vector_size(N)]] types might trigger -Wpsabi
+// warnings, which are irrelevant: those functions never appear on ABI borders.
+#ifndef __clang__
+#pragma GCC diagnostic ignored "-Wpsabi"
+#endif
+
+// If __OPTIMIZE__ is not defined some intrinsics are defined as macros, making
+// use of C casts internally. This requires us to disable the warning as it
+// would otherwise yield many false positives.
+#ifndef __OPTIMIZE__
+#pragma GCC diagnostic ignored "-Wold-style-cast"
+#endif
+
+#include "bits/simd_detail.h"
+#include "bits/simd.h"
+#include "bits/simd_fixed_size.h"
+#include "bits/simd_scalar.h"
+#include "bits/simd_builtin.h"
+#include "bits/simd_converter.h"
+#if _GLIBCXX_SIMD_X86INTRIN
+#include "bits/simd_x86.h"
+#elif _GLIBCXX_SIMD_HAVE_NEON
+#include "bits/simd_neon.h"
+#elif __ALTIVEC__
+#include "bits/simd_ppc.h"
+#endif
+#include "bits/simd_math.h"
+
+#pragma GCC diagnostic pop
+
+#endif // _GLIBCXX_EXPERIMENTAL_SIMD
+// vim: ft=cpp
--- a/libstdc++-v3/testsuite/experimental/simd/standard_abi_usable.cc
+++ b/libstdc++-v3/testsuite/experimental/simd/standard_abi_usable.cc
@ -0,0 +1,64 @@
+// { dg-options "-std=c++17 -fno-fast-math" }
+// { dg-do compile { target c++17 } }
+
+// Copyright (C) 2020 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+#include <experimental/simd>
+
+// Compile-time check that V and V::mask_type can be created and destroyed.
+template <typename V> void is_usable()
+{
+  using M = typename V::mask_type;
+  static_assert(std::is_default_constructible_v<V>);
+  static_assert(std::is_destructible_v<V>);
+  static_assert(std::is_default_constructible_v<M>);
+  static_assert(std::is_destructible_v<M>);
+}
+
+// Runs is_usable on simd<T>, native_simd<T> and fixed_size_simd<T, 3 / max>.
+template <typename T> void test01()
+{
+  namespace stdx = std::experimental;
+  constexpr auto max_size = stdx::simd_abi::max_fixed_size<T>;
+  is_usable<stdx::simd<T>>();
+  is_usable<stdx::native_simd<T>>();
+  is_usable<stdx::fixed_size_simd<T, 3>>();
+  is_usable<stdx::fixed_size_simd<T, max_size>>();
+}
+
+int main()
+{
+  // Character types.
+  test01<char>();
+  test01<wchar_t>();
+  test01<char16_t>();
+  test01<char32_t>();
+
+  // Integer types, each as a signed/unsigned pair.
+  test01<signed char>(); test01<unsigned char>();
+  test01<short>();       test01<unsigned short>();
+  test01<int>();         test01<unsigned int>();
+  test01<long>();        test01<unsigned long>();
+  test01<long long>();   test01<unsigned long long>();
+
+  // Floating-point types.
+  test01<float>();
+  test01<double>();
+  test01<long double>();
+  return 0;
+}
--- a/libstdc++-v3/testsuite/experimental/simd/standard_abi_usable_2.cc
+++ b/libstdc++-v3/testsuite/experimental/simd/standard_abi_usable_2.cc
@ -0,0 +1,4 @@
+// { dg-options "-std=c++17 -ffast-math" }
+// { dg-do compile }
+
+#include "standard_abi_usable.cc"
--- a/libstdc++-v3/testsuite/libstdc++-dg/conformance.exp
+++ b/libstdc++-v3/testsuite/libstdc++-dg/conformance.exp
@ -89,12 +89,14 @@ if {[info exists tests_file] && [file exists $tests_file]} {
 	    # 3. wchar_t tests, if not supported.
 	    # 4. thread tests, if not supported. 
 	    # 5. *_filebuf, if file I/O is not supported.
+	    # 6. simd tests (run in a separate pass below).
 	    if { [string first _xin $t] == -1
 		 && [string first performance $t] == -1
 		 && (${v3-wchar_t} || [string first wchar_t $t] == -1) 
 		 && (${v3-threads} || [string first thread $t] == -1)  
 		 && ([string first "_filebuf" $t] == -1
-		     || [check_v3_target_fileio]) } {
+		     || [check_v3_target_fileio])
+		 && [string first "/experimental/simd/" $t] == -1 } {
 		lappend tests $t
 	    }
 	}
@ -107,5 +109,19 @@ global DEFAULT_CXXFLAGS
 global PCH_CXXFLAGS
 dg-runtest $tests "" "$DEFAULT_CXXFLAGS $PCH_CXXFLAGS"

+# Finally run the simd tests (excluded from $tests above) in a separate pass
+# with extra SIMD-relevant flags.
+global DEFAULT_VECTCFLAGS EFFECTIVE_TARGETS
+set DEFAULT_VECTCFLAGS {}
+set EFFECTIVE_TARGETS {}
+
+# NOTE(review): the check is presumably what fills in both globals -- confirm.
+if {[check_vect_support_and_set_flags]} {
+  lappend DEFAULT_VECTCFLAGS "-O2" "-Wno-psabi"
+  et-dg-runtest dg-runtest \
+    [lsort [glob -nocomplain $srcdir/experimental/simd/*.cc]] \
+    "$DEFAULT_VECTCFLAGS" "$DEFAULT_CXXFLAGS $PCH_CXXFLAGS"
+}
+
 # All done.
 dg-finish