re PR libstdc++/35588 ([parallel mode] parallel std::sort and bind())

2008-04-09  Johannes Singler  <singler@ira.uka.de>

         * include/parallel/multiway_merge.h:
           Moved decisions to compiletime instead of runtime.
         * include/parallel/losertree.h:
           Removed obsolete variants, added variant that uses pointers
           in the loser tree.
         * include/parallel/types.h:
           Remove obsolete settings options from enum.
         * include/parallel/features.h:
           Remove obsolete compile-time switches.
         * include/parallel/compiletime_settings.h:
           Remove obsolete variant that copies back *after* sorting.
         * include/parallel/tags.h:
           Add one new tag for compile-time switch.
         * include/parallel/merge.h:
           Adapt to changes in multiway_merge.h.
         * include/parallel/multiway_mergesort.h:
           Adapt to changes in multiway_merge.h.
           Factor out splitting variants.
           Remove obsolete variant that copies back *after* sorting.
         * include/parallel/sort.h:
           Adapt to changes in multiway_mergesort.h.
         * testsuite/25_algorithms/sort/35588.cc:
           Added test case from / for PR 35588.

From-SVN: r134148
Johannes Singler 2008-04-09 16:47:00 +00:00 committed by Johannes Singler
parent 757483b6ee
commit e5e07b67a4
11 changed files with 2508 additions and 2359 deletions
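The central change, visible in the sort.h and multiway_mergesort.h diffs below, is that the stable/unstable and sampling/exact-splitting decisions are no longer runtime booleans threaded through the call chain: parallel_sort() reads _Settings once and instantiates parallel_sort_mwms<stable, exact>. The following self-contained sketch is not part of the patch and uses illustrative names only; it shows the shape of that dispatch in plain C++:

#include <algorithm>
#include <functional>
#include <vector>

// Illustrative stand-in for the runtime splitting setting.
enum SplittingStrategy { SAMPLING_SPLIT, EXACT_SPLIT };

// Both flags are template parameters, so each of the four combinations is a
// separate instantiation; the body tests only compile-time constants.  (The
// patch itself consumes the flags through specialized helper structs, see the
// sketch after the multiway_mergesort.h diff.)
template<bool stable, bool exact, typename Iterator, typename Comparator>
void sort_impl(Iterator begin, Iterator end, Comparator comp)
{
  if (stable)
    std::stable_sort(begin, end, comp);
  else
    std::sort(begin, end, comp);
  // 'exact' would select the splitting routine in the real algorithm.
}

// Runtime settings are examined exactly once, at the outermost call site,
// mirroring the dispatch added to parallel_sort() in sort.h.
template<typename Iterator, typename Comparator>
void sort_dispatch(Iterator begin, Iterator end, Comparator comp,
                   bool stable, SplittingStrategy split)
{
  if (stable)
    {
      if (split == EXACT_SPLIT)
        sort_impl<true, true>(begin, end, comp);
      else
        sort_impl<true, false>(begin, end, comp);
    }
  else
    {
      if (split == EXACT_SPLIT)
        sort_impl<false, true>(begin, end, comp);
      else
        sort_impl<false, false>(begin, end, comp);
    }
}

int main()
{
  std::vector<int> v = {3, 1, 4, 1, 5};
  sort_dispatch(v.begin(), v.end(), std::less<int>(), /* stable = */ true,
                EXACT_SPLIT);
}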

ChangeLog

@@ -1,3 +1,29 @@
2008-04-09 Johannes Singler <singler@ira.uka.de>
* include/parallel/multiway_merge.h:
Moved decisions to compiletime instead of runtime.
* include/parallel/losertree.h:
Removed obsolete variants, added variant that uses pointers
in the loser tree.
* include/parallel/types.h:
Remove obsolete settings options from enum.
* include/parallel/features.h:
Remove obsolete compile-time switches.
* include/parallel/compiletime_settings.h:
Remove obsolete variant that copies back *after* sorting.
* include/parallel/tags.h:
Add one new tag for compile-time switch.
* include/parallel/merge.h:
Adapt to changes in multiway_merge.h.
* include/parallel/multiway_mergesort.h:
Adapt to changes in multiway_merge.h.
Factor out splitting variants.
Remove obsolete variant that copies back *after* sorting.
* include/parallel/sort.h:
Adapt to changes in multiway_mergesort.h.
* testsuite/25_algorithms/sort/35588.cc:
Added test case from / for PR 35588.
2008-03-29 Paolo Carlini <pcarlini@suse.de>
PR libstdc++/35725

include/parallel/compiletime_settings.h

@@ -73,17 +73,9 @@
* __gnu_parallel::parallel_random_shuffle(). */
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 0
#endif
#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
/** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
* Consider the size of the TLB for
* __gnu_parallel::parallel_random_shuffle(). */
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 0
#endif
#ifndef _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
/** @brief First copy the data, sort it locally, and merge it back
* (0); or copy it back after everything is done (1).
*
* Recommendation: 0 */
#define _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST 0
#endif

include/parallel/features.h

@@ -61,66 +61,6 @@
#define _GLIBCXX_BAL_QUICKSORT 1
#endif
#ifndef _GLIBCXX_LOSER_TREE
/** @def _GLIBCXX_LOSER_TREE
* @brief Include guarded (sequences may run empty) loser tree,
* moving objects.
* @see __gnu_parallel::_Settings multiway_merge_algorithm */
#define _GLIBCXX_LOSER_TREE 1
#endif
#ifndef _GLIBCXX_LOSER_TREE_EXPLICIT
/** @def _GLIBCXX_LOSER_TREE_EXPLICIT
* @brief Include standard loser tree, storing two flags for infimum
* and supremum.
* @see __gnu_parallel::_Settings multiway_merge_algorithm */
#define _GLIBCXX_LOSER_TREE_EXPLICIT 0
#endif
#ifndef _GLIBCXX_LOSER_TREE_REFERENCE
/** @def _GLIBCXX_LOSER_TREE_REFERENCE
* @brief Include some loser tree variant.
* @see __gnu_parallel::_Settings multiway_merge_algorithm */
#define _GLIBCXX_LOSER_TREE_REFERENCE 0
#endif
#ifndef _GLIBCXX_LOSER_TREE_POINTER
/** @def _GLIBCXX_LOSER_TREE_POINTER
* @brief Include some loser tree variant.
* @see __gnu_parallel::_Settings multiway_merge_algorithm */
#define _GLIBCXX_LOSER_TREE_POINTER 1
#endif
#ifndef _GLIBCXX_LOSER_TREE_UNGUARDED
/** @def _GLIBCXX_LOSER_TREE_UNGUARDED
* @brief Include unguarded (sequences must not run empty) loser
* tree, moving objects.
* @see __gnu_parallel::_Settings multiway_merge_algorithm */
#define _GLIBCXX_LOSER_TREE_UNGUARDED 0
#endif
#ifndef _GLIBCXX_LOSER_TREE_POINTER_UNGUARDED
/** @def _GLIBCXX_LOSER_TREE_POINTER_UNGUARDED
* @brief Include some loser tree variant.
* @see __gnu_parallel::_Settings multiway_merge_algorithm */
#define _GLIBCXX_LOSER_TREE_POINTER_UNGUARDED 1
#endif
#ifndef _GLIBCXX_LOSER_TREE_COMBINED
/** @def _GLIBCXX_LOSER_TREE_COMBINED
* @brief Include some loser tree variant.
* @see __gnu_parallel::_Settings multiway_merge_algorithm */
#define _GLIBCXX_LOSER_TREE_COMBINED 0
#endif
#ifndef _GLIBCXX_LOSER_TREE_SENTINEL
/** @def _GLIBCXX_LOSER_TREE_SENTINEL
* @brief Include some loser tree variant.
* @see __gnu_parallel::_Settings multiway_merge_algorithm */
#define _GLIBCXX_LOSER_TREE_SENTINEL 0
#endif
#ifndef _GLIBCXX_FIND_GROWING_BLOCKS
/** @brief Include the growing blocks variant for std::find.
* @see __gnu_parallel::_Settings::find_algorithm */

include/parallel/losertree.h: file diff suppressed because it is too large.
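The suppressed losertree.h diff is where the new pointer-based variant lives: instead of copying the current front element of every sequence into the tree nodes, the nodes hold pointers, so updating a node is a pointer assignment and comparisons dereference. The sketch below is not the libstdc++ loser tree; it only illustrates the pointer idea, using an ordinary priority queue to drive a k-way merge:

#include <cstddef>
#include <functional>
#include <iostream>
#include <queue>
#include <utility>
#include <vector>

// Each heap entry is (pointer to current element, index of owning sequence).
// Values are compared through the pointers and copied exactly once, into the
// output; advancing a sequence only increments a pointer.
template<typename T, typename Compare>
std::vector<T>
merge_by_pointer(const std::vector<std::vector<T> >& seqs, Compare comp)
{
  typedef std::pair<const T*, std::size_t> entry;
  auto entry_greater = [&comp](const entry& a, const entry& b)
                       { return comp(*b.first, *a.first); };
  std::priority_queue<entry, std::vector<entry>, decltype(entry_greater)>
    heap(entry_greater);

  std::vector<const T*> end_of(seqs.size());
  for (std::size_t s = 0; s < seqs.size(); ++s)
    {
      end_of[s] = seqs[s].data() + seqs[s].size();
      if (!seqs[s].empty())
        heap.push(entry(seqs[s].data(), s));
    }

  std::vector<T> out;
  while (!heap.empty())
    {
      entry e = heap.top();
      heap.pop();
      out.push_back(*e.first);                    // the only copy of the value
      if (e.first + 1 != end_of[e.second])
        heap.push(entry(e.first + 1, e.second));  // advance by pointer
    }
  return out;
}

int main()
{
  std::vector<std::vector<int> > seqs = { {1, 4, 7}, {2, 5, 8}, {3, 6, 9} };
  for (int x : merge_by_pointer(seqs, std::less<int>()))
    std::cout << x << ' ';
  std::cout << '\n';
}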

include/parallel/merge.h

@@ -239,19 +239,26 @@ namespace __gnu_parallel
std::iterator_traits<RandomAccessIterator1>::
difference_type max_length, Comparator comp)
{
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
typedef typename
std::iterator_traits<RandomAccessIterator1>::value_type value_type;
typedef typename std::iterator_traits<RandomAccessIterator1>::
difference_type difference_type1 /* == difference_type2 */;
typedef typename std::iterator_traits<RandomAccessIterator3>::
difference_type difference_type3;
typedef typename std::pair<RandomAccessIterator1, RandomAccessIterator1>
iterator_pair;
std::pair<RandomAccessIterator1, RandomAccessIterator1>
seqs[2] = { std::make_pair(begin1, end1),
std::make_pair(begin2, end2) };
RandomAccessIterator3
target_end = parallel_multiway_merge(seqs, seqs + 2, target,
comp, max_length, true, false);
RandomAccessIterator3
target_end = parallel_multiway_merge
< /* stable = */ true, /* sentinels = */ false>(
seqs, seqs + 2, target, comp,
multiway_merge_exact_splitting
< /* stable = */ true, iterator_pair*,
Comparator, difference_type1>,
max_length);
return target_end;
}
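Note how the rewritten call above passes multiway_merge_exact_splitting into parallel_multiway_merge instead of letting the merge pick a splitter internally; this is the "factor out splitting variants" part of the ChangeLog. A standalone sketch of that shape, not part of the patch and with invented splitter names:

#include <algorithm>
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

// Invented splitters standing in for the exact/sampling splitting routines;
// here they only report which strategy was selected.
void exact_split(std::size_t pieces)
{ std::cout << "exact split into " << pieces << " pieces\n"; }

void sampling_split(std::size_t pieces)
{ std::cout << "sampling split into " << pieces << " pieces\n"; }

// The core routine no longer knows how splitting works; it receives the
// splitting function as a parameter, chosen by its caller.
template<typename Splitter, typename Iterator, typename Comparator>
void merge_core(Iterator begin, Iterator end, Comparator comp,
                Splitter split, std::size_t num_pieces)
{
  split(num_pieces);               // delegate the splitting decision
  std::sort(begin, end, comp);     // stand-in for the actual merging work
}

int main()
{
  std::vector<int> v = {5, 3, 4, 1, 2};
  merge_core(v.begin(), v.end(), std::less<int>(), exact_split, 4);
  merge_core(v.begin(), v.end(), std::less<int>(), sampling_split, 4);
}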

include/parallel/multiway_merge.h: file diff suppressed because it is too large.

include/parallel/multiway_mergesort.h

@@ -80,26 +80,9 @@ template<typename RandomAccessIterator>
/** @brief Start indices, per thread. */
difference_type* starts;
/** @brief Temporary arrays for each thread.
*
* Indirection Allows using the temporary storage in different
* ways, without code duplication.
* @see _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST */
value_type** temporaries;
#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
/** @brief Storage in which to sort. */
RandomAccessIterator* sorting_places;
value_type** temporary;
/** @brief Storage into which to merge. */
value_type** merging_places;
#else
/** @brief Storage in which to sort. */
value_type** sorting_places;
/** @brief Storage into which to merge. */
RandomAccessIterator* merging_places;
#endif
/** @brief Samples. */
value_type* samples;
@@ -108,9 +91,6 @@ template<typename RandomAccessIterator>
/** @brief Pieces of data to merge @c [thread][sequence] */
std::vector<Piece<difference_type> >* pieces;
/** @brief Stable sorting desired. */
bool stable;
};
/**
@@ -122,7 +102,7 @@ template<typename RandomAccessIterator>
template<typename RandomAccessIterator, typename _DifferenceTp>
void
determine_samples(PMWMSSortingData<RandomAccessIterator>* sd,
_DifferenceTp& num_samples)
_DifferenceTp num_samples)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
@@ -130,8 +110,6 @@ template<typename RandomAccessIterator, typename _DifferenceTp>
thread_index_t iam = omp_get_thread_num();
num_samples = _Settings::get().sort_mwms_oversampling * sd->num_threads - 1;
difference_type* es = new difference_type[num_samples + 2];
equally_split(sd->starts[iam + 1] - sd->starts[iam],
@@ -144,11 +122,201 @@ template<typename RandomAccessIterator, typename _DifferenceTp>
delete[] es;
}
/** @brief Split consistently. */
template<bool exact, typename RandomAccessIterator,
typename Comparator, typename SortingPlacesIterator>
struct split_consistently
{
};
/** @brief Split by exact splitting. */
template<typename RandomAccessIterator, typename Comparator,
typename SortingPlacesIterator>
struct split_consistently
<true, RandomAccessIterator, Comparator, SortingPlacesIterator>
{
void operator()(
const thread_index_t iam,
PMWMSSortingData<RandomAccessIterator>* sd,
Comparator& comp,
const typename
std::iterator_traits<RandomAccessIterator>::difference_type
num_samples)
const
{
# pragma omp barrier
std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> >
seqs(sd->num_threads);
for (thread_index_t s = 0; s < sd->num_threads; s++)
seqs[s] = std::make_pair(sd->temporary[s],
sd->temporary[s]
+ (sd->starts[s + 1] - sd->starts[s]));
std::vector<SortingPlacesIterator> offsets(sd->num_threads);
// if not last thread
if (iam < sd->num_threads - 1)
multiseq_partition(seqs.begin(), seqs.end(),
sd->starts[iam + 1], offsets.begin(), comp);
for (int seq = 0; seq < sd->num_threads; seq++)
{
// for each sequence
if (iam < (sd->num_threads - 1))
sd->pieces[iam][seq].end = offsets[seq] - seqs[seq].first;
else
// very end of this sequence
sd->pieces[iam][seq].end =
sd->starts[seq + 1] - sd->starts[seq];
}
# pragma omp barrier
for (thread_index_t seq = 0; seq < sd->num_threads; seq++)
{
// For each sequence.
if (iam > 0)
sd->pieces[iam][seq].begin = sd->pieces[iam - 1][seq].end;
else
// Absolute beginning.
sd->pieces[iam][seq].begin = 0;
}
}
};
/** @brief Split by sampling. */
template<typename RandomAccessIterator, typename Comparator,
typename SortingPlacesIterator>
struct split_consistently<false, RandomAccessIterator, Comparator,
SortingPlacesIterator>
{
void operator()(
const thread_index_t iam,
PMWMSSortingData<RandomAccessIterator>* sd,
Comparator& comp,
const typename
std::iterator_traits<RandomAccessIterator>::difference_type
num_samples)
const
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type;
determine_samples(sd, num_samples);
# pragma omp barrier
# pragma omp single
__gnu_sequential::sort(sd->samples,
sd->samples + (num_samples * sd->num_threads),
comp);
# pragma omp barrier
for (thread_index_t s = 0; s < sd->num_threads; ++s)
{
// For each sequence.
if (num_samples * iam > 0)
sd->pieces[iam][s].begin =
std::lower_bound(sd->temporary[s],
sd->temporary[s]
+ (sd->starts[s + 1] - sd->starts[s]),
sd->samples[num_samples * iam],
comp)
- sd->temporary[s];
else
// Absolute beginning.
sd->pieces[iam][s].begin = 0;
if ((num_samples * (iam + 1)) < (num_samples * sd->num_threads))
sd->pieces[iam][s].end =
std::lower_bound(sd->temporary[s],
sd->temporary[s]
+ (sd->starts[s + 1] - sd->starts[s]),
sd->samples[num_samples * (iam + 1)],
comp)
- sd->temporary[s];
else
// Absolute end.
sd->pieces[iam][s].end = sd->starts[s + 1] - sd->starts[s];
}
}
};
template<bool stable, typename RandomAccessIterator, typename Comparator>
struct possibly_stable_sort
{
};
template<typename RandomAccessIterator, typename Comparator>
struct possibly_stable_sort<true, RandomAccessIterator, Comparator>
{
void operator()(const RandomAccessIterator& begin,
const RandomAccessIterator& end, Comparator& comp) const
{
__gnu_sequential::stable_sort(begin, end, comp);
}
};
template<typename RandomAccessIterator, typename Comparator>
struct possibly_stable_sort<false, RandomAccessIterator, Comparator>
{
void operator()(const RandomAccessIterator begin,
const RandomAccessIterator end, Comparator& comp) const
{
__gnu_sequential::sort(begin, end, comp);
}
};
template<bool stable, typename SeqRandomAccessIterator,
typename RandomAccessIterator, typename Comparator,
typename DiffType>
struct possibly_stable_multiway_merge
{
};
template<typename SeqRandomAccessIterator, typename RandomAccessIterator,
typename Comparator, typename DiffType>
struct possibly_stable_multiway_merge
<true, SeqRandomAccessIterator, RandomAccessIterator, Comparator,
DiffType>
{
void operator()(const SeqRandomAccessIterator& seqs_begin,
const SeqRandomAccessIterator& seqs_end,
const RandomAccessIterator& target,
Comparator& comp,
DiffType length_am) const
{
stable_multiway_merge(seqs_begin, seqs_end, target, comp,
length_am, sequential_tag());
}
};
template<typename SeqRandomAccessIterator, typename RandomAccessIterator,
typename Comparator, typename DiffType>
struct possibly_stable_multiway_merge
<false, SeqRandomAccessIterator, RandomAccessIterator, Comparator,
DiffType>
{
void operator()(const SeqRandomAccessIterator& seqs_begin,
const SeqRandomAccessIterator& seqs_end,
const RandomAccessIterator& target,
Comparator& comp,
DiffType length_am) const
{
multiway_merge(seqs_begin, seqs_end, target, comp,
length_am, sequential_tag());
}
};
/** @brief PMWMS code executed by each thread.
* @param sd Pointer to algorithm data.
* @param comp Comparator.
*/
template<typename RandomAccessIterator, typename Comparator>
template<bool stable, bool exact, typename RandomAccessIterator,
typename Comparator>
void
parallel_sort_mwms_pu(PMWMSSortingData<RandomAccessIterator>* sd,
Comparator& comp)
@@ -162,165 +330,65 @@ template<typename RandomAccessIterator, typename Comparator>
// Length of this thread's chunk, before merging.
difference_type length_local = sd->starts[iam + 1] - sd->starts[iam];
#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
typedef RandomAccessIterator SortingPlacesIterator;
// Sort in temporary storage, leave space for sentinel.
// Sort in input storage.
sd->sorting_places[iam] = sd->source + sd->starts[iam];
#else
typedef value_type* SortingPlacesIterator;
// Sort in temporary storage, leave space for sentinel.
sd->sorting_places[iam] = sd->temporaries[iam] =
sd->temporary[iam] =
static_cast<value_type*>(
::operator new(sizeof(value_type) * (length_local + 1)));
// Copy there.
std::uninitialized_copy(sd->source + sd->starts[iam],
sd->source + sd->starts[iam] + length_local,
sd->sorting_places[iam]);
#endif
sd->temporary[iam]);
// Sort locally.
if (sd->stable)
__gnu_sequential::stable_sort(sd->sorting_places[iam],
sd->sorting_places[iam] + length_local,
comp);
else
__gnu_sequential::sort(sd->sorting_places[iam],
sd->sorting_places[iam] + length_local,
comp);
possibly_stable_sort<stable, SortingPlacesIterator, Comparator>()
(sd->temporary[iam], sd->temporary[iam] + length_local, comp);
// Invariant: locally sorted subsequence in sd->sorting_places[iam],
// sd->sorting_places[iam] + length_local.
const _Settings& __s = _Settings::get();
if (__s.sort_splitting == SAMPLING)
{
difference_type num_samples;
determine_samples(sd, num_samples);
// Invariant: locally sorted subsequence in sd->temporary[iam],
// sd->temporary[iam] + length_local.
# pragma omp barrier
// No barrier here: Synchronization is done by the splitting routine.
# pragma omp single
__gnu_sequential::sort(sd->samples,
sd->samples + (num_samples * sd->num_threads),
comp);
# pragma omp barrier
for (int s = 0; s < sd->num_threads; ++s)
{
// For each sequence.
if (num_samples * iam > 0)
sd->pieces[iam][s].begin =
std::lower_bound(sd->sorting_places[s],
sd->sorting_places[s]
+ (sd->starts[s + 1] - sd->starts[s]),
sd->samples[num_samples * iam],
comp)
- sd->sorting_places[s];
else
// Absolute beginning.
sd->pieces[iam][s].begin = 0;
if ((num_samples * (iam + 1)) < (num_samples * sd->num_threads))
sd->pieces[iam][s].end =
std::lower_bound(sd->sorting_places[s],
sd->sorting_places[s]
+ (sd->starts[s + 1] - sd->starts[s]),
sd->samples[num_samples * (iam + 1)],
comp)
- sd->sorting_places[s];
else
// Absolute end.
sd->pieces[iam][s].end = sd->starts[s + 1] - sd->starts[s];
}
}
else if (__s.sort_splitting == EXACT)
{
# pragma omp barrier
std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> >
seqs(sd->num_threads);
for (int s = 0; s < sd->num_threads; ++s)
seqs[s] = std::make_pair(sd->sorting_places[s],
sd->sorting_places[s]
+ (sd->starts[s + 1] - sd->starts[s]));
std::vector<SortingPlacesIterator> offsets(sd->num_threads);
// if not last thread
if (iam < sd->num_threads - 1)
multiseq_partition(seqs.begin(), seqs.end(),
sd->starts[iam + 1], offsets.begin(), comp);
for (int seq = 0; seq < sd->num_threads; ++seq)
{
// for each sequence
if (iam < (sd->num_threads - 1))
sd->pieces[iam][seq].end = offsets[seq] - seqs[seq].first;
else
// very end of this sequence
sd->pieces[iam][seq].end = (sd->starts[seq + 1]
- sd->starts[seq]);
}
# pragma omp barrier
for (int seq = 0; seq < sd->num_threads; ++seq)
{
// For each sequence.
if (iam > 0)
sd->pieces[iam][seq].begin = sd->pieces[iam - 1][seq].end;
else
// Absolute beginning.
sd->pieces[iam][seq].begin = 0;
}
}
difference_type num_samples =
_Settings::get().sort_mwms_oversampling * sd->num_threads - 1;
split_consistently
<exact, RandomAccessIterator, Comparator, SortingPlacesIterator>()
(iam, sd, comp, num_samples);
// Offset from target begin, length after merging.
difference_type offset = 0, length_am = 0;
for (int s = 0; s < sd->num_threads; ++s)
for (thread_index_t s = 0; s < sd->num_threads; s++)
{
length_am += sd->pieces[iam][s].end - sd->pieces[iam][s].begin;
offset += sd->pieces[iam][s].begin;
}
#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
// Merge to temporary storage, uninitialized creation not possible
// since there is no multiway_merge calling the placement new
// instead of the assignment operator.
// XXX incorrect (de)construction
sd->merging_places[iam] = sd->temporaries[iam] =
static_cast<value_type*>(::operator new(sizeof(value_type)
* length_am));
#else
// Merge directly to target.
sd->merging_places[iam] = sd->source + offset;
#endif
std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> >
seqs(sd->num_threads);
typedef std::vector<
std::pair<SortingPlacesIterator, SortingPlacesIterator> >
seq_vector_type;
seq_vector_type seqs(sd->num_threads);
for (int s = 0; s < sd->num_threads; ++s)
{
seqs[s] =
std::make_pair(sd->sorting_places[s] + sd->pieces[iam][s].begin,
sd->sorting_places[s] + sd->pieces[iam][s].end);
std::make_pair(sd->temporary[s] + sd->pieces[iam][s].begin,
sd->temporary[s] + sd->pieces[iam][s].end);
}
multiway_merge(seqs.begin(), seqs.end(), sd->merging_places[iam], comp,
length_am, sd->stable, false, sequential_tag());
possibly_stable_multiway_merge<
stable,
typename seq_vector_type::iterator,
RandomAccessIterator,
Comparator, difference_type>()
(seqs.begin(), seqs.end(),
sd->source + offset, comp,
length_am);
# pragma omp barrier
#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
// Write back.
std::copy(sd->merging_places[iam],
sd->merging_places[iam] + length_am,
sd->source + offset);
#endif
::operator delete(sd->temporaries[iam]);
::operator delete(sd->temporary[iam]);
}
/** @brief PMWMS main call.
@@ -329,21 +397,22 @@ template<typename RandomAccessIterator, typename Comparator>
* @param comp Comparator.
* @param n Length of sequence.
* @param num_threads Number of threads to use.
* @param stable Stable sorting.
*/
template<typename RandomAccessIterator, typename Comparator>
template<bool stable, bool exact, typename RandomAccessIterator,
typename Comparator>
void
parallel_sort_mwms(RandomAccessIterator begin, RandomAccessIterator end,
Comparator comp, typename
std::iterator_traits<RandomAccessIterator>::
difference_type n, int num_threads, bool stable)
Comparator comp,
thread_index_t num_threads)
{
_GLIBCXX_CALL(n)
_GLIBCXX_CALL(end - begin)
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type;
difference_type n = end - begin;
if (n <= 1)
return;
@@ -354,7 +423,6 @@ template<typename RandomAccessIterator, typename Comparator>
// shared variables
PMWMSSortingData<RandomAccessIterator> sd;
difference_type* starts;
const _Settings& __s = _Settings::get();
# pragma omp parallel num_threads(num_threads)
{
@@ -364,23 +432,16 @@ template<typename RandomAccessIterator, typename Comparator>
{
sd.num_threads = num_threads;
sd.source = begin;
sd.temporaries = new value_type*[num_threads];
#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
sd.sorting_places = new RandomAccessIterator[num_threads];
sd.merging_places = new value_type*[num_threads];
#else
sd.sorting_places = new value_type*[num_threads];
sd.merging_places = new RandomAccessIterator[num_threads];
#endif
sd.temporary = new value_type*[num_threads];
if (__s.sort_splitting == SAMPLING)
if (!exact)
{
unsigned int size =
(__s.sort_mwms_oversampling * num_threads - 1)
difference_type size =
(_Settings::get().sort_mwms_oversampling * num_threads - 1)
* num_threads;
sd.samples = static_cast<value_type*>(
::operator new(size * sizeof(value_type)));
}
else
sd.samples = NULL;
@@ -390,7 +451,6 @@ template<typename RandomAccessIterator, typename Comparator>
for (int s = 0; s < num_threads; ++s)
sd.pieces[s].resize(num_threads);
starts = sd.starts = new difference_type[num_threads + 1];
sd.stable = stable;
difference_type chunk_length = n / num_threads;
difference_type split = n % num_threads;
@@ -401,18 +461,16 @@ template<typename RandomAccessIterator, typename Comparator>
pos += (i < split) ? (chunk_length + 1) : chunk_length;
}
starts[num_threads] = pos;
}
} //single
// Now sort in parallel.
parallel_sort_mwms_pu(&sd, comp);
parallel_sort_mwms_pu<stable, exact>(&sd, comp);
} //parallel
delete[] starts;
delete[] sd.temporaries;
delete[] sd.sorting_places;
delete[] sd.merging_places;
delete[] sd.temporary;
if (__s.sort_splitting == SAMPLING)
if (!exact)
::operator delete(sd.samples);
delete[] sd.offsets;
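The possibly_stable_sort, possibly_stable_multiway_merge and split_consistently helpers introduced above all follow one pattern: an empty primary template plus one specialization per bool value, so the stable/exact choice is made by instantiation rather than by a runtime branch. Distilled into a self-contained example with illustrative names (not the library's):

#include <algorithm>
#include <functional>
#include <iostream>
#include <vector>

// Empty primary template; only the two bool specializations are usable.
template<bool stable, typename Iterator, typename Comparator>
struct maybe_stable_sort
{ };

template<typename Iterator, typename Comparator>
struct maybe_stable_sort<true, Iterator, Comparator>
{
  void operator()(Iterator begin, Iterator end, Comparator comp) const
  { std::stable_sort(begin, end, comp); }
};

template<typename Iterator, typename Comparator>
struct maybe_stable_sort<false, Iterator, Comparator>
{
  void operator()(Iterator begin, Iterator end, Comparator comp) const
  { std::sort(begin, end, comp); }
};

// Instantiating the functor selects the algorithm at compile time; there is
// no 'if (stable)' left anywhere in the compiled code.
template<bool stable, typename Iterator, typename Comparator>
void sort_chunk(Iterator begin, Iterator end, Comparator comp)
{
  maybe_stable_sort<stable, Iterator, Comparator>()(begin, end, comp);
}

int main()
{
  std::vector<int> v = {3, 1, 2};
  sort_chunk<true>(v.begin(), v.end(), std::less<int>());
  for (int x : v)
    std::cout << x << ' ';
  std::cout << '\n';
}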

include/parallel/sort.h

@@ -71,7 +71,7 @@ namespace __gnu_parallel
template<typename RandomAccessIterator, typename Comparator>
inline void
parallel_sort(RandomAccessIterator begin, RandomAccessIterator end,
Comparator comp, bool stable)
{
_GLIBCXX_CALL(end - begin)
typedef std::iterator_traits<RandomAccessIterator> traits_type;
@@ -79,25 +79,43 @@ namespace __gnu_parallel
typedef typename traits_type::difference_type difference_type;
if (begin != end)
{
difference_type n = end - begin;
if (false) ;
#if _GLIBCXX_MERGESORT
else if (stable || _Settings::get().sort_algorithm == MWMS)
parallel_sort_mwms(begin, end, comp, n, get_max_threads(), stable);
else if (stable)
{
if(_Settings::get().sort_splitting == EXACT)
parallel_sort_mwms<true, true>
(begin, end, comp, get_max_threads());
else
parallel_sort_mwms<true, false>
(begin, end, comp, get_max_threads());
}
else if (_Settings::get().sort_algorithm == MWMS)
{
if(_Settings::get().sort_splitting == EXACT)
parallel_sort_mwms<false, true>
(begin, end, comp, get_max_threads());
else
parallel_sort_mwms<false, false>
(begin, end, comp, get_max_threads());
}
#endif
#if _GLIBCXX_QUICKSORT
else if (!stable && _Settings::get().sort_algorithm == QS)
parallel_sort_qs(begin, end, comp, n, get_max_threads());
#endif
#if _GLIBCXX_BAL_QUICKSORT
else if (!stable && _Settings::get().sort_algorithm == QS_BALANCED)
parallel_sort_qsb(begin, end, comp, n, get_max_threads());
#endif
else
__gnu_sequential::sort(begin, end, comp);
}
else if(stable)
__gnu_sequential::stable_sort(begin, end, comp);
else
__gnu_sequential::sort(begin, end, comp);
}
}
} // end namespace __gnu_parallel

include/parallel/tags.h

@@ -44,6 +44,9 @@ namespace __gnu_parallel
/** @brief Forces sequential execution at compile time. */
struct sequential_tag { };
/** @brief Forces exact splitting in multiway merge at compile time. */
struct exact_tag { };
/** @brief Recommends parallel execution at compile time. */
struct parallel_tag { };
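The new exact_tag lets callers request exact splitting at compile time through overload selection, the same mechanism already used by sequential_tag. A minimal illustration of this kind of tag dispatch; the run_merge overloads are invented for the example and are not the multiway_merge signatures:

#include <iostream>

// Tag types in the style of __gnu_parallel::sequential_tag / exact_tag.
struct sequential_tag { };
struct exact_tag { };

// The tag argument selects a variant at compile time; no runtime flag is
// passed or tested.
void run_merge()               { std::cout << "default strategy\n"; }
void run_merge(sequential_tag) { std::cout << "forced sequential\n"; }
void run_merge(exact_tag)      { std::cout << "forced exact splitting\n"; }

int main()
{
  run_merge();
  run_merge(sequential_tag());
  run_merge(exact_tag());
}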

include/parallel/types.h

@@ -87,15 +87,10 @@ namespace __gnu_parallel
/// Merging algorithms:
// bubblesort-alike, loser-tree variants, enum sentinel.
enum _MultiwayMergeAlgorithm
{
BUBBLE,
LOSER_TREE_EXPLICIT,
LOSER_TREE,
LOSER_TREE_COMBINED,
LOSER_TREE_SENTINEL,
ENUM_SENTINEL
{
LOSER_TREE
};
/// Partial sum algorithms: recursive, linear.
enum _PartialSumAlgorithm
{

testsuite/25_algorithms/sort/35588.cc (new file)

@@ -0,0 +1,32 @@
// Copyright (C) 2008 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 2, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License along
// with this library; see the file COPYING. If not, write to the Free
// Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
// USA.
#include <algorithm>
#include <functional>
#include <tr1/functional>
// libstdc++/35588
int main()
{
  using namespace std;
  using namespace tr1;
  using namespace placeholders;

  int t[10];
  sort(t, t+10, bind(less<int>(), _1, _2));
}