Improve implementation approach comments in [T]::reverse()
This commit is contained in:
parent
e8fad325fe
commit
1f891d11f5
@ -540,12 +540,24 @@ impl<T> SliceExt for [T] {
|
|||||||
let mut i: usize = 0;
|
let mut i: usize = 0;
|
||||||
let ln = self.len();
|
let ln = self.len();
|
||||||
|
|
||||||
|
// For very small types, all the individual reads in the normal
|
||||||
|
// path perform poorly. We can do better, given efficient unaligned
|
||||||
|
// load/store, by loading a larger chunk and reversing a register.
|
||||||
|
|
||||||
|
// Ideally LLVM would do this for us, as it knows better than we do
|
||||||
|
// whether unaligned reads are efficient (since that changes between
|
||||||
|
// different ARM versions, for example) and what the best chunk size
|
||||||
|
// would be. Unfortunately, as of LLVM 4.0 (2017-05) it only unrolls
|
||||||
|
// the loop, so we need to do this ourselves. (Hypothesis: reverse
|
||||||
|
// is troublesome because the sides can be aligned differently --
|
||||||
|
// and will be, when the length is odd -- so there's no way of emitting
|
||||||
|
// pre- and postludes to use fully-aligned SIMD in the middle.)
|
||||||
|
|
||||||
let fast_unaligned =
|
let fast_unaligned =
|
||||||
cfg!(any(target_arch = "x86", target_arch = "x86_64"));
|
cfg!(any(target_arch = "x86", target_arch = "x86_64"));
|
||||||
|
|
||||||
if fast_unaligned && mem::size_of::<T>() == 1 {
|
if fast_unaligned && mem::size_of::<T>() == 1 {
|
||||||
// Single-byte read & write are comparatively slow. Instead,
|
// Use the llvm.bswap intrinsic to reverse u8s in a usize
|
||||||
// work in usize chunks and get bswap to do the hard work.
|
|
||||||
let chunk = mem::size_of::<usize>();
|
let chunk = mem::size_of::<usize>();
|
||||||
while i + chunk - 1 < ln / 2 {
|
while i + chunk - 1 < ln / 2 {
|
||||||
unsafe {
|
unsafe {
|
||||||
@ -561,8 +573,7 @@ impl<T> SliceExt for [T] {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if fast_unaligned && mem::size_of::<T>() == 2 {
|
if fast_unaligned && mem::size_of::<T>() == 2 {
|
||||||
// Not quite as good as the above, but still helpful.
|
// Use rotate-by-16 to reverse u16s in a u32
|
||||||
// Same general idea, read bigger and do the swap in a register.
|
|
||||||
let chunk = mem::size_of::<u32>() / 2;
|
let chunk = mem::size_of::<u32>() / 2;
|
||||||
while i + chunk - 1 < ln / 2 {
|
while i + chunk - 1 < ln / 2 {
|
||||||
unsafe {
|
unsafe {
|
||||||
|
Loading…
Reference in New Issue
Block a user