Optimize Storage::swap

Removes 4 movaps instructions from generated assembly.
author: Joe Wilm <joe@jwilm.com> 2018-05-29 21:37:56 -0700
committer: Joe Wilm <joe@jwilm.com> 2018-05-29 21:37:56 -0700
commit: 4698356f8bc718d8d138bb1908059cd286e3a563 (patch)
tree: 92ac42273d17f2dcd42e2bf03d670b6efd951179
parent: ceb0476ccbee3683f38971451965f91607b3d4fc (diff)
download: alacritty-4698356f8bc718d8d138bb1908059cd286e3a563.tar.gz
alacritty-4698356f8bc718d8d138bb1908059cd286e3a563.zip
1 files changed, 32 insertions, 6 deletions
diff --git a/src/grid/storage.rs b/src/grid/storage.rs
index 57afde82..0f0f611b 100644
--- a/src/grid/storage.rs
+++ b/src/grid/storage.rs
@@ -14,7 +14,7 @@
 use std::ops::{Index, IndexMut};
 use std::slice;
 
-use index::{Column, Line};
+use index::Line;
 use super::Row;
 
 /// Maximum number of invisible lines before buffer is resized
@@ -198,17 +198,40 @@ impl<T> Storage<T> {
         self.inner.swap(a, b);
     }
 
-    /// Swap two lines in raw buffer
+    /// Swap implementation for Row<T>.
     ///
-    /// # Panics
+    /// Exploits the known size of Row<T> to produce a slightly more efficient
+    /// swap than going through slice::swap.
     ///
-    /// `swap` will panic if either `a` or `b` are out-of-bounds of the
-    /// underlying storage.
+    /// The default implementation from slice::swap generates 8 movups and 4
+    /// movaps instructions. This implementation achieves the swap in only 8
+    /// movups instructions.
+    ///
+    // TODO Once specialization is available, Storage<T> can be fully generic
+    //      again instead of enforcing inner: Vec<Row<T>>.
     pub fn swap(&mut self, a: usize, b: usize) {
+        debug_assert!(::std::mem::size_of::<Row<T>>() == 32);
+
         let a = self.compute_index(a);
         let b = self.compute_index(b);
 
-        self.inner.swap(a, b);
+        unsafe {
+            // Cast to a qword array to opt out of copy restrictions and avoid
+            // drop hazards. Byte array is no good here since for whatever
+            // reason LLVM won't optimize it.
+            let a_ptr = self.inner.as_mut_ptr().offset(a as isize) as *mut u64;
+            let b_ptr = self.inner.as_mut_ptr().offset(b as isize) as *mut u64;
+
+            // Copy 1 qword at a time
+            //
+            // The optimizer unrolls this loop and vectorizes it.
+            let mut tmp: u64;
+            for i in 0..4 {
+                tmp = *a_ptr.offset(i);
+                *a_ptr.offset(i) = *b_ptr.offset(i);
+                *b_ptr.offset(i) = tmp;
+            }
+        }
     }
 
     /// Iterator over *logical* entries in the storage
@@ -299,6 +322,9 @@ impl<'a, T: 'a> Iterator for IterMut<'a, T> {
     }
 }
 
+#[cfg(test)]
+use index::Column;
+
 /// Grow the buffer one line at the end of the buffer
 ///
 /// Before:
author	Joe Wilm <joe@jwilm.com>	2018-05-29 21:37:56 -0700
committer	Joe Wilm <joe@jwilm.com>	2018-05-29 21:37:56 -0700
commit	4698356f8bc718d8d138bb1908059cd286e3a563 (patch)
tree	92ac42273d17f2dcd42e2bf03d670b6efd951179
parent	ceb0476ccbee3683f38971451965f91607b3d4fc (diff)
download	alacritty-4698356f8bc718d8d138bb1908059cd286e3a563.tar.gz alacritty-4698356f8bc718d8d138bb1908059cd286e3a563.zip