From 265fc6757d874c7a8c0a82a1ef167a4e6c748ebc Mon Sep 17 00:00:00 2001 From: Joe Wilm Date: Sun, 20 May 2018 16:18:39 -0700 Subject: fixup! Specialize Storage::swap for Row --- src/grid/storage.rs | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/grid/storage.rs b/src/grid/storage.rs index a6d0d2a8..3468ab6b 100644 --- a/src/grid/storage.rs +++ b/src/grid/storage.rs @@ -46,27 +46,25 @@ impl Swap for Storage> { /// swap than going through slice::swap. /// /// The default implementation from swap generates 8 movups and 4 movaps - /// instructions. This implementation only uses 8 movups instructions. + /// instructions. This implementation achieves the swap in only 8 movups + /// instructions. fn swap(&mut self, a: usize, b: usize) { - use std::mem::{size_of, uninitialized}; - use ::libc::memcpy; - - debug_assert!(size_of::>() == 32); + debug_assert!(::std::mem::size_of::>() == 32); let a = self.compute_index(a); let b = self.compute_index(b); unsafe { - // Cast to a u64 array of size 4 to pretend that the data is copy + // Cast to a qword array to opt out of copy restrictions and avoid + // drop hazards. Byte array is no good here since for whatever + // reason LLVM won't optimize it. let a_ptr = self.inner.as_mut_ptr().offset(a as isize) as *mut u64; let b_ptr = self.inner.as_mut_ptr().offset(b as isize) as *mut u64; - // Swap space - let mut tmp: u64; - // Copy 1 qword at a time // // The optimizer unrolls this loop and vectorizes it. + let mut tmp: u64; for i in 0..4 { tmp = *a_ptr.offset(i); *a_ptr.offset(i) = *b_ptr.offset(i); -- cgit v1.2.3-54-g00ecf