From 7d95de5cbcdb1dbe9c1ccc3903e26c611c6cfb32 Mon Sep 17 00:00:00 2001
From: Micah Elizabeth Scott <micah@scanlime.org>
Date: Tue, 23 Jul 2013 11:42:04 -0700
Subject: [PATCH] Swizzle, now faster and with less register spill

---
 firmware/fadecandy.cpp | 475 ++++++++++++++++++++++-------------------
 1 file changed, 250 insertions(+), 225 deletions(-)

diff --git a/firmware/fadecandy.cpp b/firmware/fadecandy.cpp
index ab182f0..3a60956 100644
--- a/firmware/fadecandy.cpp
+++ b/firmware/fadecandy.cpp
@@ -54,7 +54,7 @@ ALWAYS_INLINE static inline uint32_t lutInterpolate(const uint16_t *lut, uint32_
     return (lut[index] * invAlpha + lut[index + 1] * alpha) >> 8;
 }
 
-ALWAYS_INLINE static inline uint32_t updatePixel(uint32_t icPrev, uint32_t icNext, unsigned n)
+static inline uint32_t updatePixel(uint32_t icPrev, uint32_t icNext, unsigned n)
 {
     /*
      * Update pipeline for one pixel:
@@ -135,230 +135,255 @@ static void updateDrawBuffer(unsigned interpCoefficient)
 
     for (int i = 0; i < LEDS_PER_STRIP; ++i) {
 
-        uint32_t plane0 = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 0);
-        uint32_t plane1 = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 1);
-        uint32_t plane2 = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 2);
-        uint32_t plane3 = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 3);
-        uint32_t plane4 = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 4);
-        uint32_t plane5 = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 5);
-        uint32_t plane6 = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 6);
-        uint32_t plane7 = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 7);
-
-        *(out++) = ( (plane0 >> 23) & (1 << 0 ) ) |     // Bit 23
-                   ( (plane1 >> 22) & (1 << 1 ) ) |
-                   ( (plane2 >> 21) & (1 << 2 ) ) |
-                   ( (plane3 >> 20) & (1 << 3 ) ) |
-                   ( (plane4 >> 19) & (1 << 4 ) ) |
-                   ( (plane5 >> 18) & (1 << 5 ) ) |
-                   ( (plane6 >> 17) & (1 << 6 ) ) |
-                   ( (plane7 >> 16) & (1 << 7 ) ) |
-
-                   ( (plane0 >> 14) & (1 << 8 ) ) |     // Bit 22
-                   ( (plane1 >> 13) & (1 << 9 ) ) |
-                   ( (plane2 >> 12) & (1 << 10) ) |
-                   ( (plane3 >> 11) & (1 << 11) ) |
-                   ( (plane4 >> 10) & (1 << 12) ) |
-                   ( (plane5 >> 9 ) & (1 << 13) ) |
-                   ( (plane6 >> 8 ) & (1 << 14) ) |
-                   ( (plane7 >> 7 ) & (1 << 15) ) |
-
-                   ( (plane0 >> 5 ) & (1 << 16) ) |     // Bit 21
-                   ( (plane1 >> 4 ) & (1 << 17) ) |
-                   ( (plane2 >> 3 ) & (1 << 18) ) |
-                   ( (plane3 >> 2 ) & (1 << 19) ) |
-                   ( (plane4 >> 1 ) & (1 << 20) ) |
-                   ( (plane5      ) & (1 << 21) ) |
-                   ( (plane6 << 1 ) & (1 << 22) ) |
-                   ( (plane7 << 2 ) & (1 << 23) ) |
-
-                   ( (plane0 << 4 ) & (1 << 24) ) |     // Bit 20
-                   ( (plane1 << 5 ) & (1 << 25) ) |
-                   ( (plane2 << 6 ) & (1 << 26) ) |
-                   ( (plane3 << 7 ) & (1 << 27) ) |
-                   ( (plane4 << 8 ) & (1 << 28) ) |
-                   ( (plane5 << 9 ) & (1 << 29) ) |
-                   ( (plane6 << 10) & (1 << 30) ) |
-                   ( (plane7 << 11) & (1 << 31) ) ;
-
-        *(out++) = ( (plane0 >> 19) & (1 << 0 ) ) |     // Bit 19
-                   ( (plane1 >> 18) & (1 << 1 ) ) |
-                   ( (plane2 >> 17) & (1 << 2 ) ) |
-                   ( (plane3 >> 16) & (1 << 3 ) ) |
-                   ( (plane4 >> 15) & (1 << 4 ) ) |
-                   ( (plane5 >> 14) & (1 << 5 ) ) |
-                   ( (plane6 >> 13) & (1 << 6 ) ) |
-                   ( (plane7 >> 12) & (1 << 7 ) ) |
-
-                   ( (plane0 >> 10) & (1 << 8 ) ) |     // Bit 18
-                   ( (plane1 >> 9 ) & (1 << 9 ) ) |
-                   ( (plane2 >> 8 ) & (1 << 10) ) |
-                   ( (plane3 >> 7 ) & (1 << 11) ) |
-                   ( (plane4 >> 6 ) & (1 << 12) ) |
-                   ( (plane5 >> 5 ) & (1 << 13) ) |
-                   ( (plane6 >> 4 ) & (1 << 14) ) |
-                   ( (plane7 >> 3 ) & (1 << 15) ) |
-
-                   ( (plane0 >> 1 ) & (1 << 16) ) |     // Bit 17
-                   ( (plane1      ) & (1 << 17) ) |
-                   ( (plane2 << 1 ) & (1 << 18) ) |
-                   ( (plane3 << 2 ) & (1 << 19) ) |
-                   ( (plane4 << 3 ) & (1 << 20) ) |
-                   ( (plane5 << 4 ) & (1 << 21) ) |
-                   ( (plane6 << 5 ) & (1 << 22) ) |
-                   ( (plane7 << 6 ) & (1 << 23) ) |
-
-                   ( (plane0 << 8 ) & (1 << 24) ) |     // Bit 16
-                   ( (plane1 << 9 ) & (1 << 25) ) |
-                   ( (plane2 << 10) & (1 << 26) ) |
-                   ( (plane3 << 11) & (1 << 27) ) |
-                   ( (plane4 << 12) & (1 << 28) ) |
-                   ( (plane5 << 13) & (1 << 29) ) |
-                   ( (plane6 << 14) & (1 << 30) ) |
-                   ( (plane7 << 15) & (1 << 31) ) ;
-
-        *(out++) = ( (plane0 >> 15) & (1 << 0 ) ) |     // Bit 15
-                   ( (plane1 >> 14) & (1 << 1 ) ) |
-                   ( (plane2 >> 13) & (1 << 2 ) ) |
-                   ( (plane3 >> 12) & (1 << 3 ) ) |
-                   ( (plane4 >> 11) & (1 << 4 ) ) |
-                   ( (plane5 >> 10) & (1 << 5 ) ) |
-                   ( (plane6 >> 9 ) & (1 << 6 ) ) |
-                   ( (plane7 >> 8 ) & (1 << 7 ) ) |
-
-                   ( (plane0 >> 6 ) & (1 << 8 ) ) |     // Bit 14
-                   ( (plane1 >> 5 ) & (1 << 9 ) ) |
-                   ( (plane2 >> 4 ) & (1 << 10) ) |
-                   ( (plane3 >> 3 ) & (1 << 11) ) |
-                   ( (plane4 >> 2 ) & (1 << 12) ) |
-                   ( (plane5 >> 1 ) & (1 << 13) ) |
-                   ( (plane6      ) & (1 << 14) ) |
-                   ( (plane7 << 1 ) & (1 << 15) ) |
-
-                   ( (plane0 << 3 ) & (1 << 16) ) |     // Bit 13
-                   ( (plane1 << 4 ) & (1 << 17) ) |
-                   ( (plane2 << 5 ) & (1 << 18) ) |
-                   ( (plane3 << 6 ) & (1 << 19) ) |
-                   ( (plane4 << 7 ) & (1 << 20) ) |
-                   ( (plane5 << 8 ) & (1 << 21) ) |
-                   ( (plane6 << 9 ) & (1 << 22) ) |
-                   ( (plane7 << 10) & (1 << 23) ) |
-
-                   ( (plane0 << 12) & (1 << 24) ) |     // Bit 12
-                   ( (plane1 << 13) & (1 << 25) ) |
-                   ( (plane2 << 14) & (1 << 26) ) |
-                   ( (plane3 << 15) & (1 << 27) ) |
-                   ( (plane4 << 16) & (1 << 28) ) |
-                   ( (plane5 << 17) & (1 << 29) ) |
-                   ( (plane6 << 18) & (1 << 30) ) |
-                   ( (plane7 << 19) & (1 << 31) ) ;
-
-        *(out++) = ( (plane0 >> 11) & (1 << 0 ) ) |     // Bit 11
-                   ( (plane1 >> 10) & (1 << 1 ) ) |
-                   ( (plane2 >> 9 ) & (1 << 2 ) ) |
-                   ( (plane3 >> 8 ) & (1 << 3 ) ) |
-                   ( (plane4 >> 7 ) & (1 << 4 ) ) |
-                   ( (plane5 >> 6 ) & (1 << 5 ) ) |
-                   ( (plane6 >> 5 ) & (1 << 6 ) ) |
-                   ( (plane7 >> 4 ) & (1 << 7 ) ) |
-
-                   ( (plane0 >> 2 ) & (1 << 8 ) ) |     // Bit 10
-                   ( (plane1 >> 1 ) & (1 << 9 ) ) |
-                   ( (plane2      ) & (1 << 10) ) |
-                   ( (plane3 << 1 ) & (1 << 11) ) |
-                   ( (plane4 << 2 ) & (1 << 12) ) |
-                   ( (plane5 << 3 ) & (1 << 13) ) |
-                   ( (plane6 << 4 ) & (1 << 14) ) |
-                   ( (plane7 << 5 ) & (1 << 15) ) |
-
-                   ( (plane0 << 7 ) & (1 << 16) ) |     // Bit 9
-                   ( (plane1 << 8 ) & (1 << 17) ) |
-                   ( (plane2 << 9 ) & (1 << 18) ) |
-                   ( (plane3 << 10) & (1 << 19) ) |
-                   ( (plane4 << 11) & (1 << 20) ) |
-                   ( (plane5 << 12) & (1 << 21) ) |
-                   ( (plane6 << 13) & (1 << 22) ) |
-                   ( (plane7 << 14) & (1 << 23) ) |
-
-                   ( (plane0 << 16) & (1 << 24) ) |     // Bit 8
-                   ( (plane1 << 17) & (1 << 25) ) |
-                   ( (plane2 << 18) & (1 << 26) ) |
-                   ( (plane3 << 19) & (1 << 27) ) |
-                   ( (plane4 << 20) & (1 << 28) ) |
-                   ( (plane5 << 21) & (1 << 29) ) |
-                   ( (plane6 << 22) & (1 << 30) ) |
-                   ( (plane7 << 23) & (1 << 31) ) ;
-
-        *(out++) = ( (plane0 >> 7 ) & (1 << 0 ) ) |     // Bit 7
-                   ( (plane1 >> 6 ) & (1 << 1 ) ) |
-                   ( (plane2 >> 5 ) & (1 << 2 ) ) |
-                   ( (plane3 >> 4 ) & (1 << 3 ) ) |
-                   ( (plane4 >> 3 ) & (1 << 4 ) ) |
-                   ( (plane5 >> 2 ) & (1 << 5 ) ) |
-                   ( (plane6 >> 1 ) & (1 << 6 ) ) |
-                   ( (plane7      ) & (1 << 7 ) ) |
-
-                   ( (plane0 << 2 ) & (1 << 8 ) ) |     // Bit 6
-                   ( (plane1 << 3 ) & (1 << 9 ) ) |
-                   ( (plane2 << 4 ) & (1 << 10) ) |
-                   ( (plane3 << 5 ) & (1 << 11) ) |
-                   ( (plane4 << 6 ) & (1 << 12) ) |
-                   ( (plane5 << 7 ) & (1 << 13) ) |
-                   ( (plane6 << 8 ) & (1 << 14) ) |
-                   ( (plane7 << 9 ) & (1 << 15) ) |
-
-                   ( (plane0 << 11) & (1 << 16) ) |     // Bit 5
-                   ( (plane1 << 12) & (1 << 17) ) |
-                   ( (plane2 << 13) & (1 << 18) ) |
-                   ( (plane3 << 14) & (1 << 19) ) |
-                   ( (plane4 << 15) & (1 << 20) ) |
-                   ( (plane5 << 16) & (1 << 21) ) |
-                   ( (plane6 << 17) & (1 << 22) ) |
-                   ( (plane7 << 18) & (1 << 23) ) |
-
-                   ( (plane0 << 20) & (1 << 24) ) |     // Bit 4
-                   ( (plane1 << 21) & (1 << 25) ) |
-                   ( (plane2 << 22) & (1 << 26) ) |
-                   ( (plane3 << 23) & (1 << 27) ) |
-                   ( (plane4 << 24) & (1 << 28) ) |
-                   ( (plane5 << 25) & (1 << 29) ) |
-                   ( (plane6 << 26) & (1 << 30) ) |
-                   ( (plane7 << 27) & (1 << 31) ) ;
-
-        *(out++) = ( (plane0 >> 3 ) & (1 << 0 ) ) |     // Bit 3
-                   ( (plane1 >> 2 ) & (1 << 1 ) ) |
-                   ( (plane2 >> 1 ) & (1 << 2 ) ) |
-                   ( (plane3      ) & (1 << 3 ) ) |
-                   ( (plane4 << 1 ) & (1 << 4 ) ) |
-                   ( (plane5 << 2 ) & (1 << 5 ) ) |
-                   ( (plane6 << 3 ) & (1 << 6 ) ) |
-                   ( (plane7 << 4 ) & (1 << 7 ) ) |
-
-                   ( (plane0 << 6 ) & (1 << 8 ) ) |     // Bit 2
-                   ( (plane1 << 7 ) & (1 << 9 ) ) |
-                   ( (plane2 << 8 ) & (1 << 10) ) |
-                   ( (plane3 << 9 ) & (1 << 11) ) |
-                   ( (plane4 << 10) & (1 << 12) ) |
-                   ( (plane5 << 11) & (1 << 13) ) |
-                   ( (plane6 << 12) & (1 << 14) ) |
-                   ( (plane7 << 13) & (1 << 15) ) |
-
-                   ( (plane0 << 15) & (1 << 16) ) |     // Bit 1
-                   ( (plane1 << 16) & (1 << 17) ) |
-                   ( (plane2 << 17) & (1 << 18) ) |
-                   ( (plane3 << 18) & (1 << 19) ) |
-                   ( (plane4 << 19) & (1 << 20) ) |
-                   ( (plane5 << 20) & (1 << 21) ) |
-                   ( (plane6 << 21) & (1 << 22) ) |
-                   ( (plane7 << 22) & (1 << 23) ) |
-
-                   ( (plane0 << 24) & (1 << 24) ) |     // Bit 0
-                   ( (plane1 << 25) & (1 << 25) ) |
-                   ( (plane2 << 26) & (1 << 26) ) |
-                   ( (plane3 << 27) & (1 << 27) ) |
-                   ( (plane4 << 28) & (1 << 28) ) |
-                   ( (plane5 << 29) & (1 << 29) ) |
-                   ( (plane6 << 30) & (1 << 30) ) |
-                   ( (plane7 << 31) & (1 << 31) ) ;
+        // Eight bit planes
+        union {
+            uint32_t word;
+            struct {
+                uint32_t x0:1, x1:1, x2:1, x3:1, x4:1, x5:1, x6:1, x7:1,
+                         y0:1, y1:1, y2:1, y3:1, y4:1, y5:1, y6:1, y7:1,
+                         z0:1, z1:1, z2:1, z3:1, z4:1, z5:1, z6:1, z7:1,
+                         spare:8;
+            };
+        } p0, p1, p2, p3, p4, p5, p6, p7;
+
+        // Six output words
+        union {
+            uint32_t word;
+            struct {
+                uint32_t p0a:1, p1a:1, p2a:1, p3a:1, p4a:1, p5a:1, p6a:1, p7a:1,
+                         p0b:1, p1b:1, p2b:1, p3b:1, p4b:1, p5b:1, p6b:1, p7b:1,
+                         p0c:1, p1c:1, p2c:1, p3c:1, p4c:1, p5c:1, p6c:1, p7c:1,
+                         p0d:1, p1d:1, p2d:1, p3d:1, p4d:1, p5d:1, p6d:1, p7d:1;
+            };
+        } o0, o1, o2, o3, o4, o5;
+
+        /*
+         * Remap bits.
+         * This generates fairly efficient code using the UBFX and BFI instructions.
+         */
+
+        p0.word = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 0);
+
+        o5.p0d = p0.x0;
+        o5.p0c = p0.x1;
+        o5.p0b = p0.x2;
+        o5.p0a = p0.x3;
+        o4.p0d = p0.x4;
+        o4.p0c = p0.x5;
+        o4.p0b = p0.x6;
+        o4.p0a = p0.x7;
+        o3.p0d = p0.y0;
+        o3.p0c = p0.y1;
+        o3.p0b = p0.y2;
+        o3.p0a = p0.y3;
+        o2.p0d = p0.y4;
+        o2.p0c = p0.y5;
+        o2.p0b = p0.y6;
+        o2.p0a = p0.y7;
+        o1.p0d = p0.z0;
+        o1.p0c = p0.z1;
+        o1.p0b = p0.z2;
+        o1.p0a = p0.z3;
+        o0.p0d = p0.z4;
+        o0.p0c = p0.z5;
+        o0.p0b = p0.z6;
+        o0.p0a = p0.z7;
+
+        p1.word = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 1);
+
+        o5.p1d = p1.x0;
+        o5.p1c = p1.x1;
+        o5.p1b = p1.x2;
+        o5.p1a = p1.x3;
+        o4.p1d = p1.x4;
+        o4.p1c = p1.x5;
+        o4.p1b = p1.x6;
+        o4.p1a = p1.x7;
+        o3.p1d = p1.y0;
+        o3.p1c = p1.y1;
+        o3.p1b = p1.y2;
+        o3.p1a = p1.y3;
+        o2.p1d = p1.y4;
+        o2.p1c = p1.y5;
+        o2.p1b = p1.y6;
+        o2.p1a = p1.y7;
+        o1.p1d = p1.z0;
+        o1.p1c = p1.z1;
+        o1.p1b = p1.z2;
+        o1.p1a = p1.z3;
+        o0.p1d = p1.z4;
+        o0.p1c = p1.z5;
+        o0.p1b = p1.z6;
+        o0.p1a = p1.z7;
+
+        p2.word = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 2);
+
+        o5.p2d = p2.x0;
+        o5.p2c = p2.x1;
+        o5.p2b = p2.x2;
+        o5.p2a = p2.x3;
+        o4.p2d = p2.x4;
+        o4.p2c = p2.x5;
+        o4.p2b = p2.x6;
+        o4.p2a = p2.x7;
+        o3.p2d = p2.y0;
+        o3.p2c = p2.y1;
+        o3.p2b = p2.y2;
+        o3.p2a = p2.y3;
+        o2.p2d = p2.y4;
+        o2.p2c = p2.y5;
+        o2.p2b = p2.y6;
+        o2.p2a = p2.y7;
+        o1.p2d = p2.z0;
+        o1.p2c = p2.z1;
+        o1.p2b = p2.z2;
+        o1.p2a = p2.z3;
+        o0.p2d = p2.z4;
+        o0.p2c = p2.z5;
+        o0.p2b = p2.z6;
+        o0.p2a = p2.z7;
+
+        p3.word = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 3);
+
+        o5.p3d = p3.x0;
+        o5.p3c = p3.x1;
+        o5.p3b = p3.x2;
+        o5.p3a = p3.x3;
+        o4.p3d = p3.x4;
+        o4.p3c = p3.x5;
+        o4.p3b = p3.x6;
+        o4.p3a = p3.x7;
+        o3.p3d = p3.y0;
+        o3.p3c = p3.y1;
+        o3.p3b = p3.y2;
+        o3.p3a = p3.y3;
+        o2.p3d = p3.y4;
+        o2.p3c = p3.y5;
+        o2.p3b = p3.y6;
+        o2.p3a = p3.y7;
+        o1.p3d = p3.z0;
+        o1.p3c = p3.z1;
+        o1.p3b = p3.z2;
+        o1.p3a = p3.z3;
+        o0.p3d = p3.z4;
+        o0.p3c = p3.z5;
+        o0.p3b = p3.z6;
+        o0.p3a = p3.z7;
+
+        p4.word = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 4);
+
+        o5.p4d = p4.x0;
+        o5.p4c = p4.x1;
+        o5.p4b = p4.x2;
+        o5.p4a = p4.x3;
+        o4.p4d = p4.x4;
+        o4.p4c = p4.x5;
+        o4.p4b = p4.x6;
+        o4.p4a = p4.x7;
+        o3.p4d = p4.y0;
+        o3.p4c = p4.y1;
+        o3.p4b = p4.y2;
+        o3.p4a = p4.y3;
+        o2.p4d = p4.y4;
+        o2.p4c = p4.y5;
+        o2.p4b = p4.y6;
+        o2.p4a = p4.y7;
+        o1.p4d = p4.z0;
+        o1.p4c = p4.z1;
+        o1.p4b = p4.z2;
+        o1.p4a = p4.z3;
+        o0.p4d = p4.z4;
+        o0.p4c = p4.z5;
+        o0.p4b = p4.z6;
+        o0.p4a = p4.z7;
+
+        p5.word = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 5);
+
+        o5.p5d = p5.x0;
+        o5.p5c = p5.x1;
+        o5.p5b = p5.x2;
+        o5.p5a = p5.x3;
+        o4.p5d = p5.x4;
+        o4.p5c = p5.x5;
+        o4.p5b = p5.x6;
+        o4.p5a = p5.x7;
+        o3.p5d = p5.y0;
+        o3.p5c = p5.y1;
+        o3.p5b = p5.y2;
+        o3.p5a = p5.y3;
+        o2.p5d = p5.y4;
+        o2.p5c = p5.y5;
+        o2.p5b = p5.y6;
+        o2.p5a = p5.y7;
+        o1.p5d = p5.z0;
+        o1.p5c = p5.z1;
+        o1.p5b = p5.z2;
+        o1.p5a = p5.z3;
+        o0.p5d = p5.z4;
+        o0.p5c = p5.z5;
+        o0.p5b = p5.z6;
+        o0.p5a = p5.z7;
+
+        p6.word = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 6);
+
+        o5.p6d = p6.x0;
+        o5.p6c = p6.x1;
+        o5.p6b = p6.x2;
+        o5.p6a = p6.x3;
+        o4.p6d = p6.x4;
+        o4.p6c = p6.x5;
+        o4.p6b = p6.x6;
+        o4.p6a = p6.x7;
+        o3.p6d = p6.y0;
+        o3.p6c = p6.y1;
+        o3.p6b = p6.y2;
+        o3.p6a = p6.y3;
+        o2.p6d = p6.y4;
+        o2.p6c = p6.y5;
+        o2.p6b = p6.y6;
+        o2.p6a = p6.y7;
+        o1.p6d = p6.z0;
+        o1.p6c = p6.z1;
+        o1.p6b = p6.z2;
+        o1.p6a = p6.z3;
+        o0.p6d = p6.z4;
+        o0.p6c = p6.z5;
+        o0.p6b = p6.z6;
+        o0.p6a = p6.z7;
+
+        p7.word = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 7);
+
+        o5.p7d = p7.x0;
+        o5.p7c = p7.x1;
+        o5.p7b = p7.x2;
+        o5.p7a = p7.x3;
+        o4.p7d = p7.x4;
+        o4.p7c = p7.x5;
+        o4.p7b = p7.x6;
+        o4.p7a = p7.x7;
+        o3.p7d = p7.y0;
+        o3.p7c = p7.y1;
+        o3.p7b = p7.y2;
+        o3.p7a = p7.y3;
+        o2.p7d = p7.y4;
+        o2.p7c = p7.y5;
+        o2.p7b = p7.y6;
+        o2.p7a = p7.y7;
+        o1.p7d = p7.z0;
+        o1.p7c = p7.z1;
+        o1.p7b = p7.z2;
+        o1.p7a = p7.z3;
+        o0.p7d = p7.z4;
+        o0.p7c = p7.z5;
+        o0.p7b = p7.z6;
+        o0.p7a = p7.z7;
+
+        *(out++) = o0.word;
+        *(out++) = o1.word;
+        *(out++) = o2.word;
+        *(out++) = o3.word;
+        *(out++) = o4.word;
+        *(out++) = o5.word;
     }
 }
 
-- 
GitLab